In [None]:
# default_exp core

# mahoudata

> API details.

In [None]:
#hide
from nbdev.showdoc import *

from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform
from pandas_profiling import ProfileReport

import pandas as pd
import nltk

In [None]:
#export
class PreProcess:
    "Collection of data preparation steps used before computing recommendations"
    def __init__(self, ctx):
        # ctx is stored untouched; scale_cols reads ctx['numeric_cols'] from it
        self.ctx = ctx

    def clean_duplicates(self):
        "Placeholder for duplicate removal; currently always returns 1"
        #TODO: detect duplicates based on description and attributes, then remove them
        return 1

    def cols_munging(self, dataframe, fillna = True):
        """Prepare the columns of `dataframe` and return the result as a new frame.

        Renames "Temperatura Servicio" to "temperatura", adds a 1-based `beerID`
        column, uses its string form as the index and moves it to the first
        column. When `fillna` is true every missing value is replaced by 0.
        """
        # rename/assign each return a new frame, so the caller's dataframe is not modified
        prepared = (
            dataframe
            .rename(columns={"Temperatura Servicio":"temperatura"})
            .assign(beerID=range(1, len(dataframe) + 1))
        )
        # Index by the string form of beerID; the beerID column itself is kept
        prepared = prepared.set_index(prepared['beerID'].astype(str))

        # Put beerID in front, leaving the other columns in their existing order
        ordered = list(prepared.columns)
        ordered.remove('beerID')
        ordered.insert(0, 'beerID')
        prepared = prepared.reindex(columns=ordered)

        #TODO: allow replacing missing values by median/mean instead of 0
        return prepared.fillna(0) if fillna else prepared

    def scale_cols(self, dataframe):
        """Min-max scale the columns listed in ctx['numeric_cols'].

        Returns a new frame holding only those columns. It is built from the
        scaler's output without an explicit index, so the index of `dataframe`
        is not carried over.
        """
        numeric = dataframe[self.ctx['numeric_cols']]
        scaled_values = MinMaxScaler().fit_transform(numeric)
        return pd.DataFrame(scaled_values, columns=numeric.columns)

In [None]:
# Render the documentation entry of each PreProcess method below this cell
show_doc(PreProcess.clean_duplicates)
show_doc(PreProcess.cols_munging)
show_doc(PreProcess.scale_cols)

<h4 id="PreProcess.clean_duplicates" class="doc_header"><code>PreProcess.clean_duplicates</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.clean_duplicates</code>()

Clean duplicates method

<h4 id="PreProcess.cols_munging" class="doc_header"><code>PreProcess.cols_munging</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.cols_munging</code>(**`dataframe`**, **`fillna`**=*`True`*)

Columns preparation method

<h4 id="PreProcess.scale_cols" class="doc_header"><code>PreProcess.scale_cols</code><a href="__main__.py#L33" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.scale_cols</code>(**`dataframe`**)

Min Max scaler for numeric columns

In [None]:
#export
class RecommenderStrategyFactory:
    "Factory that builds a recommender strategy object from its name"
    def __init__(self, ctx):
        # The context is handed unchanged to whichever strategy gets created
        self.context = ctx

    def createStrategy(self, strategy):
        """Return a strategy instance for the name `strategy` (case-insensitive).

        'numeric' yields a `NumericStrategy`; every other name yields a
        `DescriptionAndNumeric`.
        """
        if strategy.lower() == 'numeric':
            return NumericStrategy(self.context)
        return DescriptionAndNumeric(self.context)

In [None]:
# Render the documentation entry of the factory method below this cell
show_doc(RecommenderStrategyFactory.createStrategy)

<h4 id="RecommenderStrategyFactory.createStrategy" class="doc_header"><code>RecommenderStrategyFactory.createStrategy</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>RecommenderStrategyFactory.createStrategy</code>(**`strategy`**)



In [None]:
#export
class NumericStrategy:
    "Numeric based recommender system"
    def __init__(self, ctx):
        # ctx must provide 'numeric_cols': the column names used for distances
        self.ctx = ctx
    
    def model_builder(self, dataframe):
        "Run the `PreProcess` steps (column munging with NaN filled by 0, then min-max scaling) and return the scaled frame"
        preprocessor = PreProcess(self.ctx)
        df = preprocessor.cols_munging(dataframe, fillna = True)
        df = preprocessor.scale_cols(df)
        return df
    
    def exec_strategy(self, dataframe, distance = 'cosine'):
        """Compute the pairwise distance matrix between the rows of `dataframe`.

        Only the columns listed in ctx['numeric_cols'] are used.

        `distance` selects the metric: 'euclidean' uses euclidean distance,
        any other value falls back to cosine distance.

        Returns a square DataFrame whose rows AND columns are both labelled
        with `dataframe.index`, so `result[label]` and `result.loc[label]`
        accept the same labels whichever metric was chosen. (Previously the
        euclidean branch labelled its columns with the string form of the
        index while the rows kept the raw index.)
        """
        metric = 'euclidean' if distance == 'euclidean' else 'cosine'
        labels = dataframe.index
        # pdist yields the condensed distances; squareform expands them to n x n
        distances = squareform(pdist(dataframe[self.ctx['numeric_cols']], metric = metric))
        return pd.DataFrame(distances, columns = labels, index = labels)

In [None]:
# Render the documentation entry of each NumericStrategy method below this cell
show_doc(NumericStrategy.model_builder)
show_doc(NumericStrategy.exec_strategy)

<h4 id="NumericStrategy.model_builder" class="doc_header"><code>NumericStrategy.model_builder</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>NumericStrategy.model_builder</code>(**`dataframe`**)



<h4 id="NumericStrategy.exec_strategy" class="doc_header"><code>NumericStrategy.exec_strategy</code><a href="__main__.py#L13" class="source_link" style="float:right">[source]</a></h4>

> <code>NumericStrategy.exec_strategy</code>(**`dataframe`**, **`distance`**=*`'cosine'`*)



# Explore Data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory
df = pd.read_csv("./data/dataset-datathon.csv")

In [None]:
# Build a profiling report for the raw data and display it inline in the notebook
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

## Remove duplicates

According to the profile report, about 60% of the rows are duplicates, so we remove them.

In [None]:
# Drop rows that are identical across all columns; `df` itself is left untouched.
# The commented-out `subset` would ignore the 'vajilla' column when comparing rows.
df_clean = df.drop_duplicates(
#subset = df.columns.difference(['vajilla'])
)

In [None]:
# Profile the de-duplicated data; note this rebinds `profile` from the earlier report
profile = ProfileReport(df_clean, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

# Run Recommender

In [None]:
# Configuration shared by the factory, the strategy and PreProcess:
# 'numeric_cols' lists the columns that are scaled and used for distances
context = {'numeric_cols' : ['lupulo_afrutado_citrico', 
                             'lupulo_floral_herbal','amargor', 'color', 
                             'maltoso', 'licoroso', 'afrutado', 'especias','acidez']
}

f = RecommenderStrategyFactory(context)

# 'numeric' selects NumericStrategy
strategy = f.createStrategy('numeric')

# Munge columns, fill NaN with 0 and min-max scale the numeric columns
datamodel = strategy.model_builder(df_clean)

# Pairwise distance matrix between rows (default metric: cosine)
recommender_df = strategy.exec_strategy(datamodel)

recommender_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,476,477,478,479,480,481,482,483,484,485
0,0.000000,0.000000,0.042737,0.014204,0.019602,0.003507,0.046649,0.079535,0.019307,0.019307,...,0.107993,0.039501,0.178008,0.153839,0.372661,0.048717,0.037445,0.063611,0.034118,0.033039
1,0.000000,0.000000,0.042737,0.014204,0.019602,0.003507,0.046649,0.079535,0.019307,0.019307,...,0.107993,0.039501,0.178008,0.153839,0.372661,0.048717,0.037445,0.063611,0.034118,0.033039
2,0.042737,0.042737,0.000000,0.027731,0.111271,0.045083,0.139327,0.149810,0.109016,0.109016,...,0.186975,0.019876,0.262107,0.083787,0.430263,0.024581,0.057169,0.110674,0.015178,0.009456
3,0.014204,0.014204,0.027731,0.000000,0.042773,0.014581,0.072928,0.073416,0.040805,0.040805,...,0.122566,0.015192,0.194159,0.123382,0.377082,0.024367,0.027006,0.058254,0.011111,0.016657
4,0.019602,0.019602,0.111271,0.042773,0.000000,0.016331,0.029392,0.063220,0.008608,0.008608,...,0.081491,0.092742,0.127279,0.209331,0.319723,0.095347,0.063383,0.051801,0.088428,0.089179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,0.048717,0.048717,0.024581,0.024367,0.095347,0.044618,0.097419,0.099593,0.087663,0.087663,...,0.106695,0.022456,0.219157,0.050305,0.295921,0.000000,0.027633,0.048033,0.017491,0.012491
482,0.037445,0.037445,0.057169,0.027006,0.063383,0.038345,0.038369,0.109984,0.049632,0.049632,...,0.095466,0.046965,0.176550,0.113228,0.301670,0.027633,0.000000,0.049326,0.030405,0.046380
483,0.063611,0.063611,0.110674,0.058254,0.051801,0.046130,0.050065,0.060642,0.059130,0.059130,...,0.027529,0.092380,0.117532,0.093103,0.154117,0.048033,0.049326,0.000000,0.089897,0.081997
484,0.034118,0.034118,0.015178,0.011111,0.088428,0.041219,0.107349,0.101764,0.073097,0.073097,...,0.153602,0.004043,0.269176,0.114322,0.419526,0.017491,0.030405,0.089897,0.000000,0.008074


In [None]:
# Take the distance column labelled 1 and sort it ascending, so the rows with the
# smallest distance to that item come first (NaN distances appear at the end of the output)
recommendations_example = pd.DataFrame(recommender_df[1].sort_values(ascending=True))
recommendations_example

Unnamed: 0,1
0,0.000000
1,0.000000
452,0.000000
5,0.003507
305,0.003507
...,...
473,0.647605
142,0.660262
193,
195,
