In [None]:
# default_exp core

# mahoudata

> API details.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform
from pandas_profiling import ProfileReport

import pandas as pd
import numpy as np
import nltk

In [None]:
#export
class PreProcess:
    "Preprocess class to include all data preparation functions"
    def __init__(self, ctx):
        # ctx is a mapping; `scale_cols` reads ctx['numeric_cols'].
        self.ctx = ctx

    def clean_duplicates(self):
        "Clean duplicates method (placeholder: not implemented yet, always returns 1)"
        #TODO:
        #   CHECK FOR DUPLICATES BASED ON DESCRIPTION AND ATTRIBUTES
        # REMOVE THEM
        return 1

    def cols_munging(self, dataframe, fillna = True):
        """Columns preparation method

        Returns a new frame (the input is not modified) in which:

        * 'Temperatura Servicio' is renamed to 'temperatura',
        * a 1-based integer 'beerID' column is added as the first column and
          its string form is used as the index,
        * every string cell is lower-cased,
        * the letter 'c' is removed from the string cells of 'temperatura'.

        `fillna` is accepted for backward compatibility but is not used.
        """
        #Rename column
        df = dataframe.rename(columns={'Temperatura Servicio': 'temperatura'})

        #Create ID for beers and use it (as text) for the index
        df['beerID'] = range(1, len(df) + 1)
        df = df.set_index(df['beerID'].astype(str))

        #Move beerID to first col
        cols = df.columns.tolist()
        cols.insert(0, cols.pop(cols.index('beerID')))
        df = df.reindex(columns=cols)

        #Convert to lowercase
        # `DataFrame.applymap` is deprecated in recent pandas in favour of
        # `DataFrame.map`; use whichever the installed version provides.
        if hasattr(pd.DataFrame, 'map'):
            df = df.map(lambda s: s.lower() if type(s) == str else s)
        else:
            df = df.applymap(lambda s: s.lower() if type(s) == str else s)

        #Removes c from temperatura
        # `Series.replace('c', '')` only replaces cells whose whole value is
        # 'c', so a per-cell substring replacement is needed to strip the
        # letter from inside the strings. Non-string cells (numbers, NaN) are
        # left untouched.
        if 'temperatura' in df.columns:
            df['temperatura'] = df['temperatura'].map(
                lambda v: v.replace('c', '') if isinstance(v, str) else v
            )

        return df

    def fill_na(self, dataframe, method = 'median'):
        """Replaces NaN values with method

        `method` is '0' (fill with zero), 'mean' (column means) or anything
        else (column medians, the default). Means and medians are computed on
        numeric columns only, so NaNs in other columns are left as they are.
        Returns a new frame.
        """
        # Early returns keep the three cases mutually exclusive; previously
        # the '0' result was overwritten by the median branch.
        if method == '0':
            return dataframe.fillna(0)

        if method == 'mean':
            return dataframe.fillna(dataframe.mean(numeric_only=True))

        return dataframe.fillna(dataframe.median(numeric_only=True))

    def scale_cols(self, dataframe):
        """Min Max scaler for numeric columns

        Scales the columns listed in ctx['numeric_cols'] with a freshly
        fitted MinMaxScaler and returns only those columns.

        NOTE: the result is built from the scaler's array output without an
        index, so it carries a new default 0-based index rather than the index
        of `dataframe`.
        """
        numeric = dataframe[self.ctx['numeric_cols']]
        scaler = MinMaxScaler()

        return pd.DataFrame(scaler.fit_transform(numeric), columns=numeric.columns)

In [None]:
# Render the API documentation entry for each PreProcess method (output below).
show_doc(PreProcess.clean_duplicates)
show_doc(PreProcess.cols_munging)
show_doc(PreProcess.fill_na)
show_doc(PreProcess.scale_cols)

<h4 id="PreProcess.clean_duplicates" class="doc_header"><code>PreProcess.clean_duplicates</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.clean_duplicates</code>()

Clean duplicates method

<h4 id="PreProcess.cols_munging" class="doc_header"><code>PreProcess.cols_munging</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.cols_munging</code>(**`dataframe`**, **`fillna`**=*`True`*)

Columns preparation method

<h4 id="PreProcess.fill_na" class="doc_header"><code>PreProcess.fill_na</code><a href="__main__.py#L34" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.fill_na</code>(**`dataframe`**, **`method`**=*`'median'`*)

Replaces NaN values with method

<h4 id="PreProcess.scale_cols" class="doc_header"><code>PreProcess.scale_cols</code><a href="__main__.py#L49" class="source_link" style="float:right">[source]</a></h4>

> <code>PreProcess.scale_cols</code>(**`dataframe`**)

Min Max scaler for numeric columns

In [None]:
#export
class RecommenderStrategyFactory:
    "Strategy factory"
    def __init__(self, ctx):
        self.context = ctx

    def createStrategy(self, strategy):
        """Build the recommender strategy selected by `strategy`.

        The name is compared case-insensitively: 'numeric' yields a
        `NumericStrategy`, any other value yields a `DescriptionAndNumeric`.
        The new instance receives the context given to the factory.
        """
        # Only the selected class name is looked up, so asking for 'numeric'
        # never touches DescriptionAndNumeric.
        strategy_cls = (
            NumericStrategy
            if strategy.lower() == 'numeric'
            else DescriptionAndNumeric
        )
        return strategy_cls(self.context)

In [None]:
# Render the API documentation entry for the factory method (output below).
show_doc(RecommenderStrategyFactory.createStrategy)

<h4 id="RecommenderStrategyFactory.createStrategy" class="doc_header"><code>RecommenderStrategyFactory.createStrategy</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>RecommenderStrategyFactory.createStrategy</code>(**`strategy`**)



In [None]:
#export
class NumericStrategy:
    "Numeric based recommender system"
    def __init__(self, ctx):
        # ctx is a mapping; ctx['numeric_cols'] names the feature columns.
        self.ctx = ctx

    def model_builder(self, dataframe):
        """Prepare `dataframe` for distance computation.

        Runs the PreProcess steps in order: column munging, NaN filling with
        the 'median' method, then scaling. Returns the scaled frame.
        """
        preprocessor = PreProcess(self.ctx)
        df = preprocessor.cols_munging(dataframe)
        df = preprocessor.fill_na(df, 'median')

        return preprocessor.scale_cols(df)

    def exec_strategy(self, dataframe, distance = 'cosine'):
        """Compute the pairwise distance matrix between the rows of `dataframe`.

        Only the columns in ctx['numeric_cols'] are compared. `distance` is
        'euclidean' for Euclidean distance; any other value selects cosine
        distance. The result is a square DataFrame whose rows AND columns are
        both labelled with `dataframe.index`, so the same label addresses a
        row and a column for either metric.
        """
        metric = 'euclidean' if distance == 'euclidean' else 'cosine'
        features = dataframe[self.ctx['numeric_cols']]

        # pdist returns the condensed distances; squareform expands them into
        # the full symmetric matrix.
        return pd.DataFrame(
            squareform(pdist(features, metric=metric)),
            columns=dataframe.index,
            index=dataframe.index,
        )

In [None]:
# Render the API documentation entries for the NumericStrategy methods (output below).
show_doc(NumericStrategy.model_builder)
show_doc(NumericStrategy.exec_strategy)

<h4 id="NumericStrategy.model_builder" class="doc_header"><code>NumericStrategy.model_builder</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>NumericStrategy.model_builder</code>(**`dataframe`**)



<h4 id="NumericStrategy.exec_strategy" class="doc_header"><code>NumericStrategy.exec_strategy</code><a href="__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>NumericStrategy.exec_strategy</code>(**`dataframe`**, **`distance`**=*`'cosine'`*)



In [None]:
#export
class RecommenderHelper:
    "Class for obtaining computed recommendations"
    def __init__(self):
        pass

    # Declared static because the function takes no `self`; this keeps the
    # existing `RecommenderHelper.get_top_recommendations(...)` call style
    # working and also makes the call valid on an instance.
    @staticmethod
    def get_top_recommendations(rec_dataframe, beerID , topk = 5, sort_asc = True):
        """Return the `topk` entries ranked by their distance to `beerID`.

        `rec_dataframe` is a square distance matrix whose rows and columns
        share the same labels. The `beerID` column is sorted (ascending when
        `sort_asc` is true), the row for `beerID` itself is dropped, and the
        first `topk` rows are returned as a frame with columns 'beerID' and
        'cosine_dist'. The second column keeps that name whatever metric
        produced the matrix.
        """
        ranked = rec_dataframe[beerID].sort_values(ascending=sort_asc)

        # Drop the item itself, then move the labels into a regular column.
        recommendations = pd.DataFrame(ranked).drop([beerID], axis=0).reset_index(level=0)
        recommendations.columns = ['beerID', 'cosine_dist']

        return recommendations.iloc[:topk]

In [None]:
# Render the API documentation entry for the helper function (output below).
show_doc(RecommenderHelper.get_top_recommendations)

<h4 id="RecommenderHelper.get_top_recommendations" class="doc_header"><code>RecommenderHelper.get_top_recommendations</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>RecommenderHelper.get_top_recommendations</code>(**`rec_dataframe`**, **`beerID`**, **`topk`**=*`5`*, **`sort_asc`**=*`True`*)



# Explore Data

In [None]:
# Load the raw dataset from the repository-relative data folder.
df = pd.read_csv("./data/dataset-datathon.csv")

# Context handed to RecommenderStrategyFactory: 'numeric_cols' lists the
# columns that PreProcess.scale_cols scales and that
# NumericStrategy.exec_strategy uses to compute distances.
context = {'numeric_cols' : ['lupulo_afrutado_citrico', 
                             'lupulo_floral_herbal','amargor', 'color', 
                             'maltoso', 'licoroso', 'afrutado', 'especias','acidez']
}

In [None]:
# Build a pandas-profiling report for the raw data and embed it in the notebook.
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

## Remove duplicates

According to the profile report, 60% of the rows are duplicates. We remove them below, ignoring the `vajilla` column when comparing rows.

In [None]:
# Treat rows as duplicates when they match on every column except 'vajilla'.
# The result is bound to df_clean; df is not reassigned.
df_clean = df.drop_duplicates(
subset = df.columns.difference(['vajilla'])
)

In [None]:
# Profile the de-duplicated data; this rebinds `profile` to the new report.
profile = ProfileReport(df_clean, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

# Run Recommender

In [None]:
# Build the factory with the shared context (it holds the numeric column list).
f = RecommenderStrategyFactory(context)

# 'numeric' selects NumericStrategy.
strategy = f.createStrategy('numeric')

# Munge columns, fill NaNs with the median and min-max scale the numeric columns.
datamodel = strategy.model_builder(df_clean)

# Pairwise distance matrix; cosine is the default metric.
# NOTE(review): scale_cols returns a frame with a fresh 0-based index, so the
# labels here are 0..n-1, not the 1-based beerID values created in cols_munging.
recommender_df = strategy.exec_strategy(datamodel)

recommender_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,472,473,474,475,476,477,478,479,480,481
0,0.000000,0.047415,0.003247,0.018953,0.003421,0.091687,0.074829,0.022629,0.022629,0.111810,...,0.105950,0.030003,0.044675,0.145143,0.256348,0.039177,0.026858,0.050713,0.024794,0.049222
1,0.047415,0.000000,0.028834,0.012647,0.050154,0.063655,0.053232,0.010847,0.010847,0.095622,...,0.086773,0.020033,0.032089,0.082647,0.239943,0.027402,0.062483,0.117934,0.015956,0.014698
2,0.003247,0.028834,0.000000,0.011132,0.007807,0.082056,0.057230,0.011863,0.011863,0.104905,...,0.093416,0.016190,0.033315,0.123624,0.250812,0.025874,0.028131,0.058189,0.012319,0.028870
3,0.018953,0.012647,0.011132,0.000000,0.015547,0.063519,0.058663,0.010086,0.010086,0.080221,...,0.077451,0.026505,0.017011,0.079057,0.225650,0.022433,0.040835,0.067101,0.020558,0.024629
4,0.003421,0.050154,0.007807,0.015547,0.000000,0.093026,0.079839,0.028052,0.028052,0.104384,...,0.097089,0.041824,0.036260,0.128222,0.249017,0.040898,0.033240,0.039917,0.036607,0.052592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,0.039177,0.027402,0.025874,0.022433,0.040898,0.040726,0.029397,0.010698,0.010698,0.044549,...,0.032916,0.021580,0.016716,0.050831,0.149350,0.000000,0.026939,0.048856,0.016969,0.029919
478,0.026858,0.062483,0.028131,0.040835,0.033240,0.049601,0.086540,0.025646,0.025646,0.051417,...,0.070806,0.045837,0.062186,0.114410,0.136284,0.026939,0.000000,0.045937,0.030399,0.069017
479,0.050713,0.117934,0.058189,0.067101,0.039917,0.102068,0.084820,0.072307,0.072307,0.069321,...,0.050673,0.091401,0.048418,0.099959,0.157469,0.048856,0.045937,0.000000,0.088857,0.110005
480,0.024794,0.015956,0.012319,0.020558,0.036607,0.060153,0.040360,0.004062,0.004062,0.092834,...,0.082440,0.003796,0.040091,0.112092,0.224286,0.016969,0.030399,0.088857,0.000000,0.021496


Get an example of recommendations for beerID = 1

In [None]:
# Six nearest entries (smallest distance first) to the item labelled 1.
# NOTE(review): 1 is a 0-based label of recommender_df, which differs from the
# 1-based beerID column built in cols_munging — confirm which beer is intended.
RecommenderHelper.get_top_recommendations(recommender_df, beerID=1, topk=6, sort_asc=True)

Unnamed: 0,beerID,cosine_dist
0,454,1.110223e-16
1,8,0.01084706
2,7,0.01084706
3,461,0.01154931
4,3,0.01264741
5,334,0.01457539
