In [None]:
# Reload imported modules before each cell execution so that edits to the
# project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import set_config; set_config(display='diagram')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics.pairwise import sigmoid_kernel, cosine_similarity, linear_kernel

from zytholic_project.base_model import BaseModel
from zytholic_project.evaluate import evaluate_proximity, content, test_prediction

In [None]:
# Load the three CSV inputs used throughout the notebook (relative paths).
dfbrew = pd.read_csv("../raw_data/Beers_Breweries_and_Beer Reviews/breweries.csv")
dfbeer = pd.read_csv("../raw_data/beers_style_renamed.csv")
dftop = pd.read_csv("../raw_data/top_beer_info_style_renamed.csv")

In [None]:
# Quick look at the frame loaded from top_beer_info_style_renamed.csv.
dftop

In [None]:
# Brewery-name correspondence table, indexed by its 'bbr' column and exported
# as a nested dict ({column -> {bbr -> value}}).
corres_xls = pd.read_csv('../assets/matching_brewery_names.csv').set_index('bbr')
corres = corres_xls.to_dict()

In [None]:
# Align the brewery table's column names with the beer table before joining.
dfbrew = dfbrew.rename(columns={"name": "brewery", "id": "brewery_id"})

# Attach the brewery name to every beer (left join keeps beers without a match).
dfbrewb = dfbeer.merge(dfbrew[['brewery_id', 'brewery']], how='left', on=['brewery_id'])

# Keep only the rows of dftop that match a beer on both name and brewery, and
# bring along the beer's state / country / retired columns.
dftopbrew = dftop.merge(
    dfbrewb[['name', 'brewery', 'state', 'country', 'retired']],
    how='inner',
    on=['name', 'brewery'],
)

In [None]:
working_df = dftopbrew.drop(['description', 'key', 'style key'], axis= 1).drop_duplicates()
print(working_df.shape)
working_df = working_df[working_df.retired == 'f']
working_df.shape
#working_df['style'] = [st.split(' - ')[0] for st in working_df['style']]

working_df_train, working_df_test = train_test_split(working_df, test_size=0.2, stratify=working_df['style'])


In [None]:
working_df.sample(5)

# Pipeline

In [None]:
# Every numeric column from the third one onward is fed to the taste pipeline.
# NOTE(review): the [2:] offset presumably skips 'abv' and 'ave rating', which
# are scaled separately below — confirm against the actual column order.
tastes_features = working_df.select_dtypes(np.number).columns[2:]
# (no visible effect: only the last expression of a cell is displayed)
tastes_features

# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output`
# in scikit-learn 1.2 and removed in 1.4 — update if the environment is upgraded.
pipe_style_country = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))
pipe_abv_rating = make_pipeline(MinMaxScaler())
pipe_taste_features = make_pipeline(MinMaxScaler())
# Defined but currently unused: its entry in the column transformer below is
# commented out.
pipe_state = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=''),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

# Columns not listed here are dropped (make_column_transformer's default
# remainder is 'drop').
preprocess = make_column_transformer(
    (pipe_style_country, ['style', 
                          #'country'
                         ]),
    #(pipe_state, ['state']),
    (pipe_abv_rating, ['abv', 'ave rating']),
    (pipe_taste_features, tastes_features)
)

In [None]:
# Render the (unfitted) transformer as a diagram.
preprocess

In [None]:
preprocess.fit(X_train)
X_train_proc = preprocess.transform(X_train)
X_test_proc = preprocess.transform(X_test)

In [None]:
X_train_proc.shape, X_test_proc.shape

# Inertia calculations

In [None]:

inertias2 = []
for k in range(30, 100):
    res = KMeans(n_clusters=k)
    res.fit(X)
    inertias.append(res.inertia_)

In [None]:
plt.plot(range(3, 100),inertias)

In [None]:
inertias2 = []
for k in range(10, 30):
    res = KMeans(n_clusters=k)
    res.fit(X)
    inertias2.append(res.inertia_)

In [None]:
plt.plot(range(10, 30),inertias2)

In [None]:
cluster = KMeans(n_clusters=20)
cluster.fit(X)

In [None]:
X_test['group'] = cluster.predict(y)

In [None]:
X_test.group.value_counts()

# BaseModel Class usage

In [None]:
# Run the project's BaseModel workflow step by step.
# NOTE(review): what each method does internally is defined in
# zytholic_project.base_model; the cells below rely on it exposing
# X_train_proc after process_data().
model = BaseModel()

model.get_data()

model.set_preprocess_pipeline()

model.process_data()


# Pairwise sigmoid-kernel similarity between all processed training rows.
sigmoid = sigmoid_kernel(
    model.X_train_proc,
    model.X_train_proc
)

# Disabled evaluation call (base_results is therefore not defined here):
#base_results = evaluate_proximity(model.X_train, sigmoid)

In [None]:
inertias2 = []
for k in range(155, 256, 5):
    res = model.fit(clusts= k)
    inertias2.append(res.kmeans_fit.inertia_)

In [None]:
plt.plot(range(155, 256, 5), inertias2)

In [None]:
# NOTE(review): `base_results` is first assigned further down (baseline
# evaluation on working_df_train); its only earlier assignment is commented
# out, so on a fresh kernel this cell and the next raise NameError.
base_results.plot.scatter(x='style', y='matching_percent')

In [None]:
base_results.describe().T

In [None]:
# NOTE(review): `df` is not assigned in any visible cell before this one.
df.sample(10)

In [None]:
# NOTE(review): `final_style` is not assigned in any visible cell; this cell
# and the next cannot run on a fresh kernel.
final_style.sort_values('matching_percent', ascending=False).tail(20)

In [None]:
final_style.sort_values('matching_percent', ascending=False).head(20)

In [None]:
def reformat_styles(working_df, ohe=True, style_dict=None):
    """
    Simplify the 'style' column of a DataFrame, in place.

    The detailed style name is moved to 'original_style' and 'style' is
    replaced by its simplified name. With ``ohe=True`` one 0/1 column per
    keyword (milk, old, dark, wild, pale, red, imperial) is added, set to 1
    when the keyword occurs in the lower-cased detailed style name.

    The function can be called repeatedly on the same frame: when
    'original_style' already exists it is used as the source of detailed
    names, so no duplicate columns are created and the keyword flags are
    still derived from the detailed names.

    Parameters
    ----------
    working_df : pandas.DataFrame
        Frame with a 'style' column. It is modified in place.
    ohe : bool, default True
        Whether to add the keyword indicator columns.
    style_dict : dict, optional
        Mapping detailed style -> simplified style. When omitted it is read
        from '../assets/style_convert.xlsx' (column 'Converted' ->
        column 'Simplified').

    Returns
    -------
    pandas.DataFrame
        The same object as ``working_df``.
    """
    # Keywords of the detailed style name that become indicator columns.
    features_to_implement = ['milk', 'old', 'dark', 'wild', 'pale', 'red', 'imperial']

    if style_dict is None:
        # The sheet's header is stored in its first data row; promote it, then
        # drop that row and the first column.
        style_xls = pd.read_excel('../assets/style_convert.xlsx')
        style_xls.columns = style_xls.iloc[0, :]
        style_xls = style_xls.iloc[1:, 1:]
        style_dict = style_xls.set_index('Converted').to_dict()['Simplified']

    # A frame that went through this function before already carries the
    # detailed names in 'original_style'.
    already_reformatted = 'original_style' in working_df.columns
    detailed_styles = working_df['original_style' if already_reformatted else 'style']
    simplified = detailed_styles.replace(style_dict)

    if already_reformatted:
        working_df['style'] = simplified
    else:
        working_df['simple_style'] = simplified

    # One indicator column per keyword found in the detailed style name.
    if ohe:
        for feat in features_to_implement:
            working_df[feat] = [1 if feat in elm.lower() else 0 for elm in detailed_styles]

    if not already_reformatted:
        working_df.rename(columns={'style': 'original_style', 'simple_style': 'style'}, inplace=True)

    return working_df

# Evaluation of average recommended style

In [None]:
# Work on a copy: reformat_styles modifies the frame it receives, and
# working_df must stay untouched for the other experiments.
reformated_df = reformat_styles(working_df.copy())

preprocess.fit(reformated_df)

X_train = preprocess.transform(reformated_df)

sigmoid = sigmoid_kernel(X_train, X_train)

# Evaluate the reformatted frame; the similarity matrix is passed by keyword,
# as in the other evaluate_proximity calls of this notebook.
results = evaluate_proximity(reformated_df, sim_matrix=sigmoid)

Results with `features_to_implement` OHE + simplified style

In [None]:
# Scatter of the 'matching_percent' column returned by evaluate_proximity
# against 'style' (presumably one point per evaluated beer — confirm against
# evaluate_proximity).
results.plot.scatter(x='style', y='matching_percent')

In [None]:
# Summary statistics of the numeric result columns, one row per column.
results.describe().T

Results without `features_to_implement` OHE + simplified style

In [None]:
reformated_df = reformat_styles(working_df, ohe=False)

preprocess.fit(reformated_df)

X_train = preprocess.transform(reformated_df)

sigmoid = sigmoid_kernel(X_train, X_train)

results = evaluate_proximity(working_df, sigmoid)

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

In [None]:
results['style'].nunique()

Notes:
    
    10 beers in the recommendations might be too high;
    try to reduce it to 5.
    
    Also increase the number of samples to test from.

In [None]:
reformated_df = working_df_train
preprocess.fit(reformated_df)
X_train = preprocess.transform(reformated_df)
sigmoid = sigmoid_kernel(X_train, X_train)

base_results = evaluate_proximity(working_df_train,  n_recomm=10, tests=30, sim_matrix=sigmoid)

In [None]:
base_results.plot.scatter(x='style', y='matching_percent')

In [None]:
base_results.describe().T

In [None]:
base_results

In [None]:
reformated_df = reformat_styles(working_df_train)
preprocess.fit(reformated_df)
X_train = preprocess.transform(reformated_df)
sigmoid = sigmoid_kernel(X_train, X_train)

results = evaluate_proximity(working_df_train,  n_recomm=5, tests=30, sim_matrix=sigmoid)

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

In [None]:
reformated_df = reformat_styles(working_df_train, ohe=False)
preprocess.fit(reformated_df)
X_train = preprocess.transform(reformated_df)
sigmoid = sigmoid_kernel(X_train, X_train)

results = evaluate_proximity(working_df_train,  n_recomm=5, tests=30, sim_matrix=sigmoid)

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

In [None]:
X_train.shape

In [None]:
working_df_train

# Comparison of similarity metrics

In [None]:
# Build a fresh BaseModel for the metric comparison below: load the data, set
# up the preprocessing pipeline, then process the data.
# NOTE(review): the following cells read model.X_train and model.X_train_proc,
# which are presumably populated by these calls — confirm in BaseModel.
model = BaseModel()

model.get_data()

model.set_preprocess_pipeline()

model.process_data()

In [None]:
# compute sigmoid distance
sigmoid = sigmoid_kernel(
    model.X_train_proc,
    model.X_train_proc
)

sig_results = evaluate_proximity(model.X_train,  n_recomm=5, tests=30, sim_matrix=sigmoid)
sig_results.describe().T

In [None]:
cosine = cosine_similarity(
    model.X_train_proc,
    model.X_train_proc
)

cosine_results = evaluate_proximity(model.X_train,  n_recomm=5, tests=30, sim_matrix=cosine)
cosine_results.describe().T

In [None]:
linear = linear_kernel(
    model.X_train_proc,
    model.X_train_proc
)

linear_results = evaluate_proximity(model.X_train,  n_recomm=5, tests=30, sim_matrix=linear)
linear_results.describe().T

# Predictions for beers in the test set

In [None]:
def test_prediction(model, n_recomm = 10):
    """
    Score how well the nearest training beers agree with each test beer.

    NOTE: this definition shadows the ``test_prediction`` imported from
    ``zytholic_project.evaluate`` at the top of the notebook.

    For every row of ``model.X_test_proc`` the ``n_recomm`` rows of
    ``model.X_train_proc`` with the highest cosine similarity are selected and
    compared with the test beer.

    Parameters
    ----------
    model : object
        Must expose ``X_train`` and ``X_test`` (DataFrames with a 'style'
        column) and ``X_train_proc`` / ``X_test_proc`` (feature matrices whose
        rows are positionally aligned with those frames).
    n_recomm : int, default 10
        Number of training beers retrieved per test beer.

    Returns
    -------
    pandas.DataFrame
        Copy of ``model.X_test`` with two extra columns:
        'matching_percent' (percentage of retrieved beers whose 'style' equals
        the test beer's) and 'substyle' (lowest per-column agreement rate over
        the last seven columns).
    """
    # Similarity of every test row (rows) to every training row (columns).
    similarity = cosine_similarity(model.X_test_proc, model.X_train_proc)

    matching_results = []
    substyles = []
    for idx in range(similarity.shape[0]):
        # Positions of the n_recomm most similar training rows, best first.
        # A stable sort on the negated scores keeps tied rows in their
        # original order. Test beers are not part of the training set, so no
        # extra slot is needed to absorb a self-match.
        beers_indices = np.argsort(-similarity[idx, :], kind='stable')[:n_recomm]
        propositions = model.X_train.iloc[beers_indices, :]

        # Percentage of propositions sharing the test beer's style.
        original_style = model.X_test["style"].iloc[idx]
        matching_percent = (propositions['style'] == original_style).mean() * 100
        matching_results.append(matching_percent)

        # Per-column agreement between the test beer and its propositions;
        # the lowest column rate is kept.
        # NOTE(review): assumes the last seven columns of X_train / X_test are
        # the 0/1 substyle flags — confirm against BaseModel.
        original_substyle = model.X_test.iloc[idx, -7:]
        comp = propositions.iloc[:, -7:]
        substyle_match = ((original_substyle == comp).sum() / comp.shape[0]).min()
        substyles.append(substyle_match)

    results = model.X_test.copy()
    results['matching_percent'] = matching_results
    results['substyle'] = substyles
    return results

In [None]:
results = test_prediction(model)


results.groupby('style')[['matching_percent', 'substyle']].agg('describe')

In [None]:
a = model.X_train.iloc[0, -7:]

b = model.X_train.iloc[0:20, -7:]

In [None]:
((a == b).sum() / b.shape[0])

In [None]:
results = test_prediction(model)


results.groupby('style')[['matching_percent', 'matching_substyle']].agg('describe')


# Api Call Functions

In [41]:
# NOTE(review): wildcard import placed late in the notebook; only
# get_most_similar_beers is used in the visible cells — prefer importing it
# explicitly in the top import cell.
from zytholic_project.apicall import *

In [43]:
# Five beers closest to 'Donnybrook Stout' according to the project API.
get_most_similar_beers('Donnybrook Stout', n_beers=5)

Unnamed: 0,name,original_style,brewery,abv,ave rating,min ibu,max ibu,astringency,body,alcohol,...,country,retired,style,milk,old,dark,wild,pale,red,imperial
3678,Murphy's Irish Stout,Stout Dry - Irish,Murphy Brewery Ireland Limited,4.0,3.78,30,40,15,98,9,...,IE,f,Stout,0,0,0,0,0,0,0
3703,Boston Irish Stout,Stout Dry - Irish,Harpoon Brewery & Beer Hall,4.3,3.81,30,40,15,84,4,...,US,f,Stout,0,0,0,0,0,0,0
3698,O'Reilly's Irish Stout,Stout Dry - Irish,Sly Fox Brewing Company,3.6,3.71,30,40,23,92,2,...,US,f,Stout,0,0,0,0,0,0,0
3690,O.V.L. Stout,Stout Dry - Irish,Russian River Brewing Company,4.4,3.65,30,40,16,77,2,...,US,f,Stout,0,0,0,0,0,0,0
3687,Guinness Draught Extra Cold,Stout Dry - Irish,Guinness Ltd.,4.2,3.56,30,40,14,84,8,...,IE,f,Stout,0,0,0,0,0,0,0
