In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import set_config; set_config(display='diagram')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics.pairwise import sigmoid_kernel, cosine_similarity, linear_kernel

from zytholic_project.base_model import BaseModel
from zytholic_project.evaluate import evaluate_proximity, get_recommendations, test_prediction

In [4]:
# Raw inputs, read from paths relative to the notebook's working directory:
#   dfbrew - brewery table
#   dfbeer - beer table (style names already renamed, per the file name)
#   dftop  - "top beer" table with taste descriptors (style names already renamed)
dfbrew = pd.read_csv("../raw_data/Beers_Breweries_and_Beer Reviews/breweries.csv")
dfbeer = pd.read_csv("../raw_data/beers_style_renamed.csv")
dftop = pd.read_csv("../raw_data/top_beer_info_style_renamed.csv")

In [21]:
dftop.head()

Unnamed: 0,name,key,style,style key,brewery,description,abv,ave rating,min ibu,max ibu,...,body,alcohol,bitter,sweet,sour,salty,fruits,hoppy,spices,malty
0,Amber,251,Altbier,8,Alaskan Brewing Co.,"Notes:Richly malty and long on the palate, wit...",5.3,3.65,25,50,...,32,9,47,74,33,0,33,57,8,111
1,Double Bag,252,Altbier,8,Long Trail Brewing Co.,"Notes:This malty, full-bodied double alt is al...",7.2,3.9,25,50,...,57,18,33,55,16,0,24,35,12,84
2,Long Trail Ale,253,Altbier,8,Long Trail Brewing Co.,Notes:Long Trail Ale is a full-bodied amber al...,5.0,3.58,25,50,...,37,6,42,43,11,0,10,54,4,62
3,Doppelsticke,254,Altbier,8,Uerige Obergärige Hausbrauerei,Notes:,8.5,4.15,25,50,...,55,31,47,101,18,1,49,40,16,119
4,Scurry,255,Altbier,8,Off Color Brewing,Notes:Just cause it's dark and German doesn't ...,5.3,3.67,25,50,...,69,10,63,120,14,0,19,36,15,218


In [None]:
# Brewery-name correspondence table, indexed by its 'bbr' column
corres_xls = pd.read_csv('../assets/matching_brewery_names.csv').set_index('bbr')
# Nested dict: {column name -> {'bbr' value -> cell value}}
corres = corres_xls.to_dict()

In [None]:
# Give the brewery table the column names used by the beer tables
dfbrew = dfbrew.rename(columns={"name": "brewery", "id": "brewery_id"})

# Attach the brewery name to every beer; the left join keeps beers whose
# brewery_id has no match
dfbrewb = dfbeer.merge(
    dfbrew[['brewery_id', 'brewery']],
    how='left',
    on=['brewery_id'],
)

# Keep only the top beers that can be matched on (beer name, brewery name)
dftopbrew = dftop.merge(
    dfbrewb[['name', 'brewery', 'state', 'country', 'retired']],
    how='inner',
    on=['name', 'brewery'],
)

In [None]:
# Drop the free-text / key columns, then remove exact duplicate rows
working_df = dftopbrew.drop(['description', 'key', 'style key'], axis=1).drop_duplicates()
print(working_df.shape)

# Keep rows whose 'retired' flag is 'f' (presumably "not retired" - confirm the encoding).
# .copy() detaches the result from the unfiltered frame so that later column
# assignments do not trigger SettingWithCopyWarning.
working_df = working_df[working_df.retired == 'f'].copy()
print(working_df.shape)

# Stratify on style so every style is represented in both splits; a fixed
# random_state makes the split reproducible across kernel restarts.
working_df_train, working_df_test = train_test_split(
    working_df,
    test_size=0.2,
    stratify=working_df['style'],
    random_state=42,
)


In [None]:
working_df.sample(5)

# Pipeline

In [None]:
# Taste descriptors = every numeric column except the first two numeric ones.
# NOTE(review): this is positional - presumably the two skipped columns are 'abv' and
# 'ave rating' (scaled separately below); confirm against working_df's column order.
tastes_features = working_df.select_dtypes(np.number).columns[2:]
tastes_features

# One sub-pipeline per feature family.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4 - confirm the pinned scikit-learn version.
pipe_style_country = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))
pipe_abv_rating = make_pipeline(MinMaxScaler())
pipe_taste_features = make_pipeline(MinMaxScaler())
# pipe_state is defined but not wired into `preprocess` below (its entry is commented out)
pipe_state = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=''),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

# Columns not listed here are dropped (make_column_transformer's default remainder)
preprocess = make_column_transformer(
    (pipe_style_country, ['style', 
                          #'country'
                         ]),
    #(pipe_state, ['state']),
    (pipe_abv_rating, ['abv', 'ave rating']),
    (pipe_taste_features, tastes_features)
)

In [None]:
preprocess

In [None]:
# Fit the column transformer on the training frame only, then apply it to both splits.
# NOTE(review): X_train / X_test are not assigned in the visible cells above (the split
# cell creates working_df_train / working_df_test) - confirm which frames are meant.
preprocess.fit(X_train)
X_train_proc = preprocess.transform(X_train)
X_test_proc = preprocess.transform(X_test)

In [None]:
X_train_proc.shape, X_test_proc.shape

# Inertia calculations

In [None]:

# K-Means inertia for k = 30..99 (coarse elbow search).
# The accumulator must be the list that is appended to and plotted afterwards.
# NOTE(review): `X` is not assigned in the visible cells - presumably the preprocessed
# feature matrix; confirm.
inertias = []
for k in range(30, 100):
    res = KMeans(n_clusters=k)
    res.fit(X)
    inertias.append(res.inertia_)

In [None]:
# x values must match the k range of the loop that filled `inertias` (30..99),
# otherwise matplotlib raises on the x/y length mismatch
plt.plot(range(30, 100), inertias)

In [None]:
# K-Means inertia for k = 10..29 (finer elbow search on the low range).
# NOTE(review): `X` is not assigned in the visible cells - confirm which matrix is meant.
# KMeans is created without random_state, so the values vary between runs.
inertias2 = []
for k in range(10, 30):
    res = KMeans(n_clusters=k)
    res.fit(X)
    inertias2.append(res.inertia_)

In [None]:
plt.plot(range(10, 30),inertias2)

In [None]:
cluster = KMeans(n_clusters=20)
cluster.fit(X)

In [None]:
# Store the predicted K-Means cluster of each row in a new 'group' column.
# NOTE(review): `y` is not defined in the visible cells; presumably this should be the
# test matrix preprocessed the same way as the `X` that `cluster` was fitted on - confirm.
X_test['group'] = cluster.predict(y)

In [None]:
X_test.group.value_counts()

# BaseModel Class usage

In [None]:
# Same workflow through the project's BaseModel wrapper.
# NOTE(review): the four calls below are project code not shown here; judging by the
# attributes read afterwards they populate model.X_train / model.X_train_proc - confirm.
model = BaseModel()

model.get_data()

model.set_preprocess_pipeline()

model.process_data()


# Sigmoid kernel (a similarity, not a distance) between every pair of training rows
sigmoid = sigmoid_kernel(
    model.X_train_proc,
    model.X_train_proc
)

# NOTE(review): `base_results` is read by later plotting cells; with this line
# commented out those cells only work if it was assigned elsewhere first.
#base_results = evaluate_proximity(model.X_train, sigmoid)

In [None]:
# Inertia for cluster counts 155, 160, ..., 255, refitting the model each time.
# NOTE(review): assumes model.fit(clusts=k) returns an object exposing a fitted
# KMeans as `.kmeans_fit` - confirm against BaseModel.
inertias2 = []
for k in range(155, 256, 5):
    res = model.fit(clusts= k)
    inertias2.append(res.kmeans_fit.inertia_)

In [None]:
plt.plot(range(155, 256, 5), inertias2)

In [None]:
base_results.plot.scatter(x='style', y='matching_percent')

In [None]:
base_results.describe().T

In [None]:
df.sample(10)

In [None]:
final_style.sort_values('matching_percent', ascending=False).tail(20)

In [None]:
final_style.sort_values('matching_percent', ascending=False).head(20)

In [None]:
def reformat_styles(working_df, ohe=True,
                    style_table_path='../assets/style_convert.xlsx',
                    features_to_implement=('milk', 'old', 'dark', 'wild', 'pale', 'red', 'imperial')):
    """
    Return a copy of ``working_df`` whose 'style' column holds simplified style names.

    The detailed style is kept under 'original_style'. When ``ohe`` is true, one 0/1
    column per keyword in ``features_to_implement`` is appended, set to 1 when the
    keyword occurs (case-insensitive substring match) in the detailed style name.

    The input frame is NOT modified, so the function can be called repeatedly on
    the same frame (e.g. once with ``ohe=True`` and once with ``ohe=False``).

    Parameters
    ----------
    working_df : pandas.DataFrame
        Must contain a 'style' column with the detailed style names.
    ohe : bool, default True
        Whether to append the keyword indicator columns.
    style_table_path : str, default '../assets/style_convert.xlsx'
        Excel sheet holding the style matching table. Its first data row carries
        the real headers (including 'Converted' and 'Simplified') and its first
        column is discarded.
    features_to_implement : iterable of str
        Lower-case keywords to turn into indicator columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``working_df`` with 'style' renamed to 'original_style', a new
        'style' column (simplified name; styles absent from the table are kept
        unchanged) and, if requested, the indicator columns appended last.
    """
    # Get matching table for styles names and format it: promote the first row to
    # header, then drop that row and the first column
    style_xls = pd.read_excel(style_table_path)
    style_xls.columns = style_xls.iloc[0, :]
    style_xls = style_xls.iloc[1:, 1:]

    # Detailed name -> simplified name
    style_dict = style_xls.set_index('Converted')['Simplified'].to_dict()

    # Work on a copy: mutating the caller's frame made a second call operate on
    # already-simplified styles and produced a duplicate 'original_style' column
    result = working_df.copy()
    detailed_style = result['style']
    result['simple_style'] = detailed_style.replace(style_dict)

    # One-Hot-Encoding of features_to_implement, computed on the detailed style
    if ohe:
        lowered = detailed_style.str.lower()
        for feat in features_to_implement:
            # na=False: a missing style simply matches no keyword
            result[feat] = lowered.str.contains(feat, regex=False, na=False).astype(int)

    return result.rename(columns={'style': 'original_style', 'simple_style': 'style'})

# Evaluation of average recommended style

In [None]:
# Simplify the styles (and add the style-keyword flags), refit the preprocessing
# on that frame and rebuild the pairwise sigmoid-kernel similarity matrix
reformated_df = reformat_styles(working_df)

preprocess.fit(reformated_df)

X_train = preprocess.transform(reformated_df)

sigmoid = sigmoid_kernel(X_train, X_train)

# Evaluate on the frame that was actually transformed, instead of relying on
# reformat_styles having modified working_df in place
results = evaluate_proximity(reformated_df, sigmoid)

Results with the `features_to_implement` one-hot columns + simplified style

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

Results without the `features_to_implement` one-hot columns (simplified style only)

In [None]:
# Same evaluation with simplified styles only (no style-keyword flags)
reformated_df = reformat_styles(working_df, ohe=False)

preprocess.fit(reformated_df)

X_train = preprocess.transform(reformated_df)

sigmoid = sigmoid_kernel(X_train, X_train)

# Evaluate on the frame that was actually transformed, instead of relying on
# reformat_styles having modified working_df in place
results = evaluate_proximity(reformated_df, sigmoid)

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

In [None]:
results['style'].nunique()

Notes:
    
    10 beers per recommendation might be too high;
    try reducing it to 5
    
    Also increase number of samples to test from

In [None]:
# Baseline on the training split, used as-is (reformat_styles is not called in this cell):
# refit the preprocessing, build the sigmoid-kernel similarity matrix and evaluate.
reformated_df = working_df_train
preprocess.fit(reformated_df)
X_train = preprocess.transform(reformated_df)
sigmoid = sigmoid_kernel(X_train, X_train)

# n_recomm / tests are evaluate_proximity's own parameters - presumably the number of
# recommended beers per query and the number of sampled queries; confirm in evaluate.py
base_results = evaluate_proximity(working_df_train,  n_recomm=10, tests=30, sim_matrix=sigmoid)

In [None]:
base_results.plot.scatter(x='style', y='matching_percent')

In [None]:
base_results.describe().T

In [None]:
base_results

In [None]:
# Training split with simplified styles + style-keyword flags
reformated_df = reformat_styles(working_df_train)
preprocess.fit(reformated_df)
X_train = preprocess.transform(reformated_df)
sigmoid = sigmoid_kernel(X_train, X_train)

# Evaluate on the frame that was actually transformed, instead of relying on
# reformat_styles having modified working_df_train in place
results = evaluate_proximity(reformated_df,  n_recomm=5, tests=30, sim_matrix=sigmoid)

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

In [None]:
# Training split with simplified styles only (no style-keyword flags)
reformated_df = reformat_styles(working_df_train, ohe=False)
preprocess.fit(reformated_df)
X_train = preprocess.transform(reformated_df)
sigmoid = sigmoid_kernel(X_train, X_train)

# Evaluate on the frame that was actually transformed, instead of relying on
# reformat_styles having modified working_df_train in place
results = evaluate_proximity(reformated_df,  n_recomm=5, tests=30, sim_matrix=sigmoid)

In [None]:
results.plot.scatter(x='style', y='matching_percent')

In [None]:
results.describe().T

In [None]:
X_train.shape

In [None]:
working_df_train

# Comparison of similarity metrics

In [None]:
# Fresh BaseModel for the metric comparison below.
# NOTE(review): the cells that follow read model.X_train and model.X_train_proc, so these
# calls are presumably what populates them - confirm against BaseModel.
model = BaseModel()

model.get_data()

model.set_preprocess_pipeline()

model.process_data()

In [None]:
# compute sigmoid distance
# Sigmoid kernel (a similarity, not a distance) between every pair of training rows
sigmoid = sigmoid_kernel(
    model.X_train_proc,
    model.X_train_proc
)

# Same evaluation settings as the cosine and linear cells so the three are comparable
sig_results = evaluate_proximity(model.X_train,  n_recomm=5, tests=30, sim_matrix=sigmoid)
sig_results.describe().T

In [None]:
# Cosine similarity between every pair of training rows
cosine = cosine_similarity(
    model.X_train_proc,
    model.X_train_proc
)

# Same evaluation settings as the sigmoid and linear cells so the three are comparable
cosine_results = evaluate_proximity(model.X_train,  n_recomm=5, tests=30, sim_matrix=cosine)
cosine_results.describe().T

In [None]:
# Linear kernel (plain dot product) between every pair of training rows
linear = linear_kernel(
    model.X_train_proc,
    model.X_train_proc
)

# Same evaluation settings as the sigmoid and cosine cells so the three are comparable
linear_results = evaluate_proximity(model.X_train,  n_recomm=5, tests=30, sim_matrix=linear)
linear_results.describe().T

# Predictions for beers in the test set

In [None]:
def test_prediction(model, n_recomm = 10):
    
    # Use the features to calculate kernel (calculate all distances at once?)
    sigmoid = cosine_similarity(model.X_test_proc, model.X_train_proc)

    matching_results = []
    substyles = []
    # Predict the closest beers in reference dataset
    for idx in range(sigmoid.shape[0]):
        distances = sigmoid[idx, :]
        sorted_distances = sorted(
                list(enumerate(distances)),
                key=lambda x:x[1],reverse=True)
        # get the top n predictions
        closest_items = sorted_distances[0:n_recomm+1]
        beers_indices = [i[0] for i in closest_items]
        
        # check original_style == predict_style percentage
        original_style = model.X_test["style"].iloc[idx]
        propositions = model.X_train.iloc[beers_indices, :]
        matching_percent = propositions[propositions['style'] == original_style].shape[0]\
            / propositions.shape[0] * 100
        matching_results.append(matching_percent)
        
        #compare substyle matching
        original_substyle = model.X_test.iloc[idx, -7:]
        comp = propositions.iloc[:, -7:]
        substyle_match = ((original_substyle == comp).sum()/ comp.shape[0]).min()
        substyles.append(substyle_match)
    
    results = model.X_test.copy()
    results['matching_percent'] = matching_results
    results['substyle'] = substyles
    return results

In [None]:
results = test_prediction(model)


results.groupby('style')[['matching_percent', 'substyle']].agg('describe')

In [None]:
a = model.X_train.iloc[0, -7:]

b = model.X_train.iloc[0:20, -7:]

In [None]:
((a == b).sum() / b.shape[0])

In [None]:
results = test_prediction(model)

# test_prediction names its second score column 'substyle'
# (selecting 'matching_substyle' raises a KeyError)
results.groupby('style')[['matching_percent', 'substyle']].agg('describe')


# Api Call Functions

In [3]:
from zytholic_project.apicall import *

In [45]:
# Import data, preprocess it
# To extract for function and to be executed only once
# Load the data and fit the preprocessing on the whole working_df (no train/test split here).
# TODO: extract into a function that is executed only once.
model = BaseModel()
model.get_data()
model.set_preprocess_pipeline()
model.preprocess.fit(model.working_df)
model.X = model.preprocess.transform(model.working_df)

# Pairwise cosine similarity between all beers of working_df.
# TODO: extract into a function that is executed only once.

kernel = cosine_similarity(model.X, model.X)


In [199]:
abv = 4.9
name = 'Double Bag'

In [229]:
model.working_df.sample(3)

Unnamed: 0,name,original_style,brewery,abv,ave rating,min ibu,max ibu,astringency,body,alcohol,...,country,retired,style,milk,old,dark,wild,pale,red,imperial
3706,Nitro Dry Irish Stout,Stout Dry - Irish,Breckenridge Brewery,4.8,3.77,30,40,24,102,4,...,US,f,Stout,0,0,0,0,0,0,0
2251,Lambic Blend,Gueuze - Belgian,Gueuzerie Tilquin,6.5,4.14,0,10,5,2,0,...,BE,f,Lambic,0,0,0,0,0,0,0
4195,Schneider Weisse Tap 1 Mein Blondes,Wheat Beer Hefeweizen,Weisses Bräuhaus G. Schneider & Sohn GmbH,4.9,4.17,10,15,22,38,14,...,DE,f,Wheat,0,0,0,0,0,0,0


In [58]:
model.working_df

Unnamed: 0,name,original_style,brewery,abv,ave rating,min ibu,max ibu,astringency,body,alcohol,...,country,retired,style,milk,old,dark,wild,pale,red,imperial
0,Double Bag,Altbier,Long Trail Brewing Co.,7.2,3.90,25,50,12,57,18,...,US,f,Altbier,0,0,0,0,0,0,0
1,Long Trail Ale,Altbier,Long Trail Brewing Co.,5.0,3.58,25,50,14,37,6,...,US,f,Altbier,0,0,0,0,0,0,0
2,Scurry,Altbier,Off Color Brewing,5.3,3.67,25,50,21,69,10,...,US,f,Altbier,0,0,0,0,0,0,0
3,Sleigh'r Dark Doüble Alt Ale,Altbier,Ninkasi Brewing Company,7.2,3.78,25,50,25,51,26,...,US,f,Altbier,0,0,0,0,0,0,0
4,Okto Festival Ale,Altbier,Widmer Brothers Brewing Company,5.3,3.46,25,50,28,40,3,...,US,f,Altbier,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4389,The First Snow Ale,Winter Warmer,RJ Rockers Brewing Company,6.0,3.63,35,50,15,31,23,...,US,f,Winter Warmer,0,0,0,0,0,0,0
4390,Red Nose Winter Ale,Winter Warmer,Natty Greene's Pub & Brewing Co.,6.8,3.59,35,50,8,44,24,...,US,f,Winter Warmer,0,0,0,0,0,0,0
4391,Fish Tale Winterfish,Winter Warmer,Fish Brewing Company / Fishbowl Brewpub,7.5,3.76,35,50,11,36,50,...,US,f,Winter Warmer,0,0,0,0,0,0,0
4392,"He'Brew Hanukkah, Chanukah: Pass The Beer",Winter Warmer,Shmaltz Brewing Company,8.0,3.61,35,50,6,64,30,...,US,f,Winter Warmer,0,0,0,0,0,0,0


In [201]:
# Index labels of the beers to exclude from the recommendations because they are
# stronger than the requested ABV. Starts empty so the name is always defined for
# the get_recommendations call that follows, even when no ABV cap is given.
bad_index_abv = set()
if abv is not None:
    too_strong = model.working_df[model.working_df['abv'] > abv]
    bad_index_abv = set(too_strong.index)
    # NOTE(review): mixes index labels with the value returned by get_name_index -
    # presumably equivalent because working_df has a 0..n-1 index; confirm.
    name_position = get_name_index(name, model.working_df)
    bad_index_abv.discard(int(name_position))  # keep current beer position in kernel
        

In [202]:
results = get_recommendations(model.working_df, name, 
                      sim_matrix=kernel, n_recomm=5,
                        ignore_index=bad_index_abv)

In [206]:
results

Unnamed: 0,name,original_style,brewery,abv,ave rating,min ibu,max ibu,astringency,body,alcohol,...,country,retired,style,milk,old,dark,wild,pale,red,imperial
19,Frankenheim Alt,Altbier,Privatbrauerei Frankenheim,4.8,3.76,25,50,19,43,5,...,DE,f,Altbier,0,0,0,0,0,0,0
21,Diebels Premium Altbier,Altbier,Brauerei Diebels GmbH & Co KG,4.9,3.72,25,50,21,27,5,...,DE,f,Altbier,0,0,0,0,0,0,0
34,Duckstein Rotblondes Original,Altbier,Duckstein GmbH,4.9,3.51,25,50,17,31,5,...,DE,f,Altbier,0,0,0,0,0,0,0
6,Copper,Altbier,The Olde Mecklenburg Brewery,4.8,4.1,25,50,25,35,4,...,US,f,Altbier,0,0,0,0,0,0,0
15,Headwall Alt,Altbier,Tuckerman Brewing Co.,4.5,3.69,25,50,19,42,13,...,US,f,Altbier,0,0,0,0,0,0,0


In [64]:
get_most_similar_beers_ibu_abv("He'Brew Hanukkah, Chanukah: Pass The Beer",ibu=20, abv=16)

> [0;32m/Users/alix/code/ltexp1998/zytholic_project/zytholic_project/evaluate.py[0m(41)[0;36mget_recommendations[0;34m()[0m
[0;32m     40 [0;31m    [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 41 [0;31m    [0mbeers_indices[0m [0;34m=[0m [0mbeers_indices[0m[0;34m[[0m[0;34m:[0m[0mn_recomm[0m[0;34m+[0m[0;36m1[0m[0;34m][0m [0;31m# Remove 1st beer[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     42 [0;31m    [0;32mreturn[0m [0mdf[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0mbeers_indices[0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


{'name': {4323: "He'Brew Hanukkah, Chanukah: Pass The Beer",
  3086: 'Roggen',
  4061: '10 Degrees Below',
  3084: 'Rasselbock',
  3083: 'Rogue Farms Roguenbier Rye',
  4056: 'Andechs Weissbier Dunkel'},
 'brewery': {4323: 'Shmaltz Brewing Company',
  3086: 'Apostelbräu',
  4061: 'Scuttlebutt Brewing Company - Restaurant and Pub',
  3084: 'Goose Island Beer Co.',
  3083: 'Rogue Ales',
  4056: 'Klosterbrauerei Andechs'},
 'style': {4323: 'Winter Warmer',
  3086: 'Rye',
  4061: 'Wheat',
  3084: 'Rye',
  3083: 'Rye',
  4056: 'Wheat'},
 'abv': {4323: 8.0, 3086: 5.3, 4061: 7.4, 3084: 6.8, 3083: 5.5, 4056: 5.0},
 'min ibu': {4323: 35, 3086: 10, 4061: 10, 3084: 10, 3083: 10, 4056: 10},
 'max ibu': {4323: 50, 3086: 20, 4061: 15, 3084: 20, 3083: 20, 4056: 15}}

In [57]:
get_most_similar_beers_ibu_abv('Amber',ibu=50, abv=5.)

> [0;32m/Users/alix/code/ltexp1998/zytholic_project/zytholic_project/evaluate.py[0m(41)[0;36mget_recommendations[0;34m()[0m
[0;32m     40 [0;31m    [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 41 [0;31m    [0mbeers_indices[0m [0;34m=[0m [0mbeers_indices[0m[0;34m[[0m[0;34m:[0m[0mn_recomm[0m[0;34m+[0m[0;36m1[0m[0;34m][0m [0;31m# Remove 1st beer[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     42 [0;31m    [0;32mreturn[0m [0mdf[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0mbeers_indices[0m[0;34m,[0m [0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


{'name': {1534: 'Amber',
  1540: 'Grain Belt Nordeast',
  1558: 'American Amber',
  1538: 'Barking Squirrel Lager',
  1549: "Schell's Dark",
  1561: 'Ruby Lager'},
 'brewery': {1534: 'Abita Brewing Co.',
  1540: 'August Schell Brewing Company',
  1558: 'Straub Brewery',
  1538: 'Hop City Brewing Co.',
  1549: 'August Schell Brewing Company',
  1561: 'Olde Hickory Brewery'},
 'style': {1534: 'Lager',
  1540: 'Lager',
  1558: 'Lager',
  1538: 'Lager',
  1549: 'Lager',
  1561: 'Lager'},
 'abv': {1534: 4.5, 1540: 4.7, 1558: 4.1, 1538: 5.0, 1549: 4.8, 1561: 4.0},
 'min ibu': {1534: 18, 1540: 18, 1558: 18, 1538: 18, 1549: 18, 1561: 18},
 'max ibu': {1534: 30, 1540: 30, 1558: 30, 1538: 30, 1549: 30, 1561: 30}}

In [54]:
model.working_df.loc[[1534, 1533, 1540, 1538, 1549, 1565],:]

Unnamed: 0,name,original_style,brewery,abv,ave rating,min ibu,max ibu,astringency,body,alcohol,...,country,retired,style,milk,old,dark,wild,pale,red,imperial
1534,Milwaukee's Best,Lager - Adjunct,Miller Brewing Co.,4.8,1.8,8,18,21,17,11,...,US,f,Lager,0,0,0,0,0,0,0
1533,Solid Gold,Lager - Adjunct,Founders Brewing Company,4.4,3.65,8,18,9,13,2,...,US,f,Lager,0,0,0,0,0,0,0
1540,Grain Belt Premium,Lager - Adjunct,August Schell Brewing Company,4.7,3.27,8,18,15,17,8,...,US,f,Lager,0,0,0,0,0,0,0
1538,Genesee Beer,Lager - Adjunct,Genesee Brewing Co. / Dundee Brewing Co.,4.5,2.67,8,18,11,33,3,...,US,f,Lager,0,0,0,0,0,0,0
1549,Birra Peroni,Lager - Adjunct,Birra Peroni Industriale S.p.A.,4.7,2.72,8,18,21,24,5,...,IT,f,Lager,0,0,0,0,0,0,0
1565,Point Classic Amber,Lager Red Amber - American,Stevens Point Brewery,4.9,3.28,18,30,22,39,5,...,US,f,Lager,0,0,0,0,0,1,0
