In [0]:
!pip install eli5
!pip install plotly_express



In [0]:
import pandas as pd
import numpy as np
# Let DataFrame cells render up to 200 characters so ingredient lists stay readable.
pd.set_option('display.max_colwidth', 200)

In [0]:
from google.colab import files
# Prompt for a file upload through google.colab; the next cell reads
# 'train.json' by relative path, so that file is expected to be among the uploads.
uploaded = files.upload()

In [0]:
# Load the recipe data and discard the 'id' column, which is not used as a feature.
df = pd.read_json('train.json')
df = df.drop(columns=['id'])
print(df.shape)
df.head()

(39774, 2)


Unnamed: 0,cuisine,ingredients
0,greek,"[romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles]"
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil]"
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, green chilies, grilled chicken breasts, garlic powder, yellow onion, soy sauce, butter, chicken livers]"
3,indian,"[water, vegetable oil, wheat, salt]"
4,indian,"[black pepper, shallots, cornflour, cayenne pepper, onions, garlic paste, milk, butter, salt, lemon juice, water, chili powder, passata, oil, ground cumin, boneless chicken skinless thigh, garam m..."


In [0]:
# Collapse each recipe's ingredient list into one space-separated string so it
# can be passed to a text vectorizer.
# Values that are already strings are returned unchanged: ' '.join on a str
# would insert a space between every character, so without this guard
# re-running the cell would silently corrupt the column.
df['ingredients'] = df['ingredients'].apply(
    lambda items: items if isinstance(items, str) else ' '.join(items)
)

In [0]:
import plotly_express as px
# Number of recipes per cuisine.
# size() counts rows in each group directly; the previous np.count_nonzero
# aggregation counted truthy ingredient strings instead, which would skip any
# recipe whose ingredient string is empty.
cuisine_counts = df.groupby('cuisine').size().reset_index(name='recipe_count')
px.bar(
    cuisine_counts,
    x='cuisine',
    y='recipe_count',
    color='cuisine',
    title='Recipes per cuisine',
)

### Train/Test split

In [0]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split, stratified on the cuisine label; the fixed
# random_state makes the partition repeatable between runs.
train, test = train_test_split(
    df,
    train_size=0.80,
    test_size=0.20,
    stratify=df['cuisine'],
    random_state=42,
)
train.shape, test.shape

((31819, 2), (7955, 2))

### Arrange data into X features matrix and y target vector

In [0]:
# Split each partition into a feature frame (everything except the label)
# and the label series.
target = 'cuisine'
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((31819, 1), (31819,), (7955, 1), (7955,))

### CountVectorizer with a logistic regression model

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# Token-count features feeding a logistic regression classifier.
# vec and clf keep their own names because later cells inspect them directly
# (clf.classes_, clf.coef_, the vectorizer's feature names) after the fit.
vec, clf = CountVectorizer(), LogisticRegression()
pipe = make_pipeline(vec, clf)
pipe.fit(X_train['ingredients'], y_train)







Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,


In [0]:
from sklearn import metrics

def print_report(pipe, X=None, y=None):
    """Print a per-class classification report followed by overall accuracy.

    Parameters
    ----------
    pipe
        Fitted estimator or pipeline exposing ``predict``.
    X : optional
        Inputs to predict on. When omitted, the notebook-level
        ``X_test['ingredients']`` is used.
    y : optional
        True labels aligned with ``X``. When omitted, the notebook-level
        ``y_test`` is used.

    Returns
    -------
    None. The report and the accuracy line are written to stdout.
    """
    # Fall back to the held-out split so existing print_report(pipe) calls
    # keep working unchanged.
    y_actuals = y_test if y is None else y
    inputs = X_test['ingredients'] if X is None else X
    y_preds = pipe.predict(inputs)
    print(metrics.classification_report(y_actuals, y_preds))
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_actuals, y_preds)))

print_report(pipe)

              precision    recall  f1-score   support

   brazilian       0.72      0.59      0.65        93
     british       0.60      0.46      0.52       161
cajun_creole       0.78      0.69      0.73       309
     chinese       0.81      0.85      0.83       535
    filipino       0.71      0.56      0.62       151
      french       0.57      0.61      0.59       529
       greek       0.76      0.66      0.70       235
      indian       0.85      0.91      0.88       601
       irish       0.67      0.42      0.52       133
     italian       0.80      0.89      0.84      1568
    jamaican       0.85      0.78      0.81       105
    japanese       0.79      0.70      0.74       284
      korean       0.81      0.75      0.77       166
     mexican       0.88      0.92      0.90      1288
    moroccan       0.83      0.68      0.75       164
     russian       0.69      0.39      0.50        98
 southern_us       0.70      0.79      0.74       864
     spanish       0.69    

In [0]:
# Class labels held by the fitted classifier; the feature-weight cell below
# pairs classes_[i] with coef_[i], so this order identifies each coefficient row.
clf.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'], dtype=object)

### Feature Weights

In [0]:
# Vocabulary terms, one per coefficient column. Fetched once because the list
# does not change between classes. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old method on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = list(vec.get_feature_names_out())
else:
    feature_names = vec.get_feature_names()

# For every class, print the ten tokens with the largest and the ten with the
# smallest coefficients.
for i, tag in enumerate(clf.classes_):
    coefficients = clf.coef_[i]
    weights = list(zip(feature_names, coefficients))
    print('Tag:',tag)
    print('Most Positive Coefficients:')
    print(sorted(weights,key=lambda x: -x[1])[:10])
    print('Most Negative Coefficients:')
    print(sorted(weights,key=lambda x: x[1])[:10])
    print("--------------------------------------")

Tag: brazilian
Most Positive Coefficients:
[('cachaca', 5.890445814166117), ('manioc', 3.170121795556176), ('tapioca', 2.9905448868968763), ('açai', 2.7192975389243457), ('sprinkles', 2.032979446449359), ('stone', 1.7866716200521304), ('piri', 1.7513929823714602), ('palm', 1.6982284528058103), ('dende', 1.6852285881453315), ('hearts', 1.4282690867488832)]
Most Negative Coefficients:
[('evaporated', -2.070566898395168), ('tortillas', -1.6705276773884858), ('curry', -1.6600785657797308), ('basil', -1.5841917003636632), ('cardamom', -1.5101676507926811), ('kidney', -1.4604257700124788), ('sesame', -1.4534732367875722), ('mushrooms', -1.4476456378316116), ('soy', -1.4083834388947913), ('seasoning', -1.3987080919947739)]
--------------------------------------
Tag: british
Most Positive Coefficients:
[('stilton', 4.8500075149848465), ('mincemeat', 3.0491621535293727), ('marmite', 2.4396943896753878), ('haddock', 2.3799185893905643), ('drippings', 2.321966250789993), ('cheddar', 2.02784109443

In [0]:
import eli5
# Render eli5's weight tables for the fitted classifier. vec is passed so the
# features are labelled with vocabulary tokens (the output lists e.g. 'cachaca').
# NOTE(review): top=20 presumably caps how many features are listed per class;
# confirm against the eli5 documentation.
eli5.show_weights(clf, vec=vec, top=20)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+5.890,cachaca,,,,,,,,,,,,,,,,,,
+3.170,manioc,,,,,,,,,,,,,,,,,,
+2.991,tapioca,,,,,,,,,,,,,,,,,,
+2.719,açai,,,,,,,,,,,,,,,,,,
+2.033,sprinkles,,,,,,,,,,,,,,,,,,
+1.787,stone,,,,,,,,,,,,,,,,,,
+1.751,piri,,,,,,,,,,,,,,,,,,
+1.698,palm,,,,,,,,,,,,,,,,,,
+1.685,dende,,,,,,,,,,,,,,,,,,
+1.428,hearts,,,,,,,,,,,,,,,,,,

Weight?,Feature
+5.890,cachaca
+3.170,manioc
+2.991,tapioca
+2.719,açai
+2.033,sprinkles
+1.787,stone
+1.751,piri
+1.698,palm
+1.685,dende
+1.428,hearts

Weight?,Feature
+4.850,stilton
+3.049,mincemeat
+2.440,marmite
+2.380,haddock
+2.322,drippings
+2.028,cheddar
+2.026,suet
+2.006,dates
+1.927,malt
+1.921,worcestershire

Weight?,Feature
+3.834,cajun
+3.200,creole
+1.986,crawfish
+1.956,salami
+1.812,jambalaya
+1.795,yeast
+1.726,evaporated
+1.623,boudin
+1.591,powdered
… 507 more positive …,… 507 more positive …

Weight?,Feature
+3.112,szechwan
+2.678,mandarin
+2.318,mein
+2.298,custard
+2.295,fermented
+1.949,yardlong
+1.880,hoisin
+1.830,cornflour
+1.806,wonton
+1.746,taro

Weight?,Feature
+3.804,calamansi
+2.375,lumpia
+2.325,glutinous
+2.211,dogs
+2.136,tilapia
+1.944,patis
+1.768,edam
+1.756,peppercorns
+1.747,papaya
+1.683,evaporated

Weight?,Feature
+2.730,crepes
+2.698,gruyere
+2.457,swiss
+2.342,gruyère
+2.305,niçoise
+2.198,cognac
+2.039,roquefort
+1.945,burgundy
+1.855,snails
+1.850,duck

Weight?,Feature
+4.100,feta
+3.780,phyllo
+3.302,greek
+2.860,ouzo
+2.746,tahini
+2.335,dill
+2.085,lamb
+1.870,orzo
+1.692,garbanzo
+1.681,bulgur

Weight?,Feature
+3.941,tandoori
+2.835,curds
+2.730,curry
+2.662,masala
+2.625,yogurt
+2.518,yoghurt
+2.301,cardamom
+2.184,naan
+2.068,capsicum
+1.951,basmati

Weight?,Feature
+4.869,irish
+2.946,corned
+2.745,stout
+2.460,brisket
+1.825,potatoes
+1.723,guinness
+1.554,beer
+1.459,croissants
+1.458,lamb
+1.439,steel

Weight?,Feature
+3.474,arborio
+2.964,gnocchi
+2.795,polenta
+2.777,spaghetti
+2.750,pesto
+2.718,marinara
+2.603,breadstick
+2.482,mascarpone
+2.455,marsala
+2.301,fettucine

Weight?,Feature
+5.559,jerk
+3.348,allspice
+3.133,thyme
+2.286,rum
+2.196,plantains
+1.971,bananas
+1.963,nutmeg
+1.735,sorrel
+1.677,habanero
+1.630,jamaican

Weight?,Feature
+4.248,miso
+3.374,soba
+3.321,sake
+3.296,mirin
+3.289,bonito
+2.915,udon
+2.811,teriyaki
+2.808,tonkatsu
+2.798,edamame
+2.773,wasabi

Weight?,Feature
+4.922,kimchi
+2.958,gochujang
+2.574,wings
+2.304,korean
+2.072,sesame
+2.014,cakes
+1.947,syrup
+1.845,pinenuts
+1.744,gochugaru
+1.589,zucchini

Weight?,Feature
+4.035,tortillas
+3.669,taco
+3.393,tequila
+3.074,enchilada
+3.061,mexican
+3.056,tortilla
+2.954,jicama
+2.651,masa
+2.650,chayotes
+2.320,achiote

Weight?,Feature
+3.356,harissa
+3.065,couscous
+2.846,semolina
+2.526,preserved
+2.334,flower
+1.853,apricot
+1.822,tea
+1.747,cumin
+1.706,cinnamon
+1.699,spices

Weight?,Feature
+2.793,beets
+2.422,farmer
+2.301,dillweed
+2.227,cottage
+2.167,gherkins
+2.047,sauerkraut
+2.036,dill
+1.981,rye
+1.966,sour
+1.789,pierogi

Weight?,Feature
+4.356,grits
+3.899,eyed
+2.964,wafers
+2.720,collard
+2.607,peaches
+2.582,vidalia
+2.476,barbecue
+2.392,lima
+2.257,biscuits
+1.984,bags

Weight?,Feature
+2.933,manchego
+2.409,chorizo
+2.236,sherry
+2.074,pimenton
+1.744,cucumber
+1.651,quince
+1.650,codfish
+1.636,cava
+1.589,pimentos
+1.561,spanish

Weight?,Feature
+2.136,sticky
+1.848,chunky
+1.675,tapioca
+1.582,palm
+1.540,teas
+1.496,galangal
+1.481,lemongrass
+1.478,pla
+1.476,jasmine
… 423 more positive …,… 423 more positive …

Weight?,Feature
+3.135,vietnamese
+2.167,maggi
+2.091,tapioca
+2.088,coffee
+1.982,baguette
+1.922,nuoc
+1.776,fish
+1.722,broccolini
+1.650,grass
+1.442,warm
