# Build Classification Models

In [80]:
import pandas as pd

# Load the cleaned cuisines table; the path is relative to the notebook's
# working directory.
cuisines_df = pd.read_csv(filepath_or_buffer="../data/cleaned_cuisines.csv")

# Plain-text preview of the first two rows.
print(cuisines_df.iloc[:2])

   Unnamed: 0 cuisine  almond  angelica  anise  anise_seed  apple  \
0           0  indian       0         0      0           0      0   
1           1  indian       1         0      0           0      0   

   apple_brandy  apricot  armagnac  ...  whiskey  white_bread  white_wine  \
0             0        0         0  ...        0            0           0   
1             0        0         0  ...        0            0           0   

   whole_grain_wheat_flour  wine  wood  yam  yeast  yogurt  zucchini  
0                        0     0     0    0      0       0         0  
1                        0     0     0    0      0       0         0  

[2 rows x 382 columns]


In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
import numpy as np

In [82]:
# Target vector: the 'cuisine' column, one label per row.
cuisines_label_df = cuisines_df.loc[:, 'cuisine']
cuisines_label_df.head(5)

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [83]:
# Feature matrix: every column except the label and the "Unnamed: 0" column
# (which looks like a row index saved into the CSV -- TODO confirm).
non_feature_columns = ["Unnamed: 0", 'cuisine']
cuisines_feature_df = cuisines_df.drop(columns=non_feature_columns)
cuisines_feature_df.head(5)

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [84]:
# Hold out 30% of the rows for evaluation. The seed is fixed so that the split
# -- and therefore the accuracy, the inspected sample at position 50 and the
# classification report -- is reproducible under Restart & Run All.
# NOTE: outputs recorded from an earlier unseeded run will differ until the
# downstream cells are re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.3,
    random_state=42,
)

In [85]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
lr = LogisticRegression()
# Labels are flattened to a 1-D array before fitting.
model = lr.fit(X_train, y_train.to_numpy().ravel())

# Score the fitted model on the held-out split.
accuracy = model.score(X_test, y_test)
print("Accuracy: ", accuracy)

Accuracy:  0.804837364470392


In [86]:
# Inspect one held-out recipe (position 50): list the ingredient columns that
# are non-zero for it, then show its true cuisine label.
sample_row = X_test.iloc[50]
present_ingredients = sample_row[sample_row != 0].keys()
print(f'ingredients: {present_ingredients}')
print(f'cuisine: {y_test.iloc[50]}')

ingredients: Index(['bacon', 'basil', 'black_pepper', 'carrot', 'cognac', 'cream', 'egg',
       'fish', 'olive', 'olive_oil', 'onion', 'pumpkin', 'thyme', 'tomato'],
      dtype='object')
cuisine: korean


In [87]:
# Predict per-class probabilities for the held-out recipe at position 50.
# Indexing with a list (iloc[[50]]) keeps a one-row DataFrame, so the column
# names match the ones the model saw in fit(). Passing a bare ndarray instead
# makes scikit-learn emit "X does not have valid feature names".
test = X_test.iloc[[50]]
proba = model.predict_proba(test)
classes = model.classes_
# One row of probabilities, one column per class label.
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so classes become the index, then rank by probability (column 0),
# highest first.
topPrediction = resultdf.T.sort_values(by=0, ascending=False)
topPrediction.head()



Unnamed: 0,0
korean,0.453438
japanese,0.170566
indian,0.167371
thai,0.13495
chinese,0.073674


In [88]:
# Predict a cuisine for every held-out row, then print the per-class
# precision / recall / f1 summary against the true labels.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

     chinese       0.69      0.69      0.69       218
      indian       0.91      0.92      0.91       238
    japanese       0.76      0.78      0.77       249
      korean       0.87      0.77      0.82       255
        thai       0.79      0.86      0.82       239

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.81      0.80      0.80      1199



In [89]:
# Why