# Build Classification Models

In [112]:
import pandas as pd

cuisines_df = pd.read_csv('../day2/cleaned_cuisines.csv')
cuisines_df.head(10)

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,5,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,7,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,8,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,9,indian,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, precision_recall_curve, classification_report
from sklearn.svm import SVC
import numpy as np

In [114]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [115]:
cuisines_features_df =  cuisines_df.drop(["Unnamed: 0",'cuisine'],axis=1)
cuisines_features_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [116]:
# split Data into train and test
X_train, X_test, y_train, y_test = train_test_split(cuisines_features_df,cuisines_label_df)  

In [117]:
# logistic regression with ovr multi_class and liblinear solver
lr = LogisticRegression(multi_class='ovr', solver='lbfgs') # liblinear lbfgs
model =   lr.fit(X_train,np.ravel(y_train))
accuracy = model.score(X_test,y_test)   
print(f"Accuracy is {accuracy}")

Accuracy is 0.7897897897897898


In [118]:
print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')
print(f'cuisine: {y_test.iloc[50]}')

ingredients: Index(['coriander', 'fish', 'lettuce', 'lime_juice', 'shallot'], dtype='object')
cuisine: thai


In [119]:
test = X_test.iloc[50].values.reshape(-1,1).T


In [120]:
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba,columns=classes)
resultdf



Unnamed: 0,chinese,indian,japanese,korean,thai
0,0.007334,0.001619,0.030807,0.005693,0.954548


In [121]:
topPrediction = resultdf.T.sort_values(by=[0], ascending=True)
topPrediction.head()

Unnamed: 0,0
indian,0.001619
korean,0.005693
chinese,0.007334
japanese,0.030807
thai,0.954548


In [122]:
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     chinese       0.76      0.67      0.71       207
      indian       0.92      0.90      0.91       210
    japanese       0.73      0.78      0.76       204
      korean       0.82      0.75      0.78       193
        thai       0.72      0.85      0.78       185

    accuracy                           0.79       999
   macro avg       0.79      0.79      0.79       999
weighted avg       0.79      0.79      0.79       999

