# Build Classification Models

In [35]:
import pandas as pd

# Load the cleaned cuisines table and preview its first five rows.
cuisines_df = pd.read_csv(filepath_or_buffer="../data/cleaned_cuisines.csv")
cuisines_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [36]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Target: the "cuisine" column of the loaded table.
cuisines_label_df = cuisines_df["cuisine"]

# Features: every column except the target and the leftover "Unnamed: 0" column
# (presumably a row index written out with the CSV — confirm against the data prep step).
cuisines_feature_df = cuisines_df.drop(columns=["Unnamed: 0", "cuisine"])
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [37]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.2,
    random_state=0,
)


In [38]:
# One-vs-rest logistic regression using the liblinear solver.
# LogisticRegression's `multi_class` argument is deprecated in recent scikit-learn
# releases, so the one-vs-rest scheme is expressed with OneVsRestClassifier,
# which fits one binary LogisticRegression per class.
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))

# Train: y_train is a 1-D Series taken from the "cuisine" column, so no ravel is needed.
model.fit(X_train, y_train)

# Predict on the held-out split
predictions = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.8135168961201502




In [39]:
# One-vs-rest logistic regression again, this time with the lbfgs solver.
# The deprecated `multi_class='ovr'` argument is replaced by the OneVsRestClassifier
# wrapper, which trains one binary LogisticRegression per class.
lr = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))

# fit() returns the fitted estimator, so `model` is rebound to this lbfgs-based
# classifier; the cells below that use `model` operate on it.
model = lr.fit(X_train, y_train)

# Mean accuracy on the held-out split.
accuracy = model.score(X_test, y_test)
print ("Accuracy is {}".format(accuracy))

Accuracy is 0.8135168961201502




In [40]:
# Look at one held-out row: the names of its non-zero feature columns and its true label.
sample_row = X_test.iloc[130]
nonzero_ingredients = sample_row[sample_row != 0].keys()
print(f'ingredients: {nonzero_ingredients}')
print(f'cuisine: {y_test.iloc[130]}')

ingredients: Index(['bread', 'cayenne', 'cilantro', 'scallion'], dtype='object')
cuisine: thai


In [47]:
# Score a single held-out row and rank the classes by predicted probability.
# NOTE(review): this scores row 11 while the preceding cell inspects row 130 —
# confirm which sample is intended.
# Selecting with a list (iloc[[11]]) yields a one-row DataFrame that keeps the
# column names, so the model (fitted on a DataFrame) receives matching feature
# names instead of a bare array.
test = X_test.iloc[[11]]
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so each class becomes a row, then sort on the single probability column (label 0).
topPrediction = resultdf.T.sort_values(by=0, ascending=False)
topPrediction.head()



Unnamed: 0,0
japanese,0.711946
chinese,0.205535
thai,0.061626
korean,0.016952
indian,0.003942


In [49]:
# Per-class precision, recall and F1 for the current `model` on the held-out split.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     chinese       0.80      0.69      0.74       177
      indian       0.91      0.89      0.90       151
    japanese       0.76      0.80      0.78       164
      korean       0.83      0.81      0.82       149
        thai       0.79      0.89      0.84       158

    accuracy                           0.81       799
   macro avg       0.82      0.82      0.82       799
weighted avg       0.81      0.81      0.81       799

