In [428]:
import pandas as pd
cuisines_df = pd.read_csv("../data/cleaned_cuisines_og.csv")
cuisines_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/cleaned_cuisines_og.csv'

In [358]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

In [360]:
# Turn cuisines into label dataframe (y)
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [362]:
# Drop "Unnamed: 0" and "cuisine" column. Rest are trainable features (X)
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Choose a Classifier

Looking for multiclass classification. 

Options:
- Multiclass Logistic Regression = Fast trainng times, linear model
- Muliticlass Neural Network = Accuracy, long training times
- Multiclass Decision Forest = Accuracy, fast training times
- One-vs-All Multiclass = Depends on two-class classifier
- Multiclass Boosted Decision tree = Non-parametric, fast training times and scalable


Reasoning:
- Neural networks too heavy
- No two-class classifier
- No need for ranking
- -> Try decision tree or multiclass logistic regression

In [366]:
# Split into train and test groups
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [368]:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_train.shape}')
print(f'y_train shape: {X_train.shape}')
print(f'y_test shape: {X_train.shape}')

X_train shape: (2796, 383)
X_test shape: (2796, 383)
y_train shape: (2796, 383)
y_test shape: (2796, 383)


In [370]:
X_train.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3293,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [372]:
y_train.head()

674     japanese
831      chinese
1457      korean
2991      indian
3293    japanese
Name: cuisine, dtype: object

In [374]:
# Apply logistic regression 
# (with multiclass one-vs-rest scheme and liblinear solver)
lr = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model = lr.fit(X_train, np.ravel(y_train))

accuracy = model.score(X_test, y_test)
print ("Accuracy is {}".format(accuracy))

Accuracy is 0.8198498748957465


In [396]:
# Predict one random row of data

random_idx = np.random.randint(0, len(X_test))
ingredients = X_test.iloc[random_idx][X_test.iloc[random_idx] != 0].index.tolist()
print(f'Ingredients for Row {random_idx}: {ingredients}')

pred = model.predict(X_test.iloc[[random_idx]])[0]
print(f'\nPredicted cuisine: {pred}')

print(f'True cuisine: {y_test.iloc[random_idx]}')

Ingredients for Row 528: ['bell_pepper', 'cayenne', 'frankfurter', 'onion', 'rice', 'scallion', 'soy_sauce', 'soybean']

Predicted cuisine: korean
True cuisine: korean


In [398]:
# Check accuracy of prediction with probabilities for each cuisine

test = X_test.iloc[[random_idx]]
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()

Unnamed: 0,0
korean,0.684051
chinese,0.166692
japanese,0.083633
thai,0.046794
indian,0.018831


In [400]:
# Print classification report for more detail
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     chinese       0.74      0.72      0.73       223
      indian       0.90      0.93      0.92       241
    japanese       0.74      0.85      0.79       241
      korean       0.84      0.81      0.82       244
        thai       0.88      0.79      0.83       250

    accuracy                           0.82      1199
   macro avg       0.82      0.82      0.82      1199
weighted avg       0.82      0.82      0.82      1199



# Try Other Classifers

In [403]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve

In [405]:
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100, algorithm='SAMME')
}

In [407]:
# Train model with Linear SVC, K-Neighbors, SVC, Random Forest, and AdaBoost
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 80.3% 
              precision    recall  f1-score   support

     chinese       0.69      0.72      0.70       223
      indian       0.87      0.91      0.89       241
    japanese       0.78      0.81      0.80       241
      korean       0.84      0.75      0.79       244
        thai       0.82      0.82      0.82       250

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199

Accuracy (train) for KNN classifier: 74.2% 
              precision    recall  f1-score   support

     chinese       0.61      0.70      0.65       223
      indian       0.86      0.78      0.82       241
    japanese       0.65      0.84      0.73       241
      korean       0.91      0.58      0.71       244
        thai       0.78      0.80      0.79       250

    accuracy                           0.74      1199
   macro avg       0.76      0.74      0.74    

### Random forest seems to do the best, followed by SVC