# Cuisine Machine Learning Classification Modeling

### Import necessary packages

In [1]:
import re
import json
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from collections import defaultdict

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



### Load Data into JSON file

In [2]:
#load in json file
f = open('recipes_raw/train.json','r')
recipes_train_txt = f.read()
recipes_train_json = json.loads(recipes_train_txt)


### Data Cleaning and Mining

In [3]:
id_ = []
cuisine = []
ingredients = []
for i in range(len(recipes_train_json)):
    id_.append(recipes_train_json[i]['id'])
    cuisine.append(recipes_train_json[i]['cuisine'])
    ingredients.append(recipes_train_json[i]['ingredients'])

df = pd.DataFrame({'id': id_,
                  'cuisine':cuisine,
                  'ingredients':ingredients})


#### Display Initial Target Variables

In [4]:
df['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

#### Create, clean, and explore new dataframe containing only recipies from the 10 cuisines with highest number of dishes.

In [5]:
top10amt = ['italian','mexican','southern_us','indian','chinese','french','cajun_creole','thai','japanese','greek']
#bottom10amt = ['spanish','korea','vietnamese','moroccan','british','filipino','irish','jamaican','russian','brazilian']


In [6]:
df_t10 = df.copy()[df['cuisine'].isin(top10amt)]
df_t10.head()


Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo..."


In [7]:
#Turn lists of ingredients into strings
new = []
for s in df_t10['ingredients']:
    s = ', '.join(s)
    new.append(s)
df_t10['ing'] = new
df_t10.head(2)


Unnamed: 0,id,cuisine,ingredients,ing
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr..."


In [8]:
#Create column to count the number of ingredients per recipe
df_t10['NumberOitems'] = df_t10.apply(lambda row: len(row['ingredients']),axis=1)
df_t10.NumberOitems.describe()


count    32601.000000
mean        10.739364
std          4.390789
min          1.000000
25%          8.000000
50%         10.000000
75%         13.000000
max         65.000000
Name: NumberOitems, dtype: float64

In [9]:
l = []
for s in df_t10['ing']:
    #remove punctuation
    #s = re.sub(r'[^\w\s]','',s)
    s = re.sub(r'[^\w\s]', lambda m: "," if m.group(0)=="," else "",s)
    #remove digits
    s = re.sub(r'(\d)','',s)
    #remove content inside parenthesis
    s = re.sub(r'\([^)]*\)','',s)
    #remove Brand Name
    s = re.sub(u'\w*\u2122','',s)
    #convert to lowercase
    s = s.lower()   
    #Remove Stop Words
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(s)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    s = ' '.join(filtered_sentence)
    #remove low-content adjectives
    #porter stemmer algorithm
    #ps = PorterStemmer()
    #words = word_tokenize(s)
    #word_ps = []
    #for w in words:
    #    word_ps.append(ps.stem(w))
    #s = ' '.join(word_ps)
    
    l.append(s)
df_t10['ing_mod']=l
df_t10.head(2)

Unnamed: 0,id,cuisine,ingredients,ing,NumberOitems,ing_mod
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,...",9,"romaine lettuce , black olives , grape tomatoe..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr...",11,"plain flour , ground pepper , salt , tomatoes ..."


In [10]:
new_df_t10=df_t10.copy().iloc[:,[0,1,5]]
new_df_t10.columns=['id','cuisine','ingredients']
new_df_t10.head(3)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"romaine lettuce , black olives , grape tomatoe..."
1,25693,southern_us,"plain flour , ground pepper , salt , tomatoes ..."
3,22213,indian,"water , vegetable oil , wheat , salt"


In [11]:
new_df_t10.info()
new_df_t10.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32601 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           32601 non-null  int64 
 1   cuisine      32601 non-null  object
 2   ingredients  32601 non-null  object
dtypes: int64(1), object(2)
memory usage: 1018.8+ KB


Unnamed: 0,id
count,32601.0
mean,24794.239164
std,14346.083121
min,1.0
25%,12370.0
50%,24757.0
75%,37235.0
max,49717.0


In [12]:
#create sorted list containing each ingredient in the data

ingredientset = set()
ingredients_matrix = []

for index, row in new_df_t10.iterrows():
    
    row['ing_for_set']=row['ingredients'].split(', ')
    for ingred in row['ing_for_set']:
        ingredientset.add(ingred.strip())
ingredientset = list(ingredientset)
ingredientset.sort()
ingredientset=ingredientset[1:]


### Prepare Data for Training

In [13]:
listOfIngredients = ingredientset.copy()
d = defaultdict(list)
for ing in listOfIngredients:
    d["{0}".format(ing)] = []
    for row in new_df_t10.itertuples():
        if ing in row.ingredients:
            d[ing].append(1)
        else:
            d[ing].append(0)
data_t10 = pd.DataFrame.from_dict(d,orient='index').transpose()
data_t10.head(3)

Unnamed: 0,abalone,abbamele,absinthe,abura age,accent,accompaniment,achiote,achiote paste,achiote powder,acini di pepe,...,yuzukosho,zaatar,zatarains creole seasoning,zatarains jambalaya mix,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split data

In [14]:
X = data_t10.values
y = new_df_t10['cuisine'].values
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)
X_train,X_test,y_train,y_test=train_test_split(X,y_encoded,test_size=0.3, random_state =369)


#### Synthesize imbalanced data

In [15]:
#Synthesize data
from imblearn.over_sampling import ADASYN
ad = ADASYN(random_state=369)
X_trainAD, y_trainAD = ad.fit_sample(X_train, y_train)
from collections import Counter
print(sorted(Counter(y_trainAD).items()))

[(0, 5395), (1, 5340), (2, 5563), (3, 5398), (4, 5676), (5, 5502), (6, 5447), (7, 5526), (8, 5585), (9, 5456)]


In [16]:
X_trainAD.shape

(54888, 6166)

#### Use PCA for Dimensionality Reduction

In [17]:
pca = PCA(n_components = 700, random_state=369)

X_trainAD = pca.fit_transform(X_trainAD)
X_test = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_

#### Estimated best parameters using Grid Search with Cross-validation

In [18]:
#define parameter range
param_grid = {'C':[2,10],
             'gamma':[0.1],
             'kernel':['rbf']}

grid = GridSearchCV(SVC(random_state=369),param_grid, refit=True, verbose=20)

In [19]:
grid.fit(X_trainAD,y_trainAD)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] C=2, gamma=0.1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=2, gamma=0.1, kernel=rbf, score=0.707, total=16.6min
[CV] C=2, gamma=0.1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 16.6min remaining:    0.0s


[CV] .......... C=2, gamma=0.1, kernel=rbf, score=0.824, total=19.6min
[CV] C=2, gamma=0.1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 36.2min remaining:    0.0s


[CV] .......... C=2, gamma=0.1, kernel=rbf, score=0.824, total=20.9min
[CV] C=2, gamma=0.1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 57.1min remaining:    0.0s


[CV] .......... C=2, gamma=0.1, kernel=rbf, score=0.803, total=20.1min
[CV] C=2, gamma=0.1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 77.1min remaining:    0.0s


[CV] .......... C=2, gamma=0.1, kernel=rbf, score=0.763, total=20.0min
[CV] C=10, gamma=0.1, kernel=rbf .....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 97.1min remaining:    0.0s


[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.704, total=14.4min
[CV] C=10, gamma=0.1, kernel=rbf .....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 111.5min remaining:    0.0s


[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.821, total=17.2min
[CV] C=10, gamma=0.1, kernel=rbf .....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 128.7min remaining:    0.0s


[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.827, total=18.1min
[CV] C=10, gamma=0.1, kernel=rbf .....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 146.8min remaining:    0.0s


[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.808, total=18.0min
[CV] C=10, gamma=0.1, kernel=rbf .....................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 164.8min remaining:    0.0s


[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.768, total=18.2min


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 183.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 183.0min finished


GridSearchCV(estimator=SVC(random_state=369),
             param_grid={'C': [2, 10], 'gamma': [0.1], 'kernel': ['rbf']},
             verbose=20)

#### Found best parameters and estimator of SVC model

In [20]:
#print best parameter after tuning
print(grid.best_params_)

#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)


{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=10, gamma=0.1, random_state=369)


In [21]:
grid_predictions = grid.predict(X_test)

#print classification report
print(classification_report(y_test,grid_predictions))


              precision    recall  f1-score   support

           0       0.69      0.71      0.70       475
           1       0.83      0.86      0.84       783
           2       0.63      0.58      0.60       833
           3       0.73      0.57      0.64       366
           4       0.87      0.89      0.88       886
           5       0.77      0.87      0.82      2336
           6       0.79      0.65      0.71       435
           7       0.93      0.90      0.91      1936
           8       0.79      0.75      0.77      1283
           9       0.85      0.86      0.86       448

    accuracy                           0.81      9781
   macro avg       0.79      0.76      0.77      9781
weighted avg       0.81      0.81      0.80      9781



#### SVC model with C=10,gamma=0.1, and kernel='rbf' resulted in precision of 81%