In [None]:
#Load packages:
import json
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import re
import itertools
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression as Logistic
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from utils import*
from IPython.display import display
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

# Preprocessing and data transformation:

### Load Train and Test Set

In [None]:
# Read the training and test recipes from the JSON files in the working directory.
traindf, testdf = (pd.read_json(path) for path in ("train.json", "test.json"))

### Make dictionaries of ingredients/cuisines and list of ingredients:

In [None]:
# cuis_ingr: cuisine -> flat list of every ingredient occurrence in that cuisine's recipes.
# ingr_list: flat list of every ingredient occurrence in the whole training set.
cuis_ingr = {}
ingr_list = []
for cuisine, recipes in traindf.groupby('cuisine'):
    # Each row of 'ingredients' is itself a list; chain.from_iterable flattens the
    # per-recipe lists into one list. Flatten once and reuse it for both containers.
    flat_ingredients = list(itertools.chain.from_iterable(recipes['ingredients'].values))
    cuis_ingr[cuisine] = flat_ingredients
    ingr_list.extend(flat_ingredients)

#### create list of ingredients to remove:

In [None]:
# Occurrence count of every distinct ingredient in the training set.
unique_ingr = pd.Series(ingr_list).value_counts()
# Names of the ingredients that occur fewer than two times (i.e. exactly once).
ingr_rm = unique_ingr.loc[unique_ingr.lt(2)].index.tolist()

In [None]:
# Summary of the ingredient vocabulary before any filtering.
for label, count in (
    ('Number of Ingredients in Training dataset:', len(ingr_list)),
    ('Number of Unique Ingredients in Training dataset:', len(np.unique(ingr_list))),
    ('Number of Ingredients to be removed from dataset', len(ingr_rm)),
):
    print(label, count)

## Preprocessing

### Clean train dataset and create strings of ingredients:
When using the `preprocess_ing` function, we can pass `True` if we want the one_word routine to be used.

In [None]:
# Flag passed as the second argument to preprocess_ing for both the train and test sets.
# NOTE(review): the exact effect of the "one_word" routine lives in utils — confirm there.
one_word = True

In [None]:
# Recipe by recipe, drop the ingredients listed in ingr_rm (those seen only once).
traindf['ingredients_rm'] = traindf['ingredients'].apply(
    lambda ingredients: remove_ing(ingredients, ingr_rm)
)
# Clean each filtered ingredient list and join it into a single string.
traindf['ingredients_string'] = traindf['ingredients_rm'].apply(
    lambda ingredients: preprocess_ing(ingredients, one_word)
)

### Clean test dataset:

In [None]:
# Apply the same two steps as for the training set: filter with ingr_rm, then
# build the ingredient string with preprocess_ing.
testdf['ingredients_rm'] = testdf['ingredients'].apply(
    lambda ingredients: remove_ing(ingredients, ingr_rm)
)
testdf['ingredients_string'] = testdf['ingredients_rm'].apply(
    lambda ingredients: preprocess_ing(ingredients, one_word)
)

### Check Results of preprocessing routine:

In [None]:
# change setting for pd column width
# Show full cell contents when displaying DataFrames (no truncation).
# None is the supported "unlimited" value; the old -1 sentinel is deprecated
# and rejected by recent pandas versions.
pd.set_option('display.max_colwidth', None)

In [None]:
# Rows labelled 0..2 (inclusive) and every column from position 2 onwards.
# .ix was removed in pandas 1.0; .loc with explicit column labels keeps the same
# label-based row / position-based column selection.
traindf.loc[:2, traindf.columns[2:]]

In [None]:
# Rows labelled 95..100 (inclusive), columns at positions 2 and 4.
# .ix was removed in pandas 1.0; use .loc with the column labels looked up by position.
traindf.loc[95:100, traindf.columns[[2, 4]]]

### Create corpus of ingredients for train and test and vectorize them:

In [None]:
# One ingredient string per recipe, for the train and test sets respectively.
corpus_train, corpus_test = (
    frame['ingredients_string'] for frame in (traindf, testdf)
)

In [None]:
# TF-IDF over single word tokens (unigrams only): English stop words are dropped
# and terms present in more than half of the documents are ignored.
vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.5,
)

# Labels for the training recipes.
y_train = traindf['cuisine']
# Fit the vocabulary and IDF weights on the training corpus and transform it.
X_train = vectorizer.fit_transform(corpus_train)

# The test corpus is only transformed with the vocabulary fitted on the training data.
X_test = vectorizer.transform(corpus_test)

In [None]:
# Check shape of input matrix obtained:
print('Train', X_train.shape)
print('Test', X_test.shape)

# Classification algorithms:

### Create stratified k-fold method

In [None]:
# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed seed
# so every model evaluated with it sees the same splits.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic Classifier:

use one_word preprocessing routine for best performance with Logistic classifier

In [None]:
# Call classifier:
# L2-regularised logistic regression (liblinear solver, C=5) with class weights
# balanced to compensate for uneven cuisine frequencies; seeded for reproducibility.
lc = Logistic(solver='liblinear', C=5, penalty='l2', class_weight = 'balanced', random_state=42)

In [None]:
%%time
# cross validation score for the classifier selected:
logis_score= cross_val_score(lc, X_train, y_train, scoring='f1_weighted', 
                        cv=k_fold, n_jobs=-1)

print("Accuracy: %0.4f (+/- %0.4f)" % (logis_score.mean(), logis_score.std() * 2))

#### Example of small Randomized search for the optimal parameters:

In [None]:
# Search space for the logistic classifier. It is split into one dict per solver
# because lbfgs only supports the l2 penalty: a single flat dict would let the
# search draw the invalid combination solver='lbfgs' + penalty='l1'.
# (A list of dicts for param_distributions requires scikit-learn >= 0.22.)
c_values = np.linspace(1, 100)
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_values},
]

# 20 random draws from the space, each scored with 5-fold CV; seeded like the rest
# of the notebook so the search is reproducible.
lc_rnd = RandomizedSearchCV(lc, param_grid, cv=5, n_iter=20, random_state=42)
lc_rnd.fit(X_train, y_train)

print('Best Score:', lc_rnd.best_score_)
print('Best Parameters:', lc_rnd.best_params_)

Use the best parameters to train the algorithm, predict the test set, and write the CSV file for the Kaggle submission.
If the randomized search was not performed, the classifier defined above can be used.

In [None]:
# Pick the estimator to submit with.
# After a hyperparameter search, use its refitted best estimator instead:
#lc_model = lc_rnd.best_estimator_
# Otherwise use the classifier configured above:
lc_model = lc

# Train on the full training matrix.
lc_model.fit(X_train, y_train)

# Predict a cuisine for every test recipe.
prediction_logis = lc_model.predict(X_test)

# Submission file: one row per test recipe with its id and predicted cuisine.
sub_logis = testdf[['id']].assign(cuisine=prediction_logis)
sub_logis.to_csv("submission_logistic_TESTSSS.csv", index=False)

## Naive Bayes:

use one_word preprocessing routine for best performance with NB classifier

In [None]:
# Call classifier:
# Multinomial Naive Bayes with additive smoothing alpha=0.103.
# NOTE(review): alpha presumably comes from an earlier parameter search — confirm.
nb = MultinomialNB(alpha=0.103)

In [None]:
# Cross-validated accuracy of the Naive Bayes classifier on the shared stratified folds.
nb_score = cross_val_score(
    estimator=nb,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=k_fold,
    n_jobs=-1,
)

# Mean accuracy with a +/- two-standard-deviation band across folds.
print("Accuracy: %0.4f (+/- %0.4f)" % (nb_score.mean(), 2 * nb_score.std()))

#### Example of small Randomized search for the optimal parameters:

In [None]:
# Candidate smoothing values: evenly spaced points between 0.001 and 1.
param_grid = {'alpha': np.linspace(0.001, 1)}

# 20 random draws scored with 5-fold CV; seeded like the rest of the notebook
# so the search is reproducible.
nb_rnd = RandomizedSearchCV(nb, param_grid, cv=5, n_iter=20, random_state=42)
nb_rnd.fit(X_train, y_train)

# Labelled the same way as the logistic-regression search output.
print('Best Score:', nb_rnd.best_score_)
print('Best Parameters:', nb_rnd.best_params_)

Use the best parameters to train the algorithm, predict the test set, and write the CSV file for the Kaggle submission.
If the randomized search was not performed, the classifier defined above can be used.

In [None]:
# Pick the estimator to submit with.
# After a hyperparameter search, use its refitted best estimator instead:
#nb_model = nb_rnd.best_estimator_
# Otherwise use the classifier configured above:
nb_model = nb

# Train on the full training matrix.
nb_model.fit(X_train, y_train)

# Predict a cuisine for every test recipe.
nb_predictions = nb_model.predict(X_test)

# Submission file: one row per test recipe with its id and predicted cuisine.
sub_nb = testdf[['id']].assign(cuisine=nb_predictions)
sub_nb.to_csv("submission_multiNB_1.csv", index=False)

## SVC: 

In [None]:
# create classifier:
svc_model = SVC(C=15,gamma=1, kernel='rbf', class_weight= 'balanced',
                decision_function_shape='ovr', random_state=42)

In [None]:
%%time
# cross validation score for parameters selected:
svc_score= cross_val_score(svc_model, X_train, y_train, scoring='f1_weighted', 
                        cv=k_fold, n_jobs=-1)

print("Accuracy: %0.4f (+/- %0.4f)" % (svc_score.mean(), svc_score.std() * 2))

In [None]:
# Train the SVC on the full training matrix.
svc_model.fit(X_train, y_train)

# Predict a cuisine for every test recipe.
prediction_svc = svc_model.predict(X_test)

# Submission file: one row per test recipe with its id and predicted cuisine.
sub_svc = testdf[['id']].assign(cuisine=prediction_svc)
sub_svc.to_csv("submission_svc_c15_g1.csv", index=False)

## Random Forest:

In [None]:
# create classifier:
forest = RandomForestClassifier(criterion='gini',n_estimators = 1000,max_features= 'auto',
                                         class_weight='balanced', random_state=42)

In [None]:
# Evaluate the random forest with the same shuffled stratified folds (k_fold) used for
# the other classifiers, so the scores are comparable across models.
score_forest = cross_val_score(forest, X_train, y_train, scoring='f1_weighted',
                               cv=k_fold, n_jobs=-1)

# The metric is weighted F1 (see `scoring` above), so label it as such rather than
# as accuracy. The interval shown is +/- two standard deviations across folds.
print("Weighted F1: %0.4f (+/- %0.4f)" % (score_forest.mean(), score_forest.std() * 2))

In [None]:
# Train the forest on the full training matrix.
forest.fit(X_train, y_train)

# Predict a cuisine for every test recipe. The test feature matrix is X_test;
# the previously referenced `test_vector` is never defined in this notebook.
prediction_forest = forest.predict(X_test)

# Submission file: one row per test recipe with its id and predicted cuisine.
sub_forest = testdf[['id']].copy()
sub_forest['cuisine'] = prediction_forest
sub_forest.to_csv("submission_forest.csv", index=False)

## Creating Confusion Matrix:

Before creating a confusion matrix it is necessary to restart the kernel, do the preprocessing and finally set the algorithm we want to use for the task in the cells above.
After that, we can split the training data set into a training set and a validation set, train the model, and use the validation set to create a more realistic confusion matrix.

In [None]:
# split train set in train and validation
x_tr, x_val, y_tr, y_val=tts(X_train, y_train, test_size=0.15, random_state=42)

# declare the model we want to evaluate, for example:
model = lc #could be svc_model, nb

# train the model:
model.fit(x_tr, y_tr)

In [None]:
# create the confusion matrix and the normalized version:
cm = confusion_matrix(y_val, model.predict(x_val))
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# create list of cuisines to label axis
cuisines = traindf['cuisine'].value_counts().index.tolist()
xt=yt=sorted(cuisines)

# plot the figure and save it:
plt.figure(figsize=(12,10))
with sns.axes_style("white"):
    ax = sns.heatmap(cm_normalized, square=True,xticklabels=xt, yticklabels=yt,
                     cmap='YlGnBu', annot=True, fmt='.2f',linewidths=.5)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
#ax.figure.savefig("output.png")