In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score, log_loss, roc_auc_score
import scipy
import csv
import pickle
#import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
train = pd.read_json('./cuisine.train.v2.json')

In [3]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
#train=train_df
total = train.isnull().sum().sort_values(ascending = False)
percent = (train.isnull().sum()/train.isnull().count()*100).sort_values(ascending = False)
missing_train_data  = pd.concat([total, percent], axis=1, keys=['Total missing', 'Percent missing'])
print("             # of Rows, Columns:",train.shape)
print(missing_train_data.head())

             # of Rows, Columns: (39774, 3)
             Total missing  Percent missing
ingredients              0              0.0
id                       0              0.0
cuisine                  0              0.0


So looks like there is no null value in the train set

## Text Processing



In [5]:
def flatten_json(input_file):
    import json
    import re
    from pandas.io.json import json_normalize
    corpus_file = open(input_file,"r")
    corpus = corpus_file.read()
    entries =  json.loads(corpus)
    df =  json_normalize(entries)
    df['flat_ingredients'] = df.apply(lambda row: ' '.join(ingredient for ingredient in row['ingredients']), axis=1)
    df['word_count'] = df.apply(lambda row: len(row['flat_ingredients'].split(' ')), axis=1)
    df.drop('ingredients', axis=1, inplace=True)   
    df.sort_values(['word_count'], ascending=False, inplace=True)
    return df 

In [6]:
train_flatten=flatten_json('./cuisine.train.v2.json')

In [7]:
train_flatten.head()

Unnamed: 0,cuisine,id,flat_ingredients,word_count
15289,italian,3885,fettucine fresh marjoram minced garlic olive o...,136
30350,brazilian,13430,marshmallows fresh corn cheddar cheese shredde...,107
26103,mexican,13049,vanilla ice cream lime garlic powder zucchini ...,105
10513,mexican,49282,condensed cream of chicken soup pepper refried...,102
6449,southern_us,6548,canned black beans seasoned bread crumbs prepa...,88


## Building the Model

In [9]:
# prepare the input for the model
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
train_transform = tfidf_vect.fit_transform(train_flatten['flat_ingredients'])
le = preprocessing.LabelEncoder()
X = train_transform
y = le.fit_transform(train_flatten['cuisine'])

In [10]:
# save the vectorizer for future use

pickle.dump(tfidf_vect, open("vectorizer.pickle", "wb"))

In [11]:
# split the train set to train and valid

#X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [12]:
rf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)

In [14]:
from sklearn.metrics import f1_score, classification_report, accuracy_score

#print('training score:', f1_score(y_train, rf.predict(X_train), average='macro'))
#print('validation score:', f1_score(y_valid, rf.predict(X_valid), average='macro'))
#print(classification_report(y_valid, xgb_prediction))

rf.fit(X, y)
#rf.fit(X_train, y_train)
#predicted_labels = rf.predict(X_valid)
#print ("FINISHED classifying. accuracy score : ")
#print (accuracy_score(y_valid, predicted_labels))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
# save the model for future use
filename = 'finalized_model.pkl'
pickle.dump(rf, open(filename, 'wb'))


In [27]:
test = flatten_json('./test2.json')
test.head()

Unnamed: 0,id,flat_ingredients,word_count
1,28583,sugar egg yolks corn starch cream of tartar ba...,20
0,18009,baking powder eggs all-purpose flour raisins m...,9


In [28]:
vect = pickle.load(open("vectorizer.pickle", "rb"))
test_transform = vect.transform(test['flat_ingredients'].values)
X_test = test_transform
loaded_model = pickle.load(open(filename, 'rb'))
predicted_labels = loaded_model.predict(X_test)
predicted_proba = loaded_model.predict_proba(X_test)
print(predicted_labels)
print(predicted_proba)
#print(predicted_proba[0])

[9 9]
[[0.01256878 0.02309934 0.03749478 0.07811859 0.01927727 0.07127609
  0.02632433 0.06620387 0.01780258 0.17846839 0.012995   0.03381639
  0.02042203 0.16786254 0.01720543 0.01338107 0.12414803 0.02345183
  0.03641069 0.01967298]
 [0.01270265 0.02515649 0.03949825 0.06030703 0.01915204 0.07322007
  0.0290996  0.0718222  0.02037151 0.19187031 0.01408462 0.03239637
  0.01921427 0.14901947 0.01883097 0.01442608 0.12805732 0.02484447
  0.03656362 0.01936266]]


In [17]:
# create a mapping and save in a csv
hashMap = {}
for item in y:
    if item in hashMap:
        continue
    else:
        index = item
        hashMap[item] = train_flatten['cuisine'].values[index]

print (hashMap)

# save the map in a csv file

with open('cuisine_dict.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in hashMap.items():
       writer.writerow([key, value])

{9: 'italian', 0: 'italian', 13: 'mexican', 16: 'mexican', 18: 'cajun_creole', 4: 'southern_us', 3: 'mexican', 17: 'mexican', 7: 'filipino', 2: 'mexican', 14: 'brazilian', 10: 'spanish', 1: 'brazilian', 19: 'mexican', 11: 'indian', 5: 'thai', 6: 'italian', 12: 'indian', 8: 'chinese', 15: 'mexican'}
