In [None]:
# import statements
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
import re
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# Load the recipes and, in the same expression, derive `all_ingredients`:
# each row's `ingredients` entry is passed to ";".join, giving one
# semicolon-separated string per recipe.
# (assumes every `ingredients` entry is a list of strings -- TODO confirm against train.json)
df_train = pd.read_json('train.json').assign(
    all_ingredients=lambda frame: frame['ingredients'].map(";".join)
)

In [4]:
def lowercase(ingredient):
    """Return a new list with every entry of `ingredient` lower-cased.

    `ingredient` is any iterable of strings; the input is not modified.
    """
    lowered = []
    for text in ingredient:
        lowered.append(text.lower())
    return lowered

def remove_digits(ingredient):
    """Strip every run of digits from each string in `ingredient`.

    Args:
        ingredient: iterable of strings.

    Returns:
        A new list of strings with all digit runs deleted. Surrounding
        whitespace is left as-is (e.g. "7 up" becomes " up").
    """
    # Raw string literal: a plain "\d" is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on current Python versions.
    return [re.sub(r"\d+", "", x) for x in ingredient]

def remove_special_characters(ingredient):
    """Replace punctuation and symbol characters with a single space each.

    Args:
        ingredient: iterable of strings.

    Returns:
        A new list of strings in which every occurrence of one of the
        characters below has been replaced by one space. Runs of spaces are
        not collapsed here.
    """
    special_characters = (
        "-&'%!()/?,."
        # The notebook joins a recipe's ingredients with ';'. If it is left in
        # place, whitespace-based steps see fused tokens such as "eggs;flour"
        # at every ingredient boundary.
        ";"
        "\u2122"  # trademark sign
        "\u00AE"  # registered sign
        "\u2019"  # right single quotation mark
    )
    # One translation pass per string instead of one list pass per character.
    # The former separate replacement of "''" was a no-op because every single
    # quote had already been replaced.
    to_space = str.maketrans({char: " " for char in special_characters})
    return [x.translate(to_space) for x in ingredient]

def remove_letters(ingredient):
    """Drop stand-alone unit/letter tokens from each ingredient string.

    Every string is split on whitespace; tokens equal (case-insensitively)
    to 'g', 'lb', 's' or 'n' are discarded and the remaining tokens are
    re-joined with single spaces.
    """
    letters = ('g', 'lb', 's', 'n')
    cleaned = []
    for text in ingredient:
        kept = filter(lambda token: token.lower() not in letters, text.split())
        cleaned.append(' '.join(kept))
    return cleaned

def lemmatization(ingredient):
    """Lemmatize every whitespace-separated token of each ingredient string.

    A single WordNetLemmatizer is created per call; each string is split on
    whitespace, every token is passed through `lemmatize`, and the results
    are re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for text in ingredient:
        lemmas = [lemmatizer.lemmatize(token) for token in text.split()]
        lemmatized.append(" ".join(lemmas))
    return lemmatized

In [5]:
# Apply the cleaning steps to the `all_ingredients` column, in this order.
# Each step takes the current column and returns a list of cleaned strings,
# which is written back before the next step runs.
cleaning_steps = (
    lowercase,
    remove_digits,
    remove_special_characters,
    remove_letters,
    lemmatization,
)
for clean_step in cleaning_steps:
    df_train['all_ingredients'] = clean_step(df_train['all_ingredients'])

In [6]:
# Bag-of-words encoding of the cleaned ingredient strings (one row per recipe).
cv = CountVectorizer()
X = cv.fit_transform(df_train['all_ingredients'].values)

# Encode the cuisine names as integers; le.classes_ maps each integer back to its name.
le = LabelEncoder()
y = le.fit_transform(df_train.cuisine)

# Hold out 20% of the recipes for evaluation. The seed is fixed so the split,
# and therefore every metric reported below, is the same on each
# Restart & Run All.
# NOTE(review): consider stratify=y if the cuisine classes are imbalanced -- confirm
# that every class has at least two recipes before enabling it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is set to 10000 for the lbfgs solver (presumably so the optimizer
# has room to converge on this feature matrix -- TODO confirm).
logistic = LogisticRegression(max_iter=10000, solver='lbfgs')
logistic.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted model on the held-out test split.
logistic.score(X=X_test, y=y_test)

In [None]:
# confusion matrix (each row normalized by the number of true samples of that class)
plt.figure(figsize=(10, 10))

# y holds LabelEncoder integers, so row/column i of the matrix is le.classes_[i].
# Labelling the axes in value_counts() (frequency) order would attach the
# wrong cuisine name to almost every row and column.
cuisines = le.classes_
tick_marks = np.arange(len(cuisines))

# Pin the matrix to the full class list so its shape always matches the tick labels,
# even if some class is missing from this particular test split.
cm = confusion_matrix(y_test, logistic.predict(X_test), labels=tick_marks)
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1  # a class with no true samples stays an all-zero row instead of NaN
cm_normalized = cm.astype('float') / row_totals

plt.imshow(cm_normalized, interpolation='nearest')
plt.title("confusion matrix")
plt.colorbar(shrink=0.3)
plt.xticks(tick_marks, cuisines, rotation=90)
plt.yticks(tick_marks, cuisines)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Run the layout pass last so the axis labels are taken into account.
plt.tight_layout()

In [None]:
# classification report (per-cuisine precision / recall / F1 on the test split)
y_pred = logistic.predict(X_test)
# target_names must follow the encoded-label order, i.e. le.classes_; a
# frequency-ordered list of cuisines would print each row under the wrong name.
# Passing `labels` explicitly keeps the report aligned with the names even if
# a class does not occur in this test split.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))