# Cooking Recipe Detection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import matplotlib as mpl
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sys import getsizeof
import json

### Carregando os dados de treinamento e teste

In [2]:
# Locations of the labelled training recipes and the unlabelled test recipes,
# relative to the directory the notebook is run from.
TRAIN_PATH = './TrainingSet/cooking-recipes-train.json'
TEST_PATH = './TestSet/cooking-recipes-test.json'

In [3]:
# Parse the training JSON into a DataFrame and preview its first rows.
train_data = pd.read_json(TRAIN_PATH)
train_data.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
# Parse the test JSON into a DataFrame and preview its first rows.
test_data = pd.read_json(TEST_PATH)
test_data.head()

Unnamed: 0,id,ingredients
0,49434,"[chocolate bars, marshmallows, cinnamon graham..."
1,27165,"[fish sauce, chicken broth, sesame oil, green ..."
2,34248,"[soy sauce, chili paste, oil, brown sugar, hon..."
3,26415,"[buttermilk, okra, large eggs, all-purpose flo..."
4,10425,"[tomatoes, coarse salt, water, white onion, se..."


### Funções
Listando todos os ingredientes

In [5]:
def create_features(data):
    """Collect the vocabulary of distinct ingredients found in ``data``.

    Parameters
    ----------
    data : object exposing an ``ingredients`` attribute (e.g. a DataFrame)
        Iterating ``data.ingredients`` must yield one iterable of ingredient
        names per recipe.

    Returns
    -------
    list
        Every distinct ingredient exactly once, in ascending sorted order.
    """
    unique_ingredients = set()
    for recipe_ingredients in data.ingredients:
        unique_ingredients.update(recipe_ingredients)

    # A bare list(set(...)) has an arbitrary order that can change between
    # interpreter runs. Sorting makes the vocabulary reproducible and makes
    # position i of this list line up with column i of the matrix built by
    # create_onehot_ingredients, which orders its columns by sorted(features).
    return sorted(unique_ingredients)

Verificando quais ingredientes estão em cada receita

In [6]:
def create_onehot_ingredients(data, features, dtype=np.float64):
    """Build a dense recipe-by-ingredient indicator matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an ``ingredients`` column holding, for every recipe, an
        iterable of hashable ingredient names.
    features : iterable
        Ingredient vocabulary. Columns of the result follow
        ``sorted(features)``, not the order in which ``features`` is given.
    dtype : numpy dtype, optional
        Element type of the returned matrix. Defaults to ``float64``, which is
        what ``np.zeros`` produced before this parameter existed; a one-byte
        type such as ``np.uint8`` needs one eighth of the memory.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(data.shape[0], len(features))`` whose entry
        ``[r, c]`` is 1 when the r-th recipe (by position) contains
        ``sorted(features)[c]`` and 0 otherwise. Ingredients that are not in
        ``features`` are ignored.
    """
    feature_lookup = sorted(features)
    onehot_ingredients = np.zeros((data.shape[0], len(feature_lookup)), dtype=dtype)

    # Map each name to its column once, instead of scanning the sorted list
    # twice (membership test + .index) for every ingredient of every recipe.
    # setdefault keeps the first column for a repeated name, as list.index did.
    column_of = {}
    for column, name in enumerate(feature_lookup):
        column_of.setdefault(name, column)

    # Rows are addressed by position rather than by index label, so frames
    # whose index is not 0..n-1 (filtered, shuffled, re-indexed) are handled.
    for position, recipe in enumerate(data['ingredients']):
        for ingredient in recipe:
            column = column_of.get(ingredient)
            if column is not None:
                onehot_ingredients[position, column] = 1

    return onehot_ingredients

Criando o quadro com as features

In [7]:
def create_data_frame_features(onehot_ingredients, features):
    """Wrap a one-hot matrix in a DataFrame with ingredient-named columns.

    Parameters
    ----------
    onehot_ingredients : array-like, 2-D
        Indicator matrix whose columns follow ``sorted(features)``, which is
        the layout produced by ``create_onehot_ingredients``.
    features : iterable
        Ingredient vocabulary, in any order.

    Returns
    -------
    pandas.DataFrame
        Same values as ``onehot_ingredients``; column ``i`` is named
        ``sorted(features)[i]``.
    """
    df_features = pd.DataFrame(onehot_ingredients)

    # The matrix columns are laid out in sorted(features) order, so the names
    # must come from that same ordering. Pairing column i with features[i]
    # attaches the wrong ingredient to a column whenever the caller's list is
    # not already sorted.
    ordered_names = sorted(features)
    column_names = {
        df_features.columns[position]: name
        for position, name in enumerate(ordered_names)
    }

    return df_features.rename(columns=column_names)

### Pré-processamento
Processando o X e definindo o Y


In [8]:
# Ingredient vocabulary, taken from the training recipes only.
features = create_features(train_data)

Pode levar alguns minutos.

In [9]:
# One row per training recipe, one column per ingredient in the vocabulary.
onehot_ingredients = create_onehot_ingredients(train_data, features)

In [10]:
# Target labels as a single-column 2-D array; flattened again with .ravel()
# when the classifier is fitted.
y = train_data.cuisine.values.reshape(-1, 1)

Pode levar alguns minutos.

In [11]:
# Wrap the one-hot matrix in a DataFrame whose columns carry ingredient names.
df_features = create_data_frame_features(onehot_ingredients, features)

Em sistemas com até 8 GB de RAM, é necessário remover as variáveis que não serão mais utilizadas.

In [12]:
# Show the size in bytes reported by sys.getsizeof for the dense one-hot
# matrix, then drop this name so the array is no longer referenced from here.
print(getsizeof(onehot_ingredients))
del onehot_ingredients

2002053536


### Separando dados de treino e teste

In [13]:
# Hold out 30% of the labelled recipes for validation; the fixed random_state
# makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_features, y, test_size=0.3, shuffle=True, random_state=42)


In [14]:
# The split results are what is used from here on; show the full frame's
# reported size in bytes and drop the name.
print(getsizeof(df_features))
del df_features

2002053576


### Treinando o modelo
O treinamento pode levar alguns minutos.

In [15]:
# Seed the forest so the fitted model, and therefore the accuracy printed
# below, is the same on every Restart & Run All. The value matches the seed
# already used for the train/validation split.
clf = RandomForestClassifier(random_state=42)
# y_train is a single-column 2-D array; fit() expects a flat label vector.
clf.fit(X_train, y_train.ravel())

# Score the model on the held-out 30% of the labelled data.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score in % : ")
print(accuracy * 100)

Accuracy Score in % : 
70.39520112914607


### Testando o modelo com os dados de teste


In [16]:
# Encode the unlabelled test recipes against the vocabulary built from the
# training data; ingredients not present in that vocabulary are ignored.
onehot_ingredients_test = create_onehot_ingredients(test_data, features)

# Same column naming as the training frame.
df_test = create_data_frame_features(onehot_ingredients_test, features)

# Predict a cuisine for every test recipe.
cuisines_pred_test = clf.predict(df_test)
print(cuisines_pred_test)

['italian' 'chinese' 'chinese' ... 'italian' 'mexican' 'indian']


### Salvando o resultado
O resultado final está no arquivo `result.json`

In [17]:
# Attach the predicted cuisine to each test recipe (adds a column to
# test_data in place) and preview the result.
test_data['cuisine'] = cuisines_pred_test
test_data.head()

Unnamed: 0,id,ingredients,cuisine
0,49434,"[chocolate bars, marshmallows, cinnamon graham...",italian
1,27165,"[fish sauce, chicken broth, sesame oil, green ...",chinese
2,34248,"[soy sauce, chili paste, oil, brown sugar, hon...",chinese
3,26415,"[buttermilk, okra, large eggs, all-purpose flo...",southern_us
4,10425,"[tomatoes, coarse salt, water, white onion, se...",mexican


In [18]:
# One plain dict per test recipe, with keys in the order they should appear
# in the output file.
result = [
    {
        'id': recipe['id'],
        'cuisine': recipe['cuisine'],
        'ingredients': recipe['ingredients'],
    }
    for _, recipe in test_data.iterrows()
]

# Write the predictions as an indented JSON array, keeping the key order above.
with open('result.json', 'w') as fp:
    json.dump(result, fp, indent=4, sort_keys=False)