In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import matplotlib as mpl
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sys import getsizeof

In [2]:
# Relative paths to the recipe JSON files read by the loading cells below.
TRAIN_PATH = './TrainingSet/cooking-recipes-train.json'
TEST_PATH = './TestSet/cooking-recipes-test.json'

In [3]:
# Load the training recipes into a DataFrame and preview the first rows.
train_data = pd.read_json(TRAIN_PATH)
train_data.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
# Load the test recipes into a DataFrame and preview the first rows.
test_data = pd.read_json(TEST_PATH)
test_data.head()

Unnamed: 0,id,ingredients
0,49434,"[chocolate bars, marshmallows, cinnamon graham..."
1,27165,"[fish sauce, chicken broth, sesame oil, green ..."
2,34248,"[soy sauce, chili paste, oil, brown sugar, hon..."
3,26415,"[buttermilk, okra, large eggs, all-purpose flo..."
4,10425,"[tomatoes, coarse salt, water, white onion, se..."


In [5]:
def create_features(data):
    """Return the distinct ingredient names found in ``data.ingredients``.

    Parameters
    ----------
    data : object with an ``ingredients`` attribute
        ``data.ingredients`` must be an iterable whose items are themselves
        iterables of ingredient names (e.g. a DataFrame column of lists).

    Returns
    -------
    list
        Every distinct ingredient name, in ascending sorted order. An empty
        input yields an empty list.

    Notes
    -----
    The result is sorted on purpose. Iterating a ``set`` of strings gives an
    order that can change from one interpreter session to the next, which
    made the feature list (and anything labelled from it) irreproducible.
    ``create_onehot_ingredients`` also lays its columns out in sorted order,
    so a sorted list keeps position ``i`` of the features aligned with
    column ``i`` of the one-hot matrix.
    """
    unique_ingredients = set()
    for recipe_ingredients in data.ingredients:
        unique_ingredients.update(recipe_ingredients)

    return sorted(unique_ingredients)

In [19]:
def create_onehot_ingredients(data, features, dtype=np.float64):
    """Build a dense one-hot (multi-hot) matrix of ingredients per recipe.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an ``ingredients`` column whose cells are iterables of
        ingredient names. The frame's index may hold any labels; rows are
        addressed by position.
    features : iterable
        Ingredient names that define the columns. Column ``j`` of the result
        corresponds to ``sorted(features)[j]``.
    dtype : numpy dtype, optional
        Element type of the returned array. Defaults to ``float64``; a
        narrower type such as ``np.uint8`` cuts memory use substantially.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), len(features))`` holding 1 where the
        recipe contains the ingredient and 0 elsewhere. Ingredients that are
        not in ``features`` are ignored.
    """
    feature_lookup = sorted(features)

    # Map each name to its column once, instead of scanning the list for every
    # ingredient. setdefault keeps the first position of a repeated name, which
    # is what list.index() returned.
    column_of = {}
    for position, name in enumerate(feature_lookup):
        column_of.setdefault(name, position)

    onehot_ingredients = np.zeros((data.shape[0], len(feature_lookup)), dtype=dtype)

    # enumerate() gives the positional row number; the DataFrame index label is
    # not a safe array index once the frame has been filtered or shuffled.
    for row_position, recipe_ingredients in enumerate(data['ingredients']):
        for ingredient in recipe_ingredients:
            column = column_of.get(ingredient)
            if column is not None:
                onehot_ingredients[row_position, column] = 1

    return onehot_ingredients

In [7]:
# The feature vocabulary is built from the training recipes only.
features = create_features(train_data)

In [8]:
# Encode every training recipe as a 0/1 row over the feature vocabulary.
onehot_ingredients = create_onehot_ingredients(train_data, features)

In [9]:
# Target labels (cuisine) as a single-column 2-D array; flattened with ravel() at fit time.
y = train_data.cuisine.values.reshape(-1, 1)

In [10]:
def create_data_frame_features(onehot_ingredients, features):
    """Wrap a one-hot matrix in a DataFrame whose columns are ingredient names.

    ``create_onehot_ingredients`` places the ingredient ``sorted(features)[j]``
    in column ``j`` of its output, so the column labels are taken from the
    sorted feature list. Labelling with ``features`` in its incoming order
    attaches the wrong ingredient name to the columns whenever that list is
    not already sorted.

    Parameters
    ----------
    onehot_ingredients : 2-D array-like
        One row per recipe and one column per feature, with columns laid out
        in sorted feature order.
    features : iterable
        Ingredient names, in any order.

    Returns
    -------
    pandas.DataFrame
        The matrix values with ingredient names as column labels.

    Raises
    ------
    ValueError
        If the number of features differs from the number of matrix columns.
    """
    column_names = sorted(features)

    # Naming the columns in the constructor skips the separate rename() pass
    # over what can be a very large frame.
    return pd.DataFrame(onehot_ingredients, columns=column_names)

In [11]:
# Labelled DataFrame view of the training one-hot matrix.
df_features = create_data_frame_features(onehot_ingredients, features)

In [12]:
# Size in bytes of the dense one-hot array, as reported by sys.getsizeof.
getsizeof(onehot_ingredients)

2002053536

In [13]:
# df_features has already been built from the array, so drop this name.
# The memory is only released once nothing else references the same buffer.
del onehot_ingredients

In [14]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_features, y, test_size=0.3, shuffle=True, random_state=42)


### Removendo variáveis da memória

In [15]:
# Size in bytes of the full feature DataFrame, as reported by sys.getsizeof.
getsizeof(df_features)

2002053576

In [16]:
# The train/test splits were already taken from df_features, so drop this name.
del df_features

### Treinando o modelo


In [17]:
# Seed the forest so the reported accuracy is reproducible on a fresh kernel.
# 42 matches the seed already used for the train/test split.
clf = RandomForestClassifier(random_state=42)

# Train the random forest (use ravel to coerce the column vector to a 1d array)
clf.fit(X_train, y_train.ravel())

# Get predictions for the held-out split
y_pred = clf.predict(X_test)

# Get accuracy for the random forest classifier; y_test is flattened the same
# way as y_train so both label arrays passed to the metric are 1d.
a = accuracy_score(y_test.ravel(), y_pred)
print("Accuracy Score in % : ")
print(a * 100)

Accuracy Score in % : 
70.45695130557516


### Testando o modelo e salvando no CSV

In [21]:
# Encode the test recipes with the vocabulary built from the training set;
# ingredients that are not in that vocabulary are ignored by the encoder.
onehot_ingredients_test = create_onehot_ingredients(test_data, features)

# Give the test matrix the same column labels as the training frame.
df_test = create_data_frame_features(onehot_ingredients_test, features)

# Predict a cuisine for every test recipe with the fitted classifier.
y_pred_test = clf.predict(df_test)
print(y_pred_test)

['southern_us' 'thai' 'chinese' ... 'italian' 'mexican' 'indian']
1988
