In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd

from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV

In [2]:
# Shared NumPy random state (seed 33); it is passed as `random_state` to
# train_test_split further down.
rng = np.random.RandomState(33)

In [3]:
# Load the training and test recipes from JSON files in the working directory.
# Columns used later in this notebook: `ingredients` (a list of strings per
# recipe), `id`, and -- for the training frame -- `cuisine`.
# (The former mid-cell `train_df.head()` was dropped: only the last expression
# of a cell is displayed, so it had no effect.)
train_df = pd.read_json("train.json")
test_df = pd.read_json("test.json")

In [4]:
# Flatten each recipe's ingredient list into one comma-separated string,
# then preview the first rows.
train_df['raw_text'] = train_df['ingredients'].map(", ".join)
train_df['raw_text'].head()

0    romaine lettuce, black olives, grape tomatoes,...
1    plain flour, ground pepper, salt, tomatoes, gr...
2    eggs, pepper, salt, mayonaise, cooking oil, gr...
3                    water, vegetable oil, wheat, salt
4    black pepper, shallots, cornflour, cayenne pep...
Name: raw_text, dtype: object

In [5]:
import nltk
import re
from nltk.stem import WordNetLemmatizer

In [6]:
# Shared WordNet lemmatizer instance. Despite the name `stemmer`, it
# lemmatizes rather than stems; tokenize() passes it to stem_tokens().
stemmer = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.lemmatize`` applied to every token, in order.

    tokens  -- iterable of word strings
    stemmer -- any object exposing ``lemmatize(word)``
    """
    return [stemmer.lemmatize(token) for token in tokens]

def tokenize(text):
    """Split ``text`` into lemmatized word tokens.

    Every character outside A-Z / a-z is replaced by a space and runs of
    spaces are collapsed; the result is tokenized with
    ``nltk.word_tokenize`` and each token is passed through the
    module-level ``stemmer`` via ``stem_tokens``.
    """
    letters_only = re.sub('[^A-Za-z]', ' ', text)
    collapsed = re.sub(" +", " ", letters_only)
    return stem_tokens(nltk.word_tokenize(collapsed), stemmer)

In [7]:
# `ingredients_clean_text`: each recipe's ingredient list joined with ", ".
train_df['ingredients_clean_text'] = [', '.join(x).strip() for x in train_df['ingredients']]

# `ingredients_string`: for every ingredient, replace non-letters with spaces,
# lemmatize, then join the ingredients of a recipe with single spaces.
# The shared `stemmer` instance is reused instead of constructing a new
# WordNetLemmatizer for every ingredient.
# NOTE: lemmatize() receives the whole ingredient phrase, not single words; in
# the preview output multi-word ingredients ("grape tomatoes") come through
# unchanged while single-word ones ("tomatoes") are reduced ("tomato").
train_df['ingredients_string'] = [
    ' '.join(stemmer.lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists).strip()
    for lists in train_df['ingredients']
]

In [8]:
# Preview train_df, now including ingredients_clean_text and ingredients_string.
train_df.head()

Unnamed: 0,cuisine,id,ingredients,raw_text,ingredients_clean_text,ingredients_string
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,...","romaine lettuce, black olives, grape tomatoes,...",romaine lettuce black olives grape tomatoes ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr...","plain flour, ground pepper, salt, tomatoes, gr...",plain flour ground pepper salt tomato ground b...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","eggs, pepper, salt, mayonaise, cooking oil, gr...","eggs, pepper, salt, mayonaise, cooking oil, gr...",egg pepper salt mayonaise cooking oil green ch...
3,indian,22213,"[water, vegetable oil, wheat, salt]","water, vegetable oil, wheat, salt","water, vegetable oil, wheat, salt",water vegetable oil wheat salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","black pepper, shallots, cornflour, cayenne pep...","black pepper, shallots, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...


In [9]:
# Same text columns as for train_df, built for the test recipes.
# `ingredients_clean_text`: each recipe's ingredient list joined with ", ".
test_df['ingredients_clean_text'] = [', '.join(x).strip() for x in test_df['ingredients']]

# `ingredients_string`: for every ingredient, replace non-letters with spaces,
# lemmatize, then join the ingredients of a recipe with single spaces.
# The shared `stemmer` instance is reused instead of constructing a new
# WordNetLemmatizer for every ingredient.
test_df['ingredients_string'] = [
    ' '.join(stemmer.lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists).strip()
    for lists in test_df['ingredients']
]

In [10]:
# Preview test_df, now including ingredients_clean_text and ingredients_string.
test_df.head()

Unnamed: 0,id,ingredients,ingredients_clean_text,ingredients_string
0,18009,"[baking powder, eggs, all-purpose flour, raisi...","baking powder, eggs, all-purpose flour, raisin...",baking powder egg all purpose flour raisin mil...
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...","sugar, egg yolks, corn starch, cream of tartar...",sugar egg yolks corn starch cream of tartar ba...
2,41580,"[sausage links, fennel bulb, fronds, olive oil...","sausage links, fennel bulb, fronds, olive oil,...",sausage links fennel bulb frond olive oil cuba...
3,29752,"[meat cuts, file powder, smoked sausage, okra,...","meat cuts, file powder, smoked sausage, okra, ...",meat cuts file powder smoked sausage okra shri...
4,35687,"[ground black pepper, salt, sausage casings, l...","ground black pepper, salt, sausage casings, le...",ground black pepper salt sausage casings leek ...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# TF-IDF over unigrams and bigrams, tokenized by the custom lemmatizing
# `tokenize` function defined earlier in the notebook.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
)

In [13]:
# Fit the vocabulary and IDF weights on the train and test texts together
# (training rows first, then test rows) and vectorize the whole corpus once.
# NOTE: the test texts therefore influence the fitted vocabulary/IDF.
corpus = train_df['ingredients_string'].tolist() + test_df['ingredients_string'].tolist()
corpus_features = vectorizer.fit_transform(corpus)

# The rows of corpus_features are already the transformed train/test documents
# in corpus order, so slice them instead of tokenizing and transforming every
# document a second time.
train_features = corpus_features[:len(train_df)]
test_features  = corpus_features[len(train_df):]

In [14]:
from sklearn import preprocessing

# Map each cuisine name to an integer class id. Despite its name,
# `one_hot_encoder` is a LabelEncoder (one integer per class, not one-hot
# vectors); it is used again later to decode the predictions.
one_hot_encoder = preprocessing.LabelEncoder()
train_df['cuisine_encode'] = one_hot_encoder.fit_transform(train_df['cuisine'])

In [15]:
# Preview train_df with the new integer `cuisine_encode` column.
train_df.head()

Unnamed: 0,cuisine,id,ingredients,raw_text,ingredients_clean_text,ingredients_string,cuisine_encode
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,...","romaine lettuce, black olives, grape tomatoes,...",romaine lettuce black olives grape tomatoes ga...,6
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr...","plain flour, ground pepper, salt, tomatoes, gr...",plain flour ground pepper salt tomato ground b...,16
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","eggs, pepper, salt, mayonaise, cooking oil, gr...","eggs, pepper, salt, mayonaise, cooking oil, gr...",egg pepper salt mayonaise cooking oil green ch...,4
3,indian,22213,"[water, vegetable oil, wheat, salt]","water, vegetable oil, wheat, salt","water, vegetable oil, wheat, salt",water vegetable oil wheat salt,7
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","black pepper, shallots, cornflour, cayenne pep...","black pepper, shallots, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...,7


In [16]:
# Hold out 20% of the labelled recipes for validation. The first
# len(train_df) rows of corpus_features are the training recipes, because the
# corpus was built as train texts followed by test texts.
# NOTE: `rng` is a stateful RandomState, so re-running this cell without
# re-creating `rng` first produces a different split.
train_xgb, test_xgb, train_target_xgb, test_target_xgb = train_test_split(
    corpus_features[:len(train_df)],
    train_df['cuisine_encode'],
    test_size=0.2,
    random_state=rng,
)

In [17]:
# Wrap the two splits in xgboost DMatrix objects together with their encoded
# cuisine labels. Note that `dtest` is the held-out part of the training data,
# not the features of test_df.
dtrain = xgb.DMatrix(train_xgb, label=train_target_xgb)
dtest  = xgb.DMatrix(test_xgb, label=test_target_xgb)

# Train the model using xgboost (new parameters)

In [None]:
# params
# param = {'max_depth':10, 'eta':0.1, 'silent':1, 'objective':'multi:softmax', 'num_class': 20 , 'eval_metric': 'merror'}

In [20]:
# new params: booster settings passed to xgb.train (multi-class softmax
# objective, evaluated with the multi-class error rate `merror`)
param = dict(
    max_depth=5,
    eta=0.09,
    silent=1,
    subsample=0.65,
    objective='multi:softmax',
    num_class=20,
    eval_metric='merror',
)

In [21]:
# Boost for a fixed number of rounds; the error on every watchlist entry is
# reported after each round (see the log below).
num_round = 2000
watchlist = [(dtrain, 'train'), (dtest, 'evals')]
gbm = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)

[0]	train-merror:0.389924	evals-merror:0.415085
[1]	train-merror:0.347622	evals-merror:0.373350
[2]	train-merror:0.329583	evals-merror:0.351603
[3]	train-merror:0.317892	evals-merror:0.341546
[4]	train-merror:0.307552	evals-merror:0.332495
[5]	train-merror:0.300072	evals-merror:0.329855
[6]	train-merror:0.295735	evals-merror:0.325204
[7]	train-merror:0.292184	evals-merror:0.321307
[8]	train-merror:0.290173	evals-merror:0.320176
[9]	train-merror:0.286810	evals-merror:0.318919
[10]	train-merror:0.283227	evals-merror:0.314393
[11]	train-merror:0.280587	evals-merror:0.312508
[12]	train-merror:0.278230	evals-merror:0.311376
[13]	train-merror:0.276439	evals-merror:0.309239
[14]	train-merror:0.275087	evals-merror:0.306851
[15]	train-merror:0.270719	evals-merror:0.304840
[16]	train-merror:0.269116	evals-merror:0.306097
[17]	train-merror:0.267073	evals-merror:0.303960
[18]	train-merror:0.264810	evals-merror:0.302200
[19]	train-merror:0.263428	evals-merror:0.302828
[20]	train-merror:0.260913	eva

# Predict using gbm model

In [22]:
# Predict on the TF-IDF features of test_df (no labels are attached to this
# DMatrix) with the booster trained above.
test_matrix = xgb.DMatrix(test_features)
test_predicted = gbm.predict(test_matrix)

In [23]:
# Cast the predictions to int32 and map them back to cuisine names with the
# LabelEncoder that was fitted on train_df['cuisine'].
test_predicted = np.array(test_predicted, dtype='int32')
predicted_labels = one_hot_encoder.inverse_transform(test_predicted)

# Submit result

In [24]:
# Pair each test recipe id with its predicted cuisine and write the result to
# a CSV file (without the DataFrame index) in the working directory.
submit_df = pd.DataFrame({'id': test_df['id'], 'cuisine': predicted_labels})
submit_df.to_csv('result_xgboost_new.csv', index=False)

In [None]:
# List the working directory -- presumably to confirm the CSV was written.
%ls

This submission obtains a score of 0.78429.