In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd

from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV

In [2]:
# Shared, seeded random state so the train/validation split is repeatable on a fresh run.
rng = np.random.RandomState(seed=33)

In [3]:
# Load the train and test recipe sets from JSON files in the working directory.
train_df = pd.read_json("train.json")
test_df = pd.read_json("test.json")

# Preview goes last: a notebook cell only renders its final expression, so a
# `.head()` placed before another statement is silently discarded.
train_df.head()

In [4]:
# Flatten each recipe's ingredient list into one comma-separated string.
train_df['raw_text'] = train_df['ingredients'].apply(", ".join)
train_df['raw_text'].head()

0    romaine lettuce, black olives, grape tomatoes,...
1    plain flour, ground pepper, salt, tomatoes, gr...
2    eggs, pepper, salt, mayonaise, cooking oil, gr...
3                    water, vegetable oil, wheat, salt
4    black pepper, shallots, cornflour, cayenne pep...
Name: raw_text, dtype: object

In [5]:
import nltk
import re
from nltk.stem import WordNetLemmatizer

In [6]:
# NOTE: despite the name, this is a WordNet *lemmatizer*, not a stemmer.
# It is the shared instance read by `tokenize` below.
stemmer = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
    """Return a new list with `stemmer.lemmatize` applied to each token, in order.

    tokens  -- iterable of token strings
    stemmer -- any object exposing a `lemmatize(token)` method
    """
    return [stemmer.lemmatize(token) for token in tokens]

def tokenize(text):
    """Split `text` into lemmatized word tokens.

    Every character that is not an ASCII letter is replaced by a space, runs of
    spaces are collapsed, the result is tokenized with nltk and each token is
    passed through the module-level `stemmer`.
    """
    letters_only = re.sub('[^A-Za-z]', ' ', text)
    collapsed = re.sub(" +", " ", letters_only)
    return stem_tokens(nltk.word_tokenize(collapsed), stemmer)

In [7]:
# Comma-joined ingredient list, kept as a human-readable column.
train_df['ingredients_clean_text'] = [', '.join(x).strip() for x in train_df['ingredients']]

# Space-joined, letters-only text that feeds the vectorizer.
# Reuses the shared `stemmer` (a WordNetLemmatizer) instead of constructing a
# fresh lemmatizer for every single ingredient.
# Note: each ingredient phrase is handed to `lemmatize` as a whole string.
train_df['ingredients_string'] = [
    ' '.join(
        stemmer.lemmatize(re.sub('[^A-Za-z]', ' ', ingredient))
        for ingredient in ingredients
    ).strip()
    for ingredients in train_df['ingredients']
]

In [8]:
# Preview the training frame, including the text columns added in the previous cell.
train_df.head()

Unnamed: 0,cuisine,id,ingredients,raw_text,ingredients_clean_text,ingredients_string
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,...","romaine lettuce, black olives, grape tomatoes,...",romaine lettuce black olives grape tomatoes ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr...","plain flour, ground pepper, salt, tomatoes, gr...",plain flour ground pepper salt tomato ground b...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","eggs, pepper, salt, mayonaise, cooking oil, gr...","eggs, pepper, salt, mayonaise, cooking oil, gr...",egg pepper salt mayonaise cooking oil green ch...
3,indian,22213,"[water, vegetable oil, wheat, salt]","water, vegetable oil, wheat, salt","water, vegetable oil, wheat, salt",water vegetable oil wheat salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","black pepper, shallots, cornflour, cayenne pep...","black pepper, shallots, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...


In [9]:
# Same text preparation as for the training frame, applied to the test recipes.
test_df['ingredients_clean_text'] = [', '.join(x).strip() for x in test_df['ingredients']]

# Reuses the shared `stemmer` (a WordNetLemmatizer) instead of constructing a
# fresh lemmatizer for every single ingredient.
# Note: each ingredient phrase is handed to `lemmatize` as a whole string.
test_df['ingredients_string'] = [
    ' '.join(
        stemmer.lemmatize(re.sub('[^A-Za-z]', ' ', ingredient))
        for ingredient in ingredients
    ).strip()
    for ingredients in test_df['ingredients']
]

In [10]:
# Preview the test frame, including the text columns added in the previous cell.
test_df.head()

Unnamed: 0,id,ingredients,ingredients_clean_text,ingredients_string
0,18009,"[baking powder, eggs, all-purpose flour, raisi...","baking powder, eggs, all-purpose flour, raisin...",baking powder egg all purpose flour raisin mil...
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...","sugar, egg yolks, corn starch, cream of tartar...",sugar egg yolks corn starch cream of tartar ba...
2,41580,"[sausage links, fennel bulb, fronds, olive oil...","sausage links, fennel bulb, fronds, olive oil,...",sausage links fennel bulb frond olive oil cuba...
3,29752,"[meat cuts, file powder, smoked sausage, okra,...","meat cuts, file powder, smoked sausage, okra, ...",meat cuts file powder smoked sausage okra shri...
4,35687,"[ground black pepper, salt, sausage casings, l...","ground black pepper, salt, sausage casings, le...",ground black pepper salt sausage casings leek ...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# TF-IDF over word unigrams and bigrams, tokenized by the custom `tokenize` function.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
)

In [13]:
# Fit the vocabulary and IDF weights on train + test text together so both
# sets share one feature space.
corpus = train_df['ingredients_string'].tolist() + test_df['ingredients_string'].tolist()
corpus_features = vectorizer.fit_transform(corpus)

# The corpus is the train rows followed by the test rows, so slice the matrix
# that was just computed instead of tokenizing and transforming every recipe a
# second time (fit_transform is equivalent to fit followed by transform).
train_features = corpus_features[:len(train_df)]
test_features  = corpus_features[len(train_df):]

In [14]:
from sklearn import preprocessing

# NOTE: despite the variable name this is a LabelEncoder: it maps each cuisine
# string to a single integer code, it does not one-hot encode.
one_hot_encoder = preprocessing.LabelEncoder()
train_df['cuisine_encode'] = one_hot_encoder.fit_transform(train_df['cuisine'])

In [15]:
# Preview the training frame again, now with the integer `cuisine_encode` column.
train_df.head()

Unnamed: 0,cuisine,id,ingredients,raw_text,ingredients_clean_text,ingredients_string,cuisine_encode
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","romaine lettuce, black olives, grape tomatoes,...","romaine lettuce, black olives, grape tomatoes,...",romaine lettuce black olives grape tomatoes ga...,6
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","plain flour, ground pepper, salt, tomatoes, gr...","plain flour, ground pepper, salt, tomatoes, gr...",plain flour ground pepper salt tomato ground b...,16
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","eggs, pepper, salt, mayonaise, cooking oil, gr...","eggs, pepper, salt, mayonaise, cooking oil, gr...",egg pepper salt mayonaise cooking oil green ch...,4
3,indian,22213,"[water, vegetable oil, wheat, salt]","water, vegetable oil, wheat, salt","water, vegetable oil, wheat, salt",water vegetable oil wheat salt,7
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","black pepper, shallots, cornflour, cayenne pep...","black pepper, shallots, cornflour, cayenne pep...",black pepper shallot cornflour cayenne pepper ...,7


In [16]:
# Hold out 20% of the labelled rows for evaluation. `corpus_features` was built
# from the train texts followed by the test texts, so the first len(train_df)
# rows are the labelled ones.
train_xgb, test_xgb, train_target_xgb, test_target_xgb = train_test_split(
    corpus_features[:len(train_df)],
    train_df['cuisine_encode'],
    test_size=0.2,
    random_state=rng,
)

In [17]:
# Wrap the training and held-out splits, with their labels, as xgboost DMatrix objects.
dtrain = xgb.DMatrix(data=train_xgb, label=train_target_xgb)
dtest  = xgb.DMatrix(data=test_xgb, label=test_target_xgb)

# Train with xgboost (new configuration: max_depth=10, 300 rounds)

In [18]:
# Booster parameters for the multi-class softmax model.
param = {
    'max_depth': 10,
    'eta': 0.1,
    'silent': 1,
    'objective': 'multi:softmax',
    # Derived from the fitted label encoder rather than hard-coded, so it always
    # matches the number of distinct cuisines found in train_df['cuisine'].
    'num_class': len(one_hot_encoder.classes_),
    'eval_metric': 'merror',
}

In [19]:
# Report the error on both the training split and the held-out split after each round.
watchlist = [(dtrain, 'train'), (dtest, 'evals')]
num_round = 300
gbm = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
)

[0]	train-merror:0.290110	evals-merror:0.341546
[1]	train-merror:0.250731	evals-merror:0.311754
[2]	train-merror:0.231057	evals-merror:0.296417
[3]	train-merror:0.220591	evals-merror:0.291389
[4]	train-merror:0.210629	evals-merror:0.285607
[5]	train-merror:0.202552	evals-merror:0.280075
[6]	train-merror:0.197241	evals-merror:0.279573
[7]	train-merror:0.192244	evals-merror:0.276053
[8]	train-merror:0.188221	evals-merror:0.274419
[9]	train-merror:0.183098	evals-merror:0.270145
[10]	train-merror:0.178855	evals-merror:0.265493
[11]	train-merror:0.175084	evals-merror:0.264991
[12]	train-merror:0.172193	evals-merror:0.264613
[13]	train-merror:0.169396	evals-merror:0.263859
[14]	train-merror:0.166316	evals-merror:0.262225
[15]	train-merror:0.162576	evals-merror:0.261722
[16]	train-merror:0.160030	evals-merror:0.259962
[17]	train-merror:0.156416	evals-merror:0.258454
[18]	train-merror:0.153870	evals-merror:0.258454
[19]	train-merror:0.151042	evals-merror:0.257825
[20]	train-merror:0.148213	eva

# Predict using gbm model

In [20]:
# Score the TF-IDF features of the unlabelled test recipes with the trained booster.
test_matrix = xgb.DMatrix(data=test_features)
test_predicted = gbm.predict(test_matrix)

In [21]:
# Cast the predictions to int32, then map the integer codes back to cuisine
# names with the fitted encoder.
test_predicted = test_predicted.astype('int32')
predicted_labels = one_hot_encoder.inverse_transform(test_predicted)

# Submit result

In [22]:
# Build the submission table. The column order is pinned explicitly so the CSV
# header is always `id,cuisine` and does not depend on how the running
# pandas/Python version orders dict keys.
submit_df = pd.DataFrame(
    {'id': test_df['id'], 'cuisine': predicted_labels},
    columns=['id', 'cuisine'],
)
submit_df.to_csv('result_xgboost_new.csv', index=False)

# Train with xgboost (alternative configuration: max_depth=6, 200 rounds)

In [None]:
import scipy

In [None]:
# Convert the training split to a SciPy CSR matrix.
# NOTE(review): `scr` is not referenced by any later visible cell; the DMatrix
# in the next cell is built from `train_xgb` directly. Confirm whether this
# cell is still needed.
scr = scipy.sparse.csr_matrix(train_xgb)

In [None]:
# Wrap the training and held-out splits, with their labels, as xgboost DMatrix objects.
dtrain = xgb.DMatrix(data=train_xgb, label=train_target_xgb)
dtest  = xgb.DMatrix(data=test_xgb, label=test_target_xgb)

In [None]:
# Display the DMatrix object as a quick check that it was constructed.
dtrain

In [None]:
# Booster parameters for the shallower (max_depth=6) multi-class softmax model.
param = {
    'max_depth': 6,
    'eta': 0.1,
    'silent': 1,
    'objective': 'multi:softmax',
    # Derived from the fitted label encoder rather than hard-coded, so it always
    # matches the number of distinct cuisines found in train_df['cuisine'].
    'num_class': len(one_hot_encoder.classes_),
    'eval_metric': 'merror',
}

# Report the error on both the training split and the held-out split after each round.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
num_round = 200
bst = xgb.train(param, dtrain, num_round, watchlist)

# predict using bst model

In [None]:
# Score the TF-IDF test matrix produced by the shared `vectorizer`
# (`test_features`), so its columns line up with the features `bst` was trained
# on. The previously referenced `tfidf_test` is not defined in any visible
# cell and would raise NameError on a fresh kernel.
test_matrix = xgb.DMatrix(test_features)

test_predicted = bst.predict(test_matrix)

In [None]:
# Cast the predictions to int32, then map the integer codes back to cuisine
# names with the fitted encoder.
test_predicted = test_predicted.astype('int32')
predicted_labels = one_hot_encoder.inverse_transform(test_predicted)

In [None]:
# List the files in the current working directory (IPython magic).
%ls