In [1]:
import numpy as np
import pandas as pd
import ngram
from nlp_utils import stopwords, english_stemmer, stem_tokens, getTFV
import cPickle
import config

import re
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import decomposition, pipeline, metrics, grid_search
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from nltk.stem.porter import *
from nltk.metrics import edit_distance

from utility import correct_string

In [2]:
# Read the competition CSVs. The train/test/description files are decoded as
# ISO-8859-1; the attributes file is read with pandas' default encoding.
latin1 = 'ISO-8859-1'
df_train = pd.read_csv('input/train.csv', encoding=latin1)
df_test = pd.read_csv('input/test.csv', encoding=latin1)
df_pro_desc = pd.read_csv('input/product_descriptions.csv', encoding=latin1)
df_attr = pd.read_csv('input/attributes.csv')

# Keep only the "MFG Brand Name" attribute rows and expose the value as `brand`.
brand_rows = df_attr[df_attr.name == "MFG Brand Name"]
df_brand = brand_rows[["product_uid", "value"]].rename(columns={"value": "brand"})

# Regression target, training-set size and the test ids used for submissions.
y_train = df_train['relevance'].values
num_train = df_train.shape[0]
id_test = df_test['id']

In [3]:
# Print the column / dtype / non-null summary of both frames.
# info() prints its report and returns None, hence the (None, None) cell value.
df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74067 entries, 0 to 74066
Data columns (total 5 columns):
id               74067 non-null int64
product_uid      74067 non-null int64
product_title    74067 non-null object
search_term      74067 non-null object
relevance        74067 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 3.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 166693 entries, 0 to 166692
Data columns (total 4 columns):
id               166693 non-null int64
product_uid      166693 non-null int64
product_title    166693 non-null object
search_term      166693 non-null object
dtypes: int64(2), object(2)
memory usage: 6.4+ MB


(None, None)

# merge features

## 1. common word count feature

In [4]:
# Load the precomputed common-word-count feature matrices from the feature directory.
common_word_train_path = config.path_features + 'common_word_feat_train.txt'
common_word_test_path = config.path_features + 'common_word_feat_test.txt'
X_common_word_feat_train = np.loadtxt(common_word_train_path)
X_common_word_feat_test = np.loadtxt(common_word_test_path)

In [5]:
# Show the (rows, columns) of both matrices to confirm train and test share the same feature width.
X_common_word_feat_train.shape, X_common_word_feat_test.shape

((74067, 17), (166693, 17))

In [None]:
# Peek at the common-word features of the test row at index 1 (inspection only).
X_common_word_feat_test[1,:]

## 2. jaccard coef feature

In [6]:
# Load the jaccard/dice distance feature matrices.
# NOTE(review): in the recorded run this raised IOError because the train file
# was not present in the feature directory, and these arrays are commented out
# of the merge step. Regenerate the files or remove this cell before a
# Restart & Run All.
X_jaccard_feat_train = np.loadtxt(config.path_features + 'jaccard_dice_dist_feat_train.txt')
X_jaccard_feat_test = np.loadtxt(config.path_features + 'jaccard_dice_dist_feat_test.txt')

IOError: [Errno 2] No such file or directory: './features/jaccard_dice_dist_feat_train.txt'

In [7]:
# Shape check for the jaccard/dice features; raises NameError when the
# load of these arrays did not succeed.
X_jaccard_feat_train.shape, X_jaccard_feat_test.shape

NameError: name 'X_jaccard_feat_train' is not defined

## 3. similarity feature (word2vec, cosine sim)

In [8]:
X_sim_feat_train = np.loadtxt(config.path_features + 'X_similiarity_additional_train.txt')
X_sim_feat_test = np.loadtxt(config.path_features + 'X_similiarity_additional_test.txt')
print X_sim_feat_train.shape, X_sim_feat_test.shape

(74067, 6) (166693, 6)


## 4. counts feature

In [9]:
X_train_count = np.loadtxt(config.path_features + 'train_counts.txt')
X_test_count = np.loadtxt(config.path_features + 'test_counts.txt')
print X_train_count.shape, X_test_count.shape

(74067, 9) (166693, 9)


## 5-1. extended query count features (top 10)

In [10]:
X_extquery_count_feat_train = np.loadtxt(config.path_features + 'train_ext_counts_top10.txt')
X_extquery_count_feat_test = np.loadtxt(config.path_features + 'test_ext_counts_top10.txt')
print type(X_extquery_count_feat_train)
print X_extquery_count_feat_train.shape, X_extquery_count_feat_test.shape

<type 'numpy.ndarray'>
(74067, 6) (166693, 6)


## 5-2. extended query count features (top 15)

In [None]:
X_extquery_count_feat_train = np.loadtxt(config.path_features + 'train_ext_counts_top15.txt')
X_extquery_count_feat_test = np.loadtxt(config.path_features + 'test_ext_counts_top15.txt')
print type(X_extquery_count_feat_train)
print X_extquery_count_feat_train.shape, X_extquery_count_feat_test.shape

## 6. char similarity feature

In [11]:
X_char_sim_train = np.loadtxt(config.path_features + 'ssfeas4train.txt')
X_char_sim_test = np.loadtxt(config.path_features + 'ssfeas4test.txt')
print X_char_sim_train.shape, X_char_sim_test.shape

(74067, 27) (166693, 27)


In [12]:
# Merge all features: stack every loaded feature family column-wise into one
# design matrix per split. The jaccard/dice block stays disabled.
train_feature_blocks = [
    X_common_word_feat_train,
    # X_jaccard_feat_train,
    X_sim_feat_train,
    X_train_count,
    X_extquery_count_feat_train,
    X_char_sim_train,
]
test_feature_blocks = [
    X_common_word_feat_test,
    # X_jaccard_feat_test,
    X_sim_feat_test,
    X_test_count,
    X_extquery_count_feat_test,
    X_char_sim_test,
]
X_train = np.hstack(train_feature_blocks)
X_test = np.hstack(test_feature_blocks)

In [13]:
# Confirm the merged train and test matrices have the same number of feature columns.
X_train.shape, X_test.shape

((74067, 65), (166693, 65))

# Evaluation Method

In [14]:
from sklearn.metrics import mean_squared_error, make_scorer

def fmean_squarded_error(ground_truth, prediction):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments are forwarded unchanged to sklearn's ``mean_squared_error``;
    the square root of that value is returned.
    """
    return mean_squared_error(ground_truth, prediction) ** 0.5


# Scorer for model selection. greater_is_better=False makes sklearn negate the
# value, so reported CV scores are negative RMSEs (closer to zero is better).
RMSE = make_scorer(fmean_squarded_error, greater_is_better=False)

# Train model: Random Forest

In [15]:
# Grid-search a random forest (wrapped in a one-step pipeline) with 5-fold CV,
# scored with the RMSE scorer.
# The forest is seeded so the CV score and the fitted model are reproducible
# across kernel restarts; without random_state every run grows different trees.
rfr = RandomForestRegressor(random_state=2016)
clf = pipeline.Pipeline([('rfr', rfr)])
# Earlier, wider search space; kept for reference and not passed to the search below.
param_grid_old = {'rfr__n_estimators' : list(range(320, 400 ,1)), 'rfr__max_depth': list(range(8,10,1))}
# The search is currently pinned to a single candidate.
param_grid = {'rfr__n_estimators' : [260],#list(range(109,110,1)), 
              'rfr__max_depth': [8], #list(range(7,8,1))
            }
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid,
                                 n_jobs = 2, cv = 5, verbose = 1, scoring=RMSE)
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
# RMSE is built with greater_is_better=False, so this value is the negated RMSE.
print("Best CV score:")
print(model.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters found by grid search:
{'rfr__n_estimators': 260, 'rfr__max_depth': 8}
Best CV score:
-0.468753424833


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed: 18.4min finished


In [None]:
# Variant of the random-forest search: 350 trees evaluated with 10-fold CV.
# NOTE(review): this rebinds `model`, so when this cell is run it replaces the
# previously fitted search as the model used by the predict cell. The recorded
# run left this cell unexecuted.
# The forest is seeded so the CV score and the fitted model are reproducible
# across kernel restarts; without random_state every run grows different trees.
rfr = RandomForestRegressor(random_state=2016)
clf = pipeline.Pipeline([('rfr', rfr)])
# Earlier, wider search space; kept for reference and not passed to the search below.
param_grid_old = {'rfr__n_estimators' : list(range(320, 400 ,1)), 'rfr__max_depth': list(range(8,10,1))}
# The search is currently pinned to a single candidate.
param_grid = {'rfr__n_estimators' : [350],#list(range(109,110,1)), 
              'rfr__max_depth': [8], #list(range(7,8,1))
            }
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid,
                                 n_jobs = 2, cv = 10, verbose = 1, scoring=RMSE)
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
# RMSE is built with greater_is_better=False, so this value is the negated RMSE.
print("Best CV score:")
print(model.best_score_)

## predict

In [16]:
# Score the test matrix with the fitted grid-search object and write the
# (id, relevance) submission file without the index column.
y_pred = model.predict(X_test)
rf_submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
rf_submission.to_csv('submission/rf_fe_tuned_20160222.csv', index=False)

# Xgboost

In [18]:
# Train a boosted-tree regressor with xgboost's native API and predict the test set.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Booster parameters. 'n_estimators' was removed from this dict: it is a
# scikit-learn-wrapper argument that xgb.train() does not consume. The number
# of trees is set only by the boosting-round count passed to xgb.train() below,
# so the old 380.0 entry was misleading.
params = {
    'colsample_bytree': 0.7,
    #'silent': 1,
    'eval_metric': 'rmse',
    'nthread': 8,
    'min_child_weight': 4.0,
    'subsample': 0.55,
    'eta': 0.05,
    'objective': 'reg:linear',
    'seed': 2016,
    'max_depth': 7,
    'gamma': 0.75}

# Settings for the (currently disabled) cross-validated round search below.
cv_nround=1000
cv_nfold=10
#bst_cv = xgb.cv(params, dtrain, nfold=cv_nfold, num_boost_round=cv_nround
#                , early_stopping_rounds=10, show_progress=True)

# Number of boosting rounds used for the final fit.
# NOTE(review): presumably the stopping round found by the commented-out
# xgb.cv run above — confirm before changing any parameter.
num_boost_round = 646
clf = xgb.train(params, dtrain, num_boost_round)
y_pred = clf.predict(dtest)
def output(x):
    """Clamp a predicted relevance to the closed interval [1, 3].

    Values below 1 become 1, values above 3 become 3, and anything else
    (including the boundaries themselves) is returned unchanged.
    """
    return min(max(x, 1), 3)

# Clamp every xgboost prediction with output(), then write the
# (id, relevance) submission file without the index column.
result = list(map(output, y_pred))
xgb_submission = pd.DataFrame({"id": id_test, "relevance": result})
xgb_submission.to_csv('submission/xgb_fe_20160222_tune.csv', index=False)