In [1]:
import os
mingw_path = 'C:\\mingw-w64\\x86_64-6.3.0-posix-seh-rt_v5-rev2\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold

In [3]:
train = pd.read_csv('train.csv', encoding="latin-1")
test = pd.read_csv('test.csv', encoding="latin-1")
num_train = train.shape[0]
num_train

74067

In [4]:
features = pd.read_csv('ready.csv')
features.columns

Index(['Unnamed: 0', 'id', 'product_title', 'product_uid', 'relevance',
       'search_term', 'product_description', 'product_colour', 'brand',
       'product_weight', 'product_depth', 'product_height', 'product_width',
       'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04', 'Bullet05',
       'num_words_query', 'num_words_title', 'num_words_brand',
       'num_words_colour', 'num_words_weight', 'num_words_height',
       'num_words_depth', 'num_words_width', 'query_in_title',
       'common_words_query_and_title', 'query_last_word_in_title',
       'title_seq_match_score', 'title_levenshtein_ratio',
       'title_Jaccard_dist_norm', 'ratio_title', 'query_in_description',
       'common_words_query_and_desc', 'query_last_word_in_desc',
       'desc_sequence_match_score', 'desc_levenshtein_ratio',
       'desc_Jaccard_dist_norm', 'ratio_description', 'query_in_brand',
       'common_words_query_and_brand', 'brand_sequence_match_score',
       'brand_levenshtein_ratio', 'brand_Jaccard_di

In [5]:
features = features.drop(['Unnamed: 0', 'product_title', 'product_uid', 
       'search_term', 'product_description', 'product_colour', 'brand',
       'product_weight', 'product_depth', 'product_height', 'product_width',
       'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04', 'Bullet05',],axis=1)
features = features.fillna(0)
df_train = features.iloc[:num_train]
df_test = features.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance']
x_train = df_train.drop(['id','relevance'],axis=1)
x_test = df_test.drop(['id','relevance'],axis=1)

In [6]:
print(len(x_train), len(y_train), len(x_test), len(features), len(id_test))

74067 74067 166693 240760 166693


In [7]:
# specify parameters and distributions to sample from
clf = xgb.XGBRegressor()
params = {
            'learning_rate': np.linspace(0.01,0.1,10).tolist(),
            'gamma': np.linspace(0.0,1.0,10).tolist(),
            'max_depth': np.arange(1,31,5).tolist(),
            'min_child_weight':np.arange(1,10,1).tolist(),
            'subsample':np.linspace(0.1,1.0,10).tolist(),
            'colsample_bytree':np.linspace(0.5,1.0,10).tolist(),
            'colsample_bylevel':np.linspace(0.5,1.0,10).tolist(),
            'reg_lambda': np.linspace(0.01,1.0,10).tolist(),
            'reg_alpha': np.linspace(0.0,1.0,10).tolist()
          }

k_fold = KFold(3, random_state=1)

grid_search = RandomizedSearchCV(clf, param_distributions=params,scoring='neg_mean_squared_error', cv=k_fold, n_iter=50)
grid_search.fit(x_train, y_train)
result = grid_search.cv_results_
result

{'mean_fit_time': array([  0.89962133,  18.68466194,  12.4118197 ,   6.94693756,
         18.1895837 ,  17.52845399,  25.60686556,  10.78867722,
         13.93283709,  11.37902196,  13.5779504 ,  11.11828907,
         12.86379735,  18.89875778,  17.61656721,  17.42851154,
         22.90115746,  11.18425528,  14.06277466,   9.21448859,
          7.59294955,   9.37884164,  30.14523419,  14.23145111,
          6.32101885,   0.89597297,   9.56112806,  16.22005121,
          6.41970658,  17.70391448,  35.75126664,  21.31314445,
          1.38791068,   1.11761864,  11.81033039,  11.77815032,
          9.33592804,   2.45073247,   1.07344858,   7.78778108,
          0.91466951,  11.8485597 ,  15.12035004,   0.98120817,
          0.82673152,   2.45225135,   3.15761654,  10.90143005,
          5.42488321,   6.5989635 ]),
 'mean_score_time': array([ 0.06838139,  0.36909691,  0.23517179,  0.18579729,  0.20848012,
         0.23416607,  0.40212258,  0.24332388,  0.22410671,  0.20868087,
         0.2

In [8]:
optimized_list = []
optimized = {}

#take the best CV result
for i in range(1,51):
    best = np.flatnonzero(result['rank_test_score'] == i)
    for b in best:
        rmse = np.sqrt(abs(result['mean_test_score'][b]))
        optimized = {'rank': i, 'rmse': rmse,
                     'gamma': result['param_gamma'].data[b],
                     'learning_rate': result['param_learning_rate'].data[b],
                     'reg_lambda': result['param_reg_lambda'].data[b],
                     'reg_alpha': result['param_reg_alpha'].data[b],
                     'max_depth': result['param_max_depth'].data[b],
                     'colsample_bytree': result['param_colsample_bytree'].data[b],
                     'colsample_bylevel': result['param_colsample_bylevel'].data[b],
                     'subsample': result['param_subsample'].data[b],
                     'min_child_weight': result['param_min_child_weight'].data[b]}        
        optimized_list.append(optimized)
#optimized

In [9]:
df = pd.DataFrame(columns=['rank','rmse','gamma','learning_rate','reg_lambda','reg_alpha','max_depth','colsample_bytree','colsample_bylevel','subsample','min_child_weight'])
df = df.append(optimized_list)
df.to_csv('optimisation_xgboost.csv')
df

Unnamed: 0,rank,rmse,gamma,learning_rate,reg_lambda,reg_alpha,max_depth,colsample_bytree,colsample_bylevel,subsample,min_child_weight
0,1.0,0.475556,0.333333,0.06,0.56,0.888889,16.0,0.555556,0.555556,1.0,6.0
1,2.0,0.475844,1.0,0.09,0.56,1.0,16.0,0.611111,0.722222,1.0,4.0
2,3.0,0.475937,0.666667,0.09,0.67,0.555556,11.0,0.555556,0.722222,1.0,2.0
3,4.0,0.475947,0.0,0.06,0.78,1.0,11.0,0.722222,0.944444,0.7,4.0
4,5.0,0.476392,0.888889,0.05,1.0,0.666667,6.0,0.888889,0.611111,1.0,2.0
5,6.0,0.476416,0.444444,0.07,0.01,0.888889,16.0,0.666667,0.666667,1.0,7.0
6,7.0,0.476434,0.333333,0.06,0.89,0.444444,6.0,0.666667,1.0,0.7,9.0
7,8.0,0.476887,0.666667,0.07,0.56,0.888889,11.0,0.833333,0.722222,0.6,1.0
8,9.0,0.477263,0.333333,0.05,0.56,0.888889,16.0,0.833333,0.555556,0.4,5.0
9,10.0,0.477554,1.0,0.09,0.23,0.777778,11.0,0.833333,0.722222,0.6,3.0


In [10]:
score = []
def metric_scorer(model, x, y):
    y_pred = model.predict(x)
    exp_var = metrics.explained_variance_score(y,y_pred)
    mean_abs_err = metrics.mean_absolute_error(y,y_pred)
    mean_sq_err = metrics.mean_squared_error(y,y_pred)
    rms = np.sqrt(mean_sq_err)
    r2_sc = metrics.r2_score(y,y_pred)
    score.append((exp_var,mean_abs_err,mean_sq_err,rms,r2_sc))
    
    return rms

In [11]:
optimum_param = df.iloc[0][2:]
optimum_param = optimum_param.T
optimum_param

gamma                 0.333333
learning_rate         0.060000
reg_lambda            0.560000
reg_alpha             0.888889
max_depth            16.000000
colsample_bytree      0.555556
colsample_bylevel     0.555556
subsample             1.000000
min_child_weight      6.000000
Name: 0, dtype: float64

In [12]:
model = xgb.XGBRegressor(gamma=optimum_param['gamma'],
                         learning_rate=optimum_param['learning_rate'],
                         reg_lambda=optimum_param['reg_lambda'],
                         reg_alpha=optimum_param['reg_alpha'],
                         max_depth=int(optimum_param['max_depth']),
                         colsample_bytree=optimum_param['colsample_bytree'],
                         colsample_bylevel=optimum_param['colsample_bylevel'],
                         subsample=optimum_param['subsample'],
                         min_child_weight=optimum_param['min_child_weight']
                        )
model.fit(x_train,y_train)
y_pred= model.predict(x_test)
#id_test = test.id
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission_xgboost.csv',index=False)

In [13]:
k_fold = KFold(10, random_state=1)
xbg_scores = cross_val_score(model, x_train, y_train, cv=k_fold, scoring=metric_scorer, n_jobs=1)

# In[6]:

# CV results
cv_results_df = pd.DataFrame(score)
cv_results_df.columns = ['expl_var','mae','mse','rmse','r2']
cv_results_df = cv_results_df.T
cv_results_df.columns = ['F- '+str(i+1) for i in cv_results_df.columns]
means = []
stds = []
for i in range(0,5):
    means.append(np.mean(cv_results_df.iloc[i].values))
    stds.append(np.std(cv_results_df.iloc[i].values))
cv_results_df['mean'] = means
cv_results_df['std'] = stds
cv_results_df.to_csv('CV_xgboost.csv')
cv_results_df

Unnamed: 0,F- 1,F- 2,F- 3,F- 4,F- 5,F- 6,F- 7,F- 8,F- 9,F- 10,mean,std
expl_var,0.205656,0.230447,0.241488,0.242358,0.226975,0.2554,0.235973,0.229167,0.172305,0.176172,0.221594,0.026623
mae,0.398054,0.380329,0.374238,0.373989,0.370209,0.367456,0.371775,0.373086,0.374401,0.386092,0.376963,0.008604
mse,0.235486,0.220477,0.212049,0.213976,0.210358,0.206253,0.210653,0.219212,0.222379,0.236383,0.218723,0.009835
rmse,0.485269,0.46955,0.460487,0.462576,0.458649,0.454151,0.458969,0.4682,0.471571,0.486192,0.467561,0.010436
r2,0.186035,0.221007,0.235171,0.238805,0.223602,0.254048,0.233477,0.218843,0.151351,0.152708,0.211505,0.034077
