In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
import warnings
warnings.filterwarnings('ignore')

In [14]:
train = pd.read_csv('train.csv', encoding="latin-1")
test = pd.read_csv('test.csv', encoding="latin-1")
num_train = train.shape[0]
num_train

74067

In [15]:
features = pd.read_csv('ready.csv')
features.columns

Index(['Unnamed: 0', 'id', 'product_title', 'product_uid', 'relevance',
       'search_term', 'product_description', 'product_colour', 'brand',
       'product_weight', 'product_depth', 'product_height', 'product_width',
       'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04', 'Bullet05',
       'num_words_query', 'num_words_title', 'num_words_brand',
       'num_words_colour', 'num_words_weight', 'num_words_height',
       'num_words_depth', 'num_words_width', 'query_in_title',
       'common_words_query_and_title', 'query_last_word_in_title',
       'title_seq_match_score', 'title_levenshtein_ratio',
       'title_Jaccard_dist_norm', 'ratio_title', 'query_in_description',
       'common_words_query_and_desc', 'query_last_word_in_desc',
       'desc_sequence_match_score', 'desc_levenshtein_ratio',
       'desc_Jaccard_dist_norm', 'ratio_description', 'query_in_brand',
       'common_words_query_and_brand', 'brand_sequence_match_score',
       'brand_levenshtein_ratio', 'brand_Jaccard_di

In [16]:
features = features.drop(['Unnamed: 0', 'product_title', 'product_uid', 
       'search_term', 'product_description', 'product_colour', 'brand',
       'product_weight', 'product_depth', 'product_height', 'product_width',
       'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04', 'Bullet05',],axis=1)
features = features.fillna(0)
df_train = features.iloc[:num_train]
df_test = features.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance']
x_train = df_train.drop(['id','relevance'],axis=1)
x_test = df_test.drop(['id','relevance'],axis=1)

In [17]:
print(len(x_train), len(y_train), len(x_test), len(features), len(id_test))

74067 74067 166693 240760 166693


In [18]:
score = []
def metric_scorer(model, x, y):
    y_pred = model.predict(x)
    y_pred_scaled = np.array([max(min(y,3.0),1.0) for y in y_pred])
    exp_var = metrics.explained_variance_score(y,y_pred)
    mean_abs_err = metrics.mean_absolute_error(y,y_pred)
    mean_sq_err = metrics.mean_squared_error(y,y_pred)
    rms = np.sqrt(mean_sq_err)
    r2_sc = metrics.r2_score(y,y_pred)
    score.append((exp_var,mean_abs_err,mean_sq_err,rms,r2_sc))
    
    return rms

In [19]:
# specify parameters and distributions to sample from
clf = Ridge()
params = {
            'alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0],
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'tol':np.linspace(0.0001,0.1,10).tolist(),
            'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'],
          }

k_fold = KFold(3, random_state=1)

grid_search = RandomizedSearchCV(clf, param_distributions=params,scoring='neg_mean_squared_error', cv=k_fold, n_iter=50)
grid_search.fit(x_train, y_train)
result = grid_search.cv_results_
result

{'mean_fit_time': array([ 0.54055071,  0.86544895,  0.25851647,  0.34541122,  0.76887965,
         0.3289005 ,  0.29204273,  0.92042303,  0.3192668 ,  0.37973285,
         0.35247087,  0.44224524,  0.47425524,  0.45800908,  1.00179728,
         0.53168313,  0.79856412,  0.42129978,  0.76146968,  0.29437637,
         0.32871596,  0.62070696,  0.76798709,  0.41901509,  1.06232897,
         0.36949547,  0.36286505,  0.29470952,  0.28720474,  0.34758019,
         0.30989059,  0.28670406,  0.31955997,  0.36776137,  0.36836775,
         0.39327971,  0.33757273,  0.34707928,  0.31889272,  0.28019873,
         0.63295245,  0.87739881,  0.33223605,  0.82908972,  0.31472548,
         0.36604269,  0.95117561,  0.32856647,  0.86344671,  0.25518235]),
 'mean_score_time': array([ 0.02218278,  0.02268354,  0.018531  ,  0.01651239,  0.02251609,
         0.02718623,  0.02451563,  0.02084812,  0.02272224,  0.0271349 ,
         0.02301677,  0.0299356 ,  0.02613211,  0.01574914,  0.02735305,
         0.01

In [20]:
optimized_list = []
optimized = {}

#take the best CV result
for i in range(1,51):
    best = np.flatnonzero(result['rank_test_score'] == i)
    for b in best:
        rmse = np.sqrt(abs(result['mean_test_score'][b]))
        optimized = {'rank': i, 'rmse': rmse,
                     'alpha': result['param_alpha'].data[b],
                     'fit_intercept': result['param_fit_intercept'].data[b],
                     'normalize': result['param_normalize'].data[b],
                     'tol': result['param_tol'].data[b],
                     'solver': result['param_solver'].data[b]}        
        optimized_list.append(optimized)
#optimized

In [21]:
df = pd.DataFrame(columns=['rank','rmse','alpha','fit_intercept','normalize','tol','solver'])
df = df.append(optimized_list)
df.to_csv('optimisation_ridge.csv')
df

Unnamed: 0,rank,rmse,alpha,fit_intercept,normalize,tol,solver
0,1.0,0.488251,5.0,True,False,0.0889,auto
1,2.0,0.488271,0.5,True,False,0.0334,svd
2,3.0,0.488281,0.01,True,False,0.0112,auto
3,3.0,0.488281,0.01,True,False,0.0778,cholesky
4,5.0,0.48838,0.05,True,True,0.0001,lsqr
5,6.0,0.488381,0.05,True,True,0.0556,svd
6,7.0,0.488435,0.01,True,True,0.0223,sparse_cg
7,8.0,0.488472,50.0,True,False,0.0889,cholesky
8,8.0,0.488472,50.0,True,False,0.0001,cholesky
9,8.0,0.488472,50.0,True,False,0.0334,cholesky


In [22]:
optimum_param = df.iloc[0][2:]
optimum_param = optimum_param.T
optimum_param

alpha                 5
fit_intercept      True
normalize         False
tol              0.0889
solver             auto
Name: 0, dtype: object

In [23]:
model = Ridge(alpha=optimum_param['alpha'],
              fit_intercept=optimum_param['fit_intercept'],
              normalize=optimum_param['normalize'],
              tol=optimum_param['tol'],
              solver=optimum_param['solver'])
model.fit(x_train,y_train)
y_pred= model.predict(x_test)
#id_test = test.id
y_pred = np.array([max(min(y,3.0),1.0) for y in y_pred])
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission_ridge.csv',index=False)

In [24]:
k_fold = KFold(10, random_state=1)
ridge_scores = cross_val_score(model, x_train, y_train, cv=k_fold, scoring=metric_scorer, n_jobs=1)

# In[6]:

# CV results
cv_results_df = pd.DataFrame(score)
cv_results_df.columns = ['expl_var','mae','mse','rmse','r2']
cv_results_df = cv_results_df.T
cv_results_df.columns = ['F- '+str(i+1) for i in cv_results_df.columns]
means = []
stds = []
for i in range(0,5):
    means.append(np.mean(cv_results_df.iloc[i].values))
    stds.append(np.std(cv_results_df.iloc[i].values))
cv_results_df['mean'] = means
cv_results_df['std'] = stds
cv_results_df.to_csv('CV_ridge.csv')
cv_results_df

Unnamed: 0,F- 1,F- 2,F- 3,F- 4,F- 5,F- 6,F- 7,F- 8,F- 9,F- 10,mean,std
expl_var,0.196191,0.190925,0.192155,0.200421,0.179284,0.211236,0.186055,0.177863,0.132586,0.132947,0.179966,0.025349
mae,0.405149,0.394586,0.389846,0.388727,0.386015,0.381679,0.388044,0.386277,0.388574,0.402393,0.391129,0.007056
mse,0.239629,0.232694,0.226537,0.226279,0.223875,0.218757,0.224867,0.237574,0.241265,0.258317,0.23298,0.011016
rmse,0.489519,0.482384,0.475959,0.475688,0.473154,0.467715,0.474201,0.487416,0.491188,0.508249,0.482547,0.011296
r2,0.171714,0.177841,0.182912,0.19504,0.173715,0.208823,0.181756,0.153408,0.079278,0.074086,0.159857,0.043811
