In [26]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

In [27]:
train = pd.read_csv('train.csv', encoding="latin-1")
test = pd.read_csv('test.csv', encoding="latin-1")
num_train = train.shape[0]
num_train

74067

In [28]:
features = pd.read_csv('ready.csv')
features.columns

Index(['Unnamed: 0', 'id', 'product_title', 'product_uid', 'relevance',
       'search_term', 'product_description', 'product_colour', 'brand',
       'product_weight', 'product_depth', 'product_height', 'product_width',
       'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04', 'Bullet05',
       'num_words_query', 'num_words_title', 'num_words_brand',
       'num_words_colour', 'num_words_weight', 'num_words_height',
       'num_words_depth', 'num_words_width', 'query_in_title',
       'common_words_query_and_title', 'query_last_word_in_title',
       'title_seq_match_score', 'title_levenshtein_ratio',
       'title_Jaccard_dist_norm', 'ratio_title', 'query_in_description',
       'common_words_query_and_desc', 'query_last_word_in_desc',
       'desc_sequence_match_score', 'desc_levenshtein_ratio',
       'desc_Jaccard_dist_norm', 'ratio_description', 'query_in_brand',
       'common_words_query_and_brand', 'brand_sequence_match_score',
       'brand_levenshtein_ratio', 'brand_Jaccard_di

In [29]:
features = features.drop(['Unnamed: 0', 'product_title', 'product_uid', 
       'search_term', 'product_description', 'product_colour', 'brand',
       'product_weight', 'product_depth', 'product_height', 'product_width',
       'Bullet01', 'Bullet02', 'Bullet03', 'Bullet04', 'Bullet05',],axis=1)
features = features.fillna(0)
df_train = features.iloc[:num_train]
df_test = features.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance']
x_train = df_train.drop(['id','relevance'],axis=1)
x_test = df_test.drop(['id','relevance'],axis=1)

In [30]:
print(len(x_train), len(y_train), len(x_test), len(features), len(id_test))

74067 74067 166693 240760 166693


In [31]:
score = []
def metric_scorer(model, x, y):
    y_pred = model.predict(x)
    y_pred_scaled = np.array([max(min(y,3.0),1.0) for y in y_pred])
    exp_var = metrics.explained_variance_score(y,y_pred)
    mean_abs_err = metrics.mean_absolute_error(y,y_pred)
    mean_sq_err = metrics.mean_squared_error(y,y_pred)
    rms = np.sqrt(mean_sq_err)
    r2_sc = metrics.r2_score(y,y_pred)
    score.append((exp_var,mean_abs_err,mean_sq_err,rms,r2_sc))
    
    return rms

In [32]:
# specify parameters and distributions to sample from
clf = LinearRegression()
params = {
            'fit_intercept': [True, False],
            'normalize': [True, False],
          }

k_fold = KFold(3, random_state=1)

grid_search = GridSearchCV(clf, param_grid=params,scoring='neg_mean_squared_error', cv=k_fold, n_jobs=4)
grid_search.fit(x_train, y_train)
result = grid_search.cv_results_
result

{'mean_fit_time': array([ 0.55919472,  0.67188954,  0.6770939 ,  0.58201249]),
 'mean_score_time': array([ 0.0341018 ,  0.0260423 ,  0.0208358 ,  0.02083357]),
 'mean_test_score': array([-0.23841816, -0.23841816, -0.26877853, -0.26877853]),
 'mean_train_score': array([-0.22983879, -0.22983879, -0.257054  , -0.257054  ]),
 'param_fit_intercept': masked_array(data = [True True False False],
              mask = [False False False False],
        fill_value = ?),
 'param_normalize': masked_array(data = [True False True False],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'fit_intercept': True, 'normalize': True},
  {'fit_intercept': True, 'normalize': False},
  {'fit_intercept': False, 'normalize': True},
  {'fit_intercept': False, 'normalize': False}),
 'rank_test_score': array([2, 1, 3, 3]),
 'split0_test_score': array([-0.23667401, -0.23667401, -0.27360061, -0.27360061]),
 'split0_train_score': array([-0.23024042, -0.23024042, -0.25402663, -0.25

In [33]:
optimized_list = []
optimized = {}

#take the best CV result
for i in range(1,51):
    best = np.flatnonzero(result['rank_test_score'] == i)
    for b in best:
        rmse = np.sqrt(abs(result['mean_test_score'][b]))
        optimized = {'rank': i, 'rmse': rmse,
                     'fit_intercept': result['param_fit_intercept'].data[b],
                     'normalize': result['param_normalize'].data[b]}        
        optimized_list.append(optimized)
#optimized

In [34]:
df = pd.DataFrame(columns=['rank','rmse','fit_intercept','normalize'])
df = df.append(optimized_list)
df.to_csv('optimisation_linear_regression.csv')
df

Unnamed: 0,rank,rmse,fit_intercept,normalize
0,1.0,0.488281,True,False
1,2.0,0.488281,True,True
2,3.0,0.518439,False,True
3,3.0,0.518439,False,False


In [35]:
optimum_param = df.iloc[0][2:]
optimum_param = optimum_param.T
optimum_param

fit_intercept     True
normalize        False
Name: 0, dtype: object

In [36]:
model = LinearRegression(fit_intercept=optimum_param['fit_intercept'],
                         normalize=optimum_param['normalize'],
                         n_jobs=-1)
model.fit(x_train,y_train)
y_pred= model.predict(x_test)
y_pred = np.array([max(min(y,3.0),1.0) for y in y_pred])
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission_linear_regression.csv',index=False)

In [37]:
y_pred[0]

1.8166235704070193

In [38]:
k_fold = KFold(10, random_state=1)
linear_regression_scores = cross_val_score(model, x_train, y_train, cv=k_fold, scoring=metric_scorer, n_jobs=1)

# In[6]:

# CV results
cv_results_df = pd.DataFrame(score)
cv_results_df.columns = ['expl_var','mae','mse','rmse','r2']
cv_results_df = cv_results_df.T
cv_results_df.columns = ['F- '+str(i+1) for i in cv_results_df.columns]
means = []
stds = []
for i in range(0,5):
    means.append(np.mean(cv_results_df.iloc[i].values))
    stds.append(np.std(cv_results_df.iloc[i].values))
cv_results_df['mean'] = means
cv_results_df['std'] = stds
cv_results_df.to_csv('CV_linear_regression.csv')
cv_results_df

Unnamed: 0,F- 1,F- 2,F- 3,F- 4,F- 5,F- 6,F- 7,F- 8,F- 9,F- 10,mean,std
expl_var,0.195863,0.190708,0.191922,0.200461,0.179216,0.211297,0.185901,0.177856,0.132457,0.13278,0.179846,0.02537
mae,0.405136,0.394543,0.389819,0.388601,0.386008,0.381617,0.388017,0.386223,0.38852,0.402329,0.391081,0.007061
mse,0.239696,0.232733,0.2266,0.226255,0.223883,0.218728,0.224901,0.23754,0.24122,0.25824,0.23298,0.010995
rmse,0.489587,0.482424,0.476025,0.475663,0.473163,0.467684,0.474237,0.487381,0.491141,0.508173,0.482548,0.011276
r2,0.171483,0.177703,0.182685,0.195124,0.173685,0.20893,0.181632,0.153528,0.079453,0.074363,0.159859,0.043711
