In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the competition tables. The product descriptions are read with pandas
# defaults; train/test are indexed by their 'id' column and decoded as ISO-8859-1.
products_description = pd.read_csv('./data/product_descriptions.csv')

train = pd.read_csv(
    './data/train.csv',
    encoding="ISO-8859-1",
    index_col='id',
)
test = pd.read_csv(
    './data/test.csv',
    encoding="ISO-8859-1",
    index_col='id',
)

In [3]:
# Attach each product's description text to the train and test rows via a
# left join on product_uid, so rows without a matching description are kept.
# NOTE: this cell rebinds train/test; re-running it without reloading the data
# merges the description column a second time.
train = train.merge(products_description, on='product_uid', how='left')
test = test.merge(products_description, on='product_uid', how='left')

In [4]:
# Preview the first rows of the merged training frame.
train.head()

Unnamed: 0,product_uid,product_title,search_term,relevance,product_description
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...


In [5]:
# Preview the first rows of the merged test frame (it has no relevance column).
test.head()

Unnamed: 0,product_uid,product_title,search_term,product_description
0,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket,"Not only do angles make joints stronger, they ..."
1,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets,"Not only do angles make joints stronger, they ..."
2,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able,"Not only do angles make joints stronger, they ..."
3,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties,"Not only do angles make joints stronger, they ..."
4,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668,"Not only do angles make joints stronger, they ..."


In [None]:
# Regression target: the relevance score of every training row.
y = train['relevance']

In [None]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old location only when running against a pre-0.18 installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Hold out 30% of the labelled rows for local validation; the fixed seed keeps
# the split reproducible.
# NOTE(review): `train` still contains the `relevance` column at this point --
# confirm FeatureTransformer does not read it.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    random_state=44,
    test_size=0.3,
)

In [None]:
# Sanity-check the split sizes. Written as a single formatted string so the
# cell runs under both Python 2 and Python 3 and prints the same text.
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveRegressor, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb

In [None]:
# Execute the project's feature script so the names it defines become
# available in this notebook.
# NOTE(review): FeatureTransformer, used when the pipeline is built, is
# presumably defined in that script -- confirm.
%run scripts/features.py

In [None]:
# Pipeline: FeatureTransformer output -> min-max scaling -> linear regressor.
ft = FeatureTransformer()
scaler = MinMaxScaler()
# PassiveAggressiveRegressor shuffles the training data by default, so pin
# random_state (same seed as the train/validation split) to make the fit and
# the reported RMSE reproducible from run to run.
est = PassiveAggressiveRegressor(C=0.01, random_state=44)
# Alternative estimators (disabled), kept for reference:
# est = SGDRegressor(penalty='l1')
# reg = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
# est = BaggingRegressor(reg, n_estimators=45, max_samples=0.1, random_state=25)
# est = xgb.XGBRegressor()

online_model_pipe = Pipeline([('ft', ft), ('scaler', scaler), ('est', est)])

# Fit on the training split only; the hold-out split is used for evaluation.
online_model_pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Score both splits with the fitted pipeline (training split first, then the
# hold-out split) so the two errors can be compared.
predsTrain, predsTest = (
    online_model_pipe.predict(split) for split in (X_train, X_test)
)

In [None]:
# Report RMSE on the training split and on the local hold-out split ("test set"
# here is the hold-out split, not the competition test file). The parenthesised
# form runs under both Python 2 and Python 3 and prints the same text.
print('RMSE on the training set %f ' % np.sqrt(mean_squared_error(y_train, predsTrain)))
print('RMSE on the test set %f ' % np.sqrt(mean_squared_error(y_test, predsTest)))

In [None]:
# Refit the same pipeline on the entire labelled training set (hold-out rows
# included) before predicting on the competition test data.
online_model_pipe.fit(train, y)

In [None]:
# Predict relevance for the competition test rows.
predictions = online_model_pipe.predict(test)

In [None]:
def scale_predictions(prediction, lower=1.0, upper=3.0):
    """Clip a single prediction to the closed interval [lower, upper].

    Despite the name, no rescaling happens: values above ``upper`` become
    ``upper``, values below ``lower`` become ``lower``, and everything else
    is returned unchanged.

    Parameters
    ----------
    prediction : number
        The raw predicted value.
    lower : number, optional
        Smallest allowed value (default 1.0).
    upper : number, optional
        Largest allowed value (default 3.0).

    Returns
    -------
    number
        ``upper``, ``lower`` or the original ``prediction`` object.

    Raises
    ------
    ValueError
        If ``lower`` is greater than ``upper``.
    """
    if lower > upper:
        raise ValueError('lower bound %r exceeds upper bound %r' % (lower, upper))
    # Explicit comparisons (rather than min/max) so that NaN, which fails both
    # tests, is passed through untouched exactly as before.
    if prediction > upper:
        return upper
    if prediction < lower:
        return lower
    return prediction

In [None]:
# Clip every prediction with scale_predictions. Materialise the result as a
# list: on Python 3 a bare map() is a one-shot iterator, which would be
# exhausted by the histogram cell and leave nothing for the submission cell.
predictions = list(map(scale_predictions, predictions))

In [None]:
# Histogram of the clipped predictions.
pd.Series(predictions).plot(kind='hist')

In [None]:
# Build the submission: start from the sample submission's rows and set the
# relevance column to the predictions, then write the file without the index.
# NOTE(review): values are matched to rows purely by order -- assumes the
# sample submission lists the same rows, in the same order, as the test file.
submission = pd.read_csv('./data/sample_submission.csv').assign(relevance=predictions)
submission.to_csv('./submissions/fourth.csv', index=False)