In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Train / test tables: both files are Latin-1 encoded and `id` is used as the row index.
train = pd.read_csv(
    './data/train.csv',
    encoding="ISO-8859-1",
    index_col='id',
)
test = pd.read_csv(
    './data/test.csv',
    encoding="ISO-8859-1",
    index_col='id',
)

# Product descriptions, read with the default encoding and default integer index.
products_description = pd.read_csv('./data/product_descriptions.csv')

In [3]:
# Attach the product description to every train/test row. The left join keeps all rows of
# the left-hand frame, matched on `product_uid`.
# NOTE: merging on a column produces a fresh 0..n-1 index, so the `id` index set at load
# time is not carried over into the merged frames.
train = train.merge(products_description, on='product_uid', how='left')
test = test.merge(products_description, on='product_uid', how='left')

In [4]:
# load product attributes
# The filtering step below reads the `name`, `product_uid` and `value` columns of this table.

attributes = pd.read_csv('./data/attributes.csv')

In [5]:
# Keep only the "MFG Brand Name" attribute rows, reduced to (product_uid, brand).
# NOTE: this rebinds `attributes`, so the cell cannot be re-run without reloading the CSV
# first (the filtered frame no longer has a `name` column).
attributes = (
    attributes
    .loc[attributes['name'] == "MFG Brand Name", ["product_uid", "value"]]
    .rename(columns={"value": "brand"})
)

In [6]:
# Attach the brand to every train/test row; rows whose product has no brand entry get NaN.
train_with_attributes = pd.merge(train, attributes, how='left', on='product_uid')
test_with_attributes = pd.merge(test, attributes, how='left', on='product_uid')

# A product with more than one brand row would duplicate train/test rows in a left join and
# shift every later row relative to the submission ids, so fail fast if the counts changed.
assert len(train_with_attributes) == len(train), 'brand merge duplicated training rows'
assert len(test_with_attributes) == len(test), 'brand merge duplicated test rows'

In [7]:
# Replace every missing cell (in any column, not only `brand`) with the placeholder string.
train_with_attributes = train_with_attributes.fillna(value='Unknown')
test_with_attributes = test_with_attributes.fillna(value='Unknown')

In [8]:
# Preview the first rows of the merged training frame (includes the `relevance` target).
train_with_attributes.head()

Unnamed: 0,product_uid,product_title,search_term,relevance,product_description,brand
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...,BEHR Premium Textured DeckOver
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,Delta
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,Delta


In [9]:
# Preview the first rows of the merged test frame.
test_with_attributes.head()

Unnamed: 0,product_uid,product_title,search_term,product_description,brand
0,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
1,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
2,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
3,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie
4,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668,"Not only do angles make joints stronger, they ...",Simpson Strong-Tie


In [10]:
# Regression target: the `relevance` column of the merged training frame.
y = train_with_attributes['relevance']

In [11]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer the `model_selection` module and fall back only on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [12]:
# Hold out 30% of the labelled rows; the fixed seed keeps the split reproducible.
# NOTE(review): the feature frame still contains the `relevance` column; confirm that the
# feature scripts never read it.
X_train, X_test, y_train, y_test = train_test_split(
    train_with_attributes,
    y,
    test_size=0.3,
    random_state=44,
)

In [13]:
# Sanity-check the split sizes. Formatting into one string and calling print(...) gives the
# same output on Python 2 and also runs on Python 3.
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))

(51846, 6) (22221, 6) (51846L,) (22221L,)


In [14]:
# import external scripts
# NOTE(review): `build_random_forest_model` and `eval_model` are called below but are not
# defined in this notebook; presumably they come from these scripts. Confirm.
%run scripts/features.py
%run scripts/models.py
%run scripts/eval.py

In [15]:
# for rmse
from sklearn.metrics import mean_squared_error

In [16]:
# Build the estimator through the project helper (not defined in this notebook;
# presumably provided by one of the scripts run above. TODO confirm).
rf_model = build_random_forest_model()

In [19]:
# Cross-validate on the first 5000 rows of the training split only.
# eval_model is a project helper; its result is unpacked as a (mean, std) pair of scores.
mean_relevance, std_relevance = eval_model(
    [rf_model],
    X_train.iloc[:5000],
    y_train.iloc[:5000],
)

score: 0.513388
combined score: 0.513388
score: 0.496003
combined score: 0.496003
score: 0.498647
combined score: 0.498647
score: 0.485155
combined score: 0.485155
score: 0.498571
combined score: 0.498571


In [18]:
# Summarise the cross-validation run. print(...) with one pre-formatted string gives the
# same output on Python 2 and also runs on Python 3.
print('Mean score %f and standard deviation %f ' % (mean_relevance, std_relevance))

Mean score 0.490464 and standard deviation 0.012764 


In [None]:
# Fit on the training split first, so the scores computed from these predictions describe
# a model trained on exactly X_train rather than whatever state eval_model left behind.
rf_model.fit(X_train, y_train)

predsTrain = rf_model.predict(X_train)
predsTest = rf_model.predict(X_test)

In [None]:
# Root-mean-squared error on both sides of the hold-out split.
# print(...) with one pre-formatted string gives the same output on Python 2 and also
# runs on Python 3.
print('RMSE on the training set %f ' % (np.sqrt(mean_squared_error(y_train, predsTrain))))
print('RMSE on the test set %f ' % (np.sqrt(mean_squared_error(y_test, predsTest))))

In [None]:
# fit on the entire training dataset
# rf_model is the estimator that produces the test-set predictions, so it is the one that
# has to be refit on all labelled rows here.
rf_model.fit(train_with_attributes, y)

In [None]:
# Predict on the merged test frame with the random-forest model.
predictions = rf_model.predict(test_with_attributes)

In [None]:
def scale_predictions(prediction, lower=1.0, upper=3.0):
    """Clamp a single predicted score into the closed interval [lower, upper].

    Parameters
    ----------
    prediction : number
        The raw model output to clamp.
    lower : number, optional
        Smallest value that may be returned (default 1.0).
    upper : number, optional
        Largest value that may be returned (default 3.0).

    Returns
    -------
    number
        `upper` if `prediction` exceeds it, `lower` if `prediction` is below it,
        otherwise `prediction` unchanged. NaN compares false against both bounds
        and is therefore returned as-is.
    """
    if prediction > upper:
        return upper
    if prediction < lower:
        return lower
    return prediction

In [None]:
# Clamp all predictions into [1.0, 3.0] in one vectorised call. np.clip returns a reusable
# array, so the result can feed both the histogram and the submission file (a lazy map()
# iterator could only be consumed once).
predictions = np.clip(predictions, 1.0, 3.0)

In [None]:
# Histogram of the clamped predictions, as a quick look at their distribution.
pd.Series(predictions).plot.hist()

In [None]:
# Build the submission: start from the sample file and overwrite its `relevance` column
# with the predictions (matched by row order), then write it out without the index.
# NOTE(review): presumably the sample file's rows are in the same order as test.csv; confirm.
submission = pd.read_csv('./data/sample_submission.csv').assign(relevance=predictions)
submission.to_csv('./submissions/thirteenth.csv', index=False)