In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/scikit-learn,
# so upcoming API removals will go unnoticed — consider narrowing the filter.
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Read the train and test tables with identical options: `id` becomes the row
# index and the files are decoded as ISO-8859-1.
train, test = (
    pd.read_csv(csv_path, index_col='id', encoding="ISO-8859-1")
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# The product descriptions are read with pandas' default options.
products_description = pd.read_csv('./data/product_descriptions.csv')

In [None]:
# Attach the product description to every train/test row with a left join on
# `product_uid`, so rows without a matching description are kept.
train = train.merge(products_description, how='left', on='product_uid')
test = test.merge(products_description, how='left', on='product_uid')

In [None]:
# Load the product attributes table. The following cell reads its `name`,
# `value` and `product_uid` columns.
attributes = pd.read_csv('./data/attributes.csv')

In [None]:
# Keep only the "MFG Brand Name" attribute rows, reduced to the product key and
# the attribute value; the value column is renamed to `brand`.
is_brand_row = attributes["name"] == "MFG Brand Name"
attributes = attributes.loc[is_brand_row, ["product_uid", "value"]].rename(columns={"value": "brand"})

In [None]:
# Left-join the brand onto every train/test row; products without a brand row
# get NaN in `brand`.
train_with_attributes = pd.merge(train, attributes, how='left', on='product_uid')
test_with_attributes = pd.merge(test, attributes, how='left', on='product_uid')

# A product with more than one brand row would duplicate its train/test rows:
# training would silently over-weight those rows and the test predictions
# would no longer line up with the submission rows. Fail loudly here instead.
assert len(train_with_attributes) == len(train), 'brand merge duplicated train rows'
assert len(test_with_attributes) == len(test), 'brand merge duplicated test rows'

In [None]:
# Replace every missing value, in any column, with the literal 'Unknown'.
train_with_attributes, test_with_attributes = (
    frame.fillna('Unknown')
    for frame in (train_with_attributes, test_with_attributes)
)

In [None]:
# Target column the model is trained to predict.
y = train_with_attributes['relevance']

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the frame passed as features still contains the `relevance`
# column that `y` was taken from — presumably the feature scripts select their
# own columns; confirm the target does not leak into the model inputs.
X_train, X_test, y_train, y_test = train_test_split(train_with_attributes, y, test_size=0.3, random_state=44)

In [None]:
# Sanity-check the split sizes. Written as one formatted string so the cell
# prints the same text on Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))

In [None]:
# Run the project's helper scripts in this kernel.
# NOTE(review): `build_random_forest_model` and `eval_model`, used in later
# cells, are not defined anywhere in this notebook — presumably these scripts
# provide them; confirm which script defines what.
%run scripts/features.py
%run scripts/models.py
%run scripts/eval.py

In [None]:
# for rmse
from sklearn.metrics import mean_squared_error

In [None]:
# Prepare a model.
# NOTE(review): `build_random_forest_model` is not defined in this notebook —
# presumably it comes from scripts/models.py; confirm what it returns.
rf_model = build_random_forest_model()

In [None]:
# Cross-validate on the first 5000 rows of the training split only.
cv_rows = slice(5000)
mean_relevance, std_relevance = eval_model([rf_model], X_train[cv_rows], y_train[cv_rows])

In [None]:
# Report the cross-validation summary. The parenthesised single-string form
# prints identically on Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
print('Mean score %f and standard deviation %f ' % (mean_relevance, std_relevance))

In [None]:
# Predict relevance for the first 5000 rows of each split.
# NOTE(review): this assumes `rf_model` is already fitted at this point —
# presumably by `eval_model`; confirm.
head_rows = slice(5000)
predsTrain = rf_model.predict(X_train[head_rows])
predsTest = rf_model.predict(X_test[head_rows])

In [None]:
# Report RMSE on the same 5000-row subsets that were predicted above. The
# parenthesised single-string form prints identically on Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
rmse_train = np.sqrt(mean_squared_error(y_train[:5000], predsTrain))
print('RMSE on the training set %f ' % (rmse_train,))
rmse_test = np.sqrt(mean_squared_error(y_test[:5000], predsTest))
print('RMSE on the test set %f ' % (rmse_test,))

In [None]:
# Scratch arithmetic: the midpoint of 2.66 and 3.0.
# NOTE(review): no other cell uses this value — presumably a leftover
# calculation; confirm before removing.
(2.66 + 3.0) / 2.

In [None]:
# Fit on the entire training dataset.
# The estimator fitted here must be the one used for the test predictions in
# the next cell, i.e. `rf_model`; `online_model_pipe` is never defined in this
# notebook and its fit would not affect those predictions.
# NOTE(review): assumes `rf_model` exposes a scikit-learn style `fit(X, y)`.
rf_model.fit(train_with_attributes, y)

In [None]:
# Predict a relevance score for every test row.
# NOTE(review): make sure `rf_model` has been fitted on the full training
# data before this cell runs.
predictions = rf_model.predict(test_with_attributes)

In [None]:
def scale_predictions(prediction, lower=1.0, upper=3.0):
    """Clamp a single predicted score to the closed range ``[lower, upper]``.

    Parameters
    ----------
    prediction : number
        The raw model output to clamp.
    lower : number, optional
        Smallest value allowed; defaults to 1.0.
    upper : number, optional
        Largest value allowed; defaults to 3.0.

    Returns
    -------
    number
        ``upper`` if ``prediction`` is above it, ``lower`` if ``prediction``
        is below it, otherwise ``prediction`` unchanged. A NaN compares false
        against both bounds and is therefore returned unchanged.

    Raises
    ------
    ValueError
        If ``lower`` is greater than ``upper``.
    """
    if lower > upper:
        raise ValueError('lower bound %r exceeds upper bound %r' % (lower, upper))

    if prediction > upper:
        return upper
    if prediction < lower:
        return lower
    return prediction

In [None]:
# Clamp every prediction. A list is built on purpose: on Python 3 `map`
# returns a one-shot iterator, which the histogram cell would exhaust and
# leave nothing for the submission cell. On Python 2 the result is unchanged.
predictions = [scale_predictions(value) for value in predictions]

In [None]:
# Histogram of the final predictions.
prediction_series = pd.Series(predictions)
prediction_series.plot(kind='hist')

In [None]:
# Prepare the submission file: reuse the sample submission's layout, overwrite
# its `relevance` column with the predictions (matched by row order), and
# write the result without the index column.
submission = pd.read_csv('./data/sample_submission.csv').assign(relevance=predictions)
submission.to_csv('./submissions/thirteenth.csv', index=False)