In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
# Suppress every warning category for the rest of the session; note that this
# also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Read the competition CSVs. Train and test share the same parsing options:
# rows are indexed by the 'id' column and text is decoded as ISO-8859-1.
csv_options = dict(index_col='id', encoding="ISO-8859-1")
train = pd.read_csv('./data/train.csv', **csv_options)
test = pd.read_csv('./data/test.csv', **csv_options)

# Product descriptions are read with the pandas defaults.
products_description = pd.read_csv('./data/product_descriptions.csv')

In [3]:
# Attach the product description to every row of both splits. This is a left
# join on product_uid, so rows without a matching description are kept.
# Note: the inputs are overwritten in place, so running this cell twice merges
# the description table a second time.

train = train.merge(products_description, how='left', on='product_uid')
test = test.merge(products_description, how='left', on='product_uid')

In [4]:
# Read the raw product attribute table.
attributes_path = './data/attributes.csv'
attributes = pd.read_csv(attributes_path)

In [5]:
# Keep only the rows whose attribute name is "MFG Brand Name", reduce them to
# (product_uid, value) and expose the value under the column name `brand`.
is_brand_row = attributes['name'] == "MFG Brand Name"
brand_rows = attributes[is_brand_row]
attributes = brand_rows[["product_uid", "value"]].rename(columns={"value": "brand"})

In [6]:
# Left-join the brand column onto both splits; products without a brand row
# end up with a missing value in `brand`.
train_with_attributes = train.merge(attributes, how='left', on='product_uid')
test_with_attributes = test.merge(attributes, how='left', on='product_uid')

In [7]:
# Replace every missing value (in any column) with the literal string 'Unknown'.
missing_label = 'Unknown'
train_with_attributes = train_with_attributes.fillna(missing_label)
test_with_attributes = test_with_attributes.fillna(missing_label)

In [8]:
# Response variable: the relevance column of the merged training frame.
# NOTE(review): `relevance` also remains inside train_with_attributes, which is
# later handed to the model as the feature frame - confirm the feature pipeline
# does not use it as an input.
y = train_with_attributes['relevance']

In [9]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was deprecated at that point and removed in 0.20.
# Prefer the current location and fall back to the old module on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [10]:
# Hold out 30% of the labelled rows for validation; random_state pins the shuffle.
split_parts = train_test_split(
    train_with_attributes,
    y,
    test_size=0.3,
    random_state=44
)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Show the shapes of the four splits. Formatting them into one string first keeps
# the output identical under the Python 2 print statement and the Python 3 print
# function (a bare `print a, b` is a syntax error on Python 3).
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))

(51846, 6) (22221, 6) (51846L,) (22221L,)


In [19]:
# import external scripts
# %run executes each script and loads the names it defines into the notebook
# namespace. Presumably eval_model (and possibly rf_model) come from these
# scripts - TODO confirm.
# NOTE(review): this cell's execution count (19) is higher than that of the cells
# that follow it, i.e. it was last run out of order; verify the notebook with
# Restart Kernel -> Run All.
%run scripts/features.py
%run scripts/models.py
%run scripts/eval.py

In [13]:
# for rmse
from sklearn.metrics import mean_squared_error

In [14]:
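# prepare a model
# NOTE(review): the construction below is commented out, yet rf_model is used by the
# following cells. Presumably it is defined by scripts/models.py or is left over from
# an earlier kernel session - confirm, and restore this line if it is not.
# rf_model = build_random_forest_model()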

In [15]:
# Cross validation on the first 5000 rows of the training split only.
cv_features = X_train[:5000]
cv_target = y_train[:5000]
mean_relevance, std_relevance = eval_model([rf_model], cv_features, cv_target)

score: 0.488984
combined score: 0.488984
score: 0.493464
combined score: 0.493464
score: 0.494445
combined score: 0.494445


In [16]:
# Report the cross-validation mean and spread. The parenthesised single-argument
# form prints the same text on Python 2 and is valid syntax on Python 3.
print('Mean score %f and standard deviation %f ' % (mean_relevance, std_relevance))

Mean score 0.492298 and standard deviation 0.002377 


In [17]:
# Score the first 5000 rows of the training split and of the held-out split.
train_subset = X_train[:5000]
holdout_subset = X_test[:5000]
predsTrain = rf_model.predict(train_subset)
predsTest = rf_model.predict(holdout_subset)

In [18]:
# RMSE = sqrt(mean squared error) on the same 5000-row subsets that were scored above.
# The parenthesised single-argument form prints the same text on Python 2 and is
# valid syntax on Python 3.
print('RMSE on the training set %f ' % (np.sqrt(mean_squared_error(y_train[:5000], predsTrain))))
print('RMSE on the test set %f ' % (np.sqrt(mean_squared_error(y_test[:5000], predsTest))))

RMSE on the training set 0.435891 
RMSE on the test set 0.483844 


In [None]:
# fit on the entire training dataset
# NOTE(review): train_with_attributes still contains the `relevance` column that y
# was taken from - confirm the model's feature pipeline does not use it as an input.
rf_model.fit(train_with_attributes, y)

In [None]:
# Predict a relevance score for every row of the merged test frame.
predictions = rf_model.predict(test_with_attributes)

In [None]:
def scale_predictions(prediction, lower=1.0, upper=3.0):
    """Clamp a single predicted score to the closed range [lower, upper].

    Parameters
    ----------
    prediction : number
        Raw model output for one row.
    lower, upper : float, optional
        Bounds of the accepted range. They default to 1.0 and 3.0, the limits
        this function previously hard-coded, so one-argument calls (including
        ``map(scale_predictions, values)``) behave exactly as before.

    Returns
    -------
    ``upper`` if ``prediction`` is greater than it, ``lower`` if ``prediction``
    is smaller than it, otherwise ``prediction`` itself. NaN compares false
    against both bounds and is therefore returned unchanged.
    """
    if prediction > upper:
        return upper
    if prediction < lower:
        return lower
    return prediction

In [None]:
# Pass every prediction through scale_predictions. A list comprehension (rather than
# map) produces a real list on both Python 2 and Python 3; on Python 3 map returns a
# one-shot iterator, which would be exhausted by the histogram before the submission
# column is filled.
predictions = [scale_predictions(p) for p in predictions]

In [None]:
# Histogram of the final predictions.
prediction_series = pd.Series(predictions)
prediction_series.plot(kind='hist')

In [None]:
# Build the submission file: start from the sample submission, set its relevance
# column to the predictions (assigned by row position), and write it out without
# the DataFrame index.
sample_path = './data/sample_submission.csv'
output_path = './submissions/thirteenth.csv'

submission = pd.read_csv(sample_path)
submission['relevance'] = predictions
submission.to_csv(output_path, index=False)