**Objectives**

* Create a model based on initial data exploration.
* Find ways to extend this model to capture interactions in the data.

In [23]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split

sns.set_style('whitegrid')
sns.set_context('poster')

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/See_Click_Predict/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(0)

In [3]:
# load files
# Read the raw CSVs from <basepath>/data/raw; `created_time` is parsed into
# datetimes for both the training and the test frame.
train = pd.read_csv(os.path.join(basepath, 'data/raw/train.csv'), parse_dates=['created_time'])
test = pd.read_csv(os.path.join(basepath,  'data/raw/test.csv'), parse_dates=['created_time'])
# Submission template; its num_views / num_votes / num_comments columns are
# overwritten with model predictions before it is saved.
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [11]:
class Data:
    """Apply identical preprocessing to the train and test frames.

    The two frames are stacked row-wise into ``self.data`` (train rows first),
    transformed together, and split apart again with ``get_train_test``.

    Every transforming method works on ``self.data`` and returns it. The
    combined frame is built on first use, so calling ``concat_data`` up front
    is optional; calling it again rebuilds the frame from the untouched
    ``train`` / ``test`` inputs and discards earlier transformations.
    """

    def __init__(self, train, test):
        """Store the input frames; the combined frame is created lazily.

        Parameters
        ----------
        train, test : pandas.DataFrame
            Frames to be stacked. ``get_train_test`` relies on ``num_votes``
            being non-null for training rows only.
        """
        self.train = train
        self.test = test
        # Populated by concat_data(); None until the first method needs it.
        self.data = None

    def _ensure_data(self):
        """Return the combined frame, building it if it does not exist yet."""
        if self.data is None:
            self.concat_data()
        return self.data

    def concat_data(self):
        """(Re)build ``self.data`` by stacking train on top of test and return it."""
        self.data = pd.concat((self.train, self.test), axis=0)
        return self.data

    def round_location(self):
        """Round ``latitude`` and ``longitude`` to whole numbers in place."""
        data = self._ensure_data()
        # Vectorised rounding; same result as mapping np.round over each value.
        data['latitude'] = data['latitude'].round()
        data['longitude'] = data['longitude'].round()

        return data

    def fill_missing_values(self, feature, value):
        """Replace missing entries of column ``feature`` with ``value``."""
        data = self._ensure_data()
        data[feature] = data[feature].fillna(value)

        return data

    def encode_categorical_variable(self, feature):
        """Replace column ``feature`` with integer codes from a ``LabelEncoder``.

        The encoder is fitted on the combined frame, so train and test rows
        share one code book.
        """
        data = self._ensure_data()
        data[feature] = LabelEncoder().fit_transform(data[feature])

        return data

    def get_train_test(self):
        """Split the combined frame back into ``(train, test)``.

        Rows whose ``num_votes`` is non-null are returned as train, all other
        rows as test.
        """
        data = self._ensure_data()
        mask = data.num_votes.notnull()

        train = data.loc[mask]
        test = data.loc[~mask]

        return train, test

    def one_hot_encode(self, feature):
        """Append one indicator column per distinct value of ``feature``.

        The new columns are named after the values themselves (the
        ``pd.get_dummies`` default); the original column is kept.
        """
        data = self._ensure_data()
        dummies = pd.get_dummies(data[feature])
        self.data = pd.concat((data, dummies), axis=1)

        return self.data

In [87]:
# Stack train and test into one frame so both receive identical preprocessing.
d = Data(train, test)
d.concat_data()
# Round latitude / longitude to whole numbers (coarse location buckets).
d.round_location()
# Missing source / tag_type values become an explicit 'not_known' category.
d.fill_missing_values('source', 'not_known')
d.fill_missing_values('tag_type', 'not_known')
# Integer-encode all four model features, including the rounded coordinates.
d.encode_categorical_variable('source')
d.encode_categorical_variable('tag_type')
d.encode_categorical_variable('latitude')
# Assigning to `_` keeps the returned DataFrame from being echoed as cell output.
_ = d.encode_categorical_variable('longitude')

In [88]:
# Split the processed frame back apart: rows with a non-null num_votes come
# back as train_, all remaining rows as test_.
train_, test_ = d.get_train_test()

In [89]:
# Sanity check: the split must give back exactly as many rows as were loaded.
assert len(train) == len(train_)
assert len(test) == len(test_)

**Model**

In [96]:
# The four label-encoded columns used as model inputs.
features = ['source', 'longitude', 'latitude', 'tag_type']

# Design matrix and one target series per quantity to predict.
X = train_[features]
y_votes = train_.num_votes
y_views = train_.num_views
y_comments = train_.num_comments

# Same feature columns for the rows that need predictions.
Xtest = test_[features]

In [97]:
# Split row positions (not labels) 80/20 with a fixed seed, so the same
# positional indices can slice the features and all three targets consistently.
itrain, itest = train_test_split(range(len(train_)), test_size=0.2, random_state=1)

# Note: X_test / y_test_* are the local hold-out set; `Xtest` (no underscore)
# holds the rows that need predictions for the submission.
X_train = X.iloc[itrain]
X_test = X.iloc[itest]

y_train_votes = y_votes.iloc[itrain]
y_test_votes =  y_votes.iloc[itest]

y_train_views = y_views.iloc[itrain]
y_test_views = y_views.iloc[itest]

y_train_comments = y_comments.iloc[itrain]
y_test_comments = y_comments.iloc[itest]

In [98]:
# One random forest per target: est_1 -> votes, est_2 -> views, est_3 -> comments.
# Only the views model (est_2) limits tree depth (max_depth=15).
# NOTE(review): no random_state is passed, so the fitted forests presumably
# depend on the global NumPy seed and on cell execution order; consider
# setting random_state explicitly.
est_1 = RandomForestRegressor(n_estimators=50, n_jobs=2)
est_2 = RandomForestRegressor(n_estimators=50, max_depth=15, n_jobs=2)
est_3 = RandomForestRegressor(n_estimators=50, n_jobs=2)

# Fit each model on the 80% training portion of its own target.
est_1.fit(X_train, y_train_votes)
est_2.fit(X_train, y_train_views)
est_3.fit(X_train, y_train_comments)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=2, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [99]:
# Predict each target on the hold-out rows with the model fitted for that target.
y_preds_votes = est_1.predict(X_test)
y_preds_views = est_2.predict(X_test)
y_preds_comments = est_3.predict(X_test)

In [100]:
def rmsle(ytvotes, ytviews, ytcomments, ypvotes, ypviews, ypcomments):
    """Root mean squared logarithmic error pooled over the three targets.

    Computes ``sqrt( sum_t sum_i (log(1 + p_ti) - log(1 + a_ti))**2 / (3 * n) )``
    where ``t`` ranges over votes, views and comments and ``n`` is the number
    of samples.

    Parameters
    ----------
    ytvotes, ytviews, ytcomments : array-like of shape (n,)
        Actual values for each target.
    ypvotes, ypviews, ypcomments : array-like of shape (n,)
        Predicted values for each target, in the same order as the actuals.

    Returns
    -------
    numpy.float64
        The pooled RMSLE.

    Raises
    ------
    ValueError
        If no samples are supplied.

    Notes
    -----
    Inputs are converted to float arrays, so plain lists are accepted and
    actual/predicted values are always paired by position.
    """
    n_samples = len(ytvotes)
    if n_samples == 0:
        raise ValueError('rmsle() requires at least one sample')

    target_pairs = (
        (ytvotes, ypvotes),
        (ytviews, ypviews),
        (ytcomments, ypcomments),
    )

    total = 0.0
    for actual, predicted in target_pairs:
        actual = np.asarray(actual, dtype=float)
        predicted = np.asarray(predicted, dtype=float)
        # log1p(x) == log(1 + x), but stays accurate for small x.
        total += np.sum((np.log1p(actual) - np.log1p(predicted)) ** 2)

    # Float denominator: `1 / (3 * n)` would truncate to 0 under Python 2.
    return np.sqrt(total / (3.0 * n_samples))


In [101]:
# Pooled RMSLE of the three hold-out predictions against their actual values.
print('RMSLE on the test set: %f' % rmsle(
    y_test_votes.values, y_test_views.values, y_test_comments.values,
    y_preds_votes, y_preds_views, y_preds_comments
))

RMSLE on the test set: 0.527489


In [55]:
# fit on the whole set
est_1.fit(X, y_votes)
est_2.fit(X, y_views)
est_3.fit(X, y_comments)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [74]:
# est_1 is fitted on num_votes and est_2 on num_views, so each prediction must be
# stored under the name of the target its model was trained for. Pairing est_1
# with views and est_2 with votes would swap the two columns in the submission.
ypred_votes = est_1.predict(Xtest)
ypred_views = est_2.predict(Xtest)
ypred_comments = est_3.predict(Xtest)

In [75]:
# Overwrite the template's target columns with the model predictions.
# The arrays are assigned by position; this assumes sample_sub lists its rows
# in the same order as the test rows. TODO confirm.
sample_sub['num_views'] = ypred_views
sample_sub['num_votes'] = ypred_votes
sample_sub['num_comments'] = ypred_comments

In [77]:
# Save the submission under <basepath>/submissions without the index column.
# NOTE(review): presumably the `submissions` directory must already exist.
sample_sub.to_csv(os.path.join(basepath, 'submissions/benchmark_model.csv'), index=False)