In [6]:
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn import preprocessing

import pandas as pd


In [5]:
class Trainer(object):
    """Tune a random-forest regressor with a randomized hyper-parameter search.

    Parameters
    ----------
    features : array-like
        Training inputs; handed unchanged to the search's ``fit``.
    pred : array-like
        Training targets; handed unchanged to the search's ``fit``.
    """

    def __init__(self, features, pred):
        self.features = features
        self.pred = pred

    def train_rf(self, n_estimators=200, param_grid=None, folders=5,
                 score='mean_absolute_error', n_iter=10, random_state=None):
        """Run a randomized search over random-forest settings and return the winner.

        Parameters
        ----------
        n_estimators : int
            Number of trees in every candidate forest.
        param_grid : dict or None
            Parameter name -> list of candidate values. When falsy (``None``
            or empty) a built-in grid over ``max_features``,
            ``min_samples_split`` and ``min_samples_leaf`` is used.
        folders : int
            Passed to the search as ``cv`` (number of cross-validation folds).
        score : str
            Passed to the search as ``scoring``.
        n_iter : int
            Number of parameter settings sampled by the search.
        random_state : int or None
            Seed forwarded to both the forest and the search so that a run
            can be reproduced; ``None`` keeps the unseeded behaviour.

        Returns
        -------
        The ``best_estimator_`` of the fitted search.
        """
        # If customized parameters are not passed, create standard ones
        if not param_grid:
            param_grid = {
                'max_features': ['auto', 'sqrt', 'log2'],
                'min_samples_split': [2, 4, 6, 8],
                'min_samples_leaf': [1, 2, 4],
            }

        # Base estimator for the search; any parameter that also appears in
        # param_grid is overridden by the values the search samples.
        forest = RandomForestRegressor(n_estimators=n_estimators,
                                       max_depth=None,
                                       min_samples_split=4,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.0,
                                       max_features='log2',
                                       max_leaf_nodes=None,
                                       n_jobs=-1,
                                       random_state=random_state)

        # Randomized search: evaluates n_iter randomly chosen parameter settings
        search = RandomizedSearchCV(estimator=forest,
                                    param_distributions=param_grid,
                                    n_iter=n_iter,
                                    cv=folders,
                                    scoring=score,
                                    random_state=random_state)

        # Fit the models
        search.fit(self.features, self.pred)

        # Per-candidate results live on the fitted search object, not on the
        # (unfitted) base estimator.
        print("Grid scores on development set:")
        print("")  # print("") emits a blank line on both Python 2 and 3
        for params, mean_score, scores in search.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))

        # Summary is reported once, after all candidates have been listed.
        print("")
        print('Best score found on development set: {0} \n'.format(search.best_score_))
        print('Best parameters set found on development set:{0} \n'.format(search.best_params_))
        return search.best_estimator_

In [8]:
# Load the pre-computed feature tables for the training and test rows.
features = pd.read_csv('../data/w2v_features.csv')
test_features = pd.read_csv('../data/w2v_features_test.csv')

# Columns feature_1 .. feature_37 are the model inputs; 'relevance' is the target.
# `range` and `.values` are used instead of `xrange` / `.as_matrix()` because
# they exist on both old and current Python / pandas versions and yield the
# same NumPy arrays.
feature_columns = ['feature_{0}'.format(index) for index in range(1, 38)]
X = features[feature_columns].values
y = features['relevance'].values

In [None]:
# Wrap the training matrix and targets, then run the randomized search;
# `clf` ends up holding the estimator returned by train_rf.
rf_trainer = Trainer(pred=y, features=X)
clf = rf_trainer.train_rf(
    score='mean_absolute_error',
    folders=5,
    param_grid=None,
    n_estimators=200,
)


In [None]:
# Report the fit quality on the training data, then write test-set predictions.
# `clf` is a regressor, so its predictions are continuous and classification
# metrics do not apply; regression errors are reported instead.
print("Detailed regression report:")
print("")  # print("") emits a blank line on both Python 2 and 3
print("The model is trained on the full training set.")
print("The scores are computed on the full training set.")
print("")
y_pred = clf.predict(X)
# Errors measured on the data the model was fitted on, as the messages above say.
residuals = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
print("Mean absolute error: %0.4f" % np.mean(np.abs(residuals)))
print("Root mean squared error: %0.4f" % np.sqrt(np.mean(residuals ** 2)))
print("")
print('Saving predictions for test set...')
# The submission must contain predictions for the test rows, not the training
# predictions computed above.
# NOTE(review): assumes the test CSV has the same feature_1..feature_37
# columns as the training CSV — confirm.
test_feature_columns = ['feature_{0}'.format(index) for index in range(1, 38)]
test_pred = clf.predict(test_features[test_feature_columns].values)
# Copy the id column so the new column is not written onto a slice of test_features.
rf_df = test_features[['id']].copy()
rf_df['relevance'] = test_pred
rf_df.to_csv('../submissions/w2v_rf_submission', index=False)