# Introduction

Before running this notebook, run [Prepare data](Prepare%20data.ipynb) to create `../data/CleanedReviews.pickle`, which is required here.

# Setup

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np

from collections import namedtuple, OrderedDict
from datetime import datetime, timedelta
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor


# Apply seaborn's "white" plot style globally and enable its short colour codes.
sns.set(style="white", color_codes=True)

In [None]:
# Presumably a default matplotlib figure size (width, height) in inches.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
figsize = (16, 5)

In [None]:
from feature_utils import ItemSelector, TextStats

In [None]:
# Print the source of the local helper module that provides ItemSelector and TextStats.
%cat feature_utils.py

## Data import

In [None]:
# Load the cleaned reviews written by the "Prepare data" notebook.
all_reviews = pd.read_pickle('../data/CleanedReviews.pickle')

In [None]:
# Overview of the columns: dtypes and non-null counts.
all_reviews.info()

In [None]:
# Number of missing values per column.
all_reviews.isnull().sum()

# Data processing

In [None]:
# Keep only the reviews that received at least one helpfulness vote
# (a zero denominator would make the helpfulness ratio undefined).
reviews = all_reviews.loc[all_reviews['HelpfulnessDenominator'].gt(0)].copy()
print('Ratio of kept reviews: {:.2f}'.format(len(reviews) / len(all_reviews)))

# Helpfulness: share of votes that marked the review as helpful.
reviews['Helpfulness'] = reviews['HelpfulnessNumerator'] / reviews['HelpfulnessDenominator']
# Helpful: binary label, true when at least half of the votes were positive.
reviews['Helpful'] = reviews['Helpfulness'].ge(0.5)

# Statistics

In [None]:
# Class balance of the binary `Helpful` label.
print('Helpful status:\n{}'.format(reviews['Helpful'].value_counts()))

As the plot below shows, reviews with high scores tend to be rated as more helpful than reviews with low scores.

In [None]:
# Restrict the plot to reviews with more than 10 votes, so that the
# helpfulness ratio is based on a reasonable number of votes.
plot_data = reviews.loc[reviews['HelpfulnessDenominator'].gt(10)]

# Joint density (KDE) of the review score against its helpfulness ratio.
sns.jointplot(data=plot_data, x='Score', y='Helpfulness', kind='kde')
plt.show()

# Model training

The following section can be used as a template for training several regressors. Each regressor type is trained using a separate procedure (`RandomizedSearchCV` or `GridSearchCV`) and the best estimators are kept.

Each search procedure uses a pipeline made of two steps:
1. Common feature building
2. Regressor fitting

Parameters for these two steps are selected independently for each procedure. Thus, two (best) estimators can use different features.

Running the fitting procedure takes a long time. For quick tests, set `modeling_reviews_n` to the number of reviews to sample at random (or to `None` to use all of them).

In [None]:
# Number of reviews to sample at random for modeling; None keeps every review.
modeling_reviews_n = 1000
# Fraction of the selected reviews held out for evaluation.
# NOTE(review): 0.8 leaves only 20% of the rows for training — confirm this is intended.
test_size = 0.8

# Sub-sample only when a limit is set and the data set is actually larger than it.
rev = (
    reviews.sample(n=modeling_reviews_n, random_state=0)
    if modeling_reviews_n is not None and len(reviews) > modeling_reviews_n
    else reviews
)

# Inputs are the raw text fields and the score; the target is the helpfulness ratio.
train_data, test_data, train_target, test_target = train_test_split(
    rev[['Summary', 'Text', 'Score']],
    rev['Helpfulness'],
    test_size=test_size,
    random_state=0,
)

In [None]:
# Report how many rows ended up in the training split.
print('Training with {} samples'.format(len(train_data)))

## Features pipeline

Below we define the feature processing steps that will be used by all the search procedures. These can be customized by setting parameters for the common transformers in `SearchSettings` objects (see the next section).

In [None]:
# Common features - used by all estimators.
# Each branch selects one input column and turns it into a numeric block;
# FeatureUnion concatenates the blocks side by side.
features_union = FeatureUnion(

    transformer_list=[

        # Summary field textual features (TF-IDF term weights)
        ('summary_terms',  Pipeline([
            ('selector', ItemSelector(key='Summary')),
            ('tfidf', TfidfVectorizer()),
        ])),

        # Text field textual features (TF-IDF term weights)
        ('text_terms',  Pipeline([
            ('selector', ItemSelector(key='Text')),
            ('tfidf', TfidfVectorizer()),
        ])),

        # Summary field statistics
        ('summary_stats', Pipeline([
            ('selector', ItemSelector(key='Summary')),
            ('stats', TextStats()),
            ('vect', DictVectorizer()),
        ])),

        # Text field statistics
        # (fixed: this branch used to select 'Summary', which duplicated
        # `summary_stats` and never looked at the review text)
        ('text_stats', Pipeline([
            ('selector', ItemSelector(key='Text')),
            ('stats', TextStats()),
            ('vect', DictVectorizer()),
        ])),

        # Review score; selected with a list key so that the scaler
        # receives a 2-D input.
        ('score', Pipeline([
            ('selector', ItemSelector(key=['Score'])),
            ('value', MinMaxScaler()),
        ])),
    ]
)

# Parameters for the common features, explored during cross-validation.
# Keys follow the `<pipeline step>__<union branch>__<step>__<param>` convention,
# where `features` is the name given to `features_union` in the full pipeline.
features_parameters = {
    # Ignore terms with a document frequency strictly lower than the threshold.
    # The thresholds are proportions, so "no filtering" is written as the
    # float 0.0: an integer 0 would be read as an absolute count and is
    # rejected by scikit-learn releases that validate parameters.
    'features__text_terms__tfidf__min_df': (0.0, 0.1, 0.2),
    'features__text_terms__tfidf__use_idf': (True, False),
    # Relative weights of the union branches: either all equal, or with the
    # Text-based branches down-weighted.
    'features__transformer_weights': [{
            'summary_terms': 1,
            'text_terms': 1,
            'summary_stats': 1,
            'text_stats': 1,
            'score': 1
        }, {
            'summary_terms': 1,
            'text_terms': 0.5,
            'summary_stats': 1,
            'text_stats': 0.5,
            'score': 1
        }]
}

## Estimators setup

We create a list of search procedures as described above. We use two baselines: a regressor that predicts the mean and one that predicts the median.

In [None]:
# Description of one hyperparameter search:
#   name        name of the search operation
#   estimator   estimator for which hyperparameters are tested
#   parameters  hyperparameters that are going to be added to `features_parameters`
#   procedure   'randomized' for RandomizedSearchCV, 'grid' for GridSearchCV or
#               'dummy' for a fake search procedure (train dummy estimator)
SearchSettings = namedtuple(
    'SearchSettings',
    ['name', 'estimator', 'parameters', 'procedure'],
)

search_settings_list = [
    # Baseline: always predicts the mean of the training target.
    SearchSettings(
        name='Dummy_Mean',
        estimator=DummyRegressor(strategy='mean'),
        parameters={},
        procedure='dummy',
    ),
    # Baseline: always predicts the median of the training target.
    SearchSettings(
        name='Dummy_Median',
        estimator=DummyRegressor(strategy='median'),
        parameters={},
        procedure='dummy',
    ),
    # Ridge regression, searching over the regularization strength.
    SearchSettings(
        name='Ridge',
        estimator=Ridge(),
        parameters={'estimator__alpha': [1, 2]},
        procedure='randomized',
    ),
    # Linear support vector regression, searching over C.
    SearchSettings(
        name='LinearSVR',
        estimator=LinearSVR(),
        parameters={'estimator__C': [1, 2]},
        procedure='randomized',
    ),
]

## Fitting

In [None]:
r_s_iter = 10   # number of iterations for RandomizedSearchCV
scoring = 'neg_mean_squared_error'  # scoring method used for selecting the hyperparameters
verbose = 1  # output level
n_jobs = 1  # parallelization level
cv = 3  # cross-validation options
searches = OrderedDict()

for s in search_settings_list:
    
    print('Fitting {}'.format(s.name))
    
    pipeline= Pipeline([
        ('features', features_union),
        ('estimator', s.estimator)
    ])
    
    # merged hyperparameters space
    params = {**features_parameters, **s.parameters}
    
    if s.procedure == 'randomized':
        search = RandomizedSearchCV(
            pipeline, params,
            scoring=scoring,
            n_jobs=n_jobs, verbose=verbose, n_iter=r_s_iter, cv=cv)
    elif s.procedure == 'grid':
        search = GridSearchCV(
            pipeline, params,
            scoring=scoring,
            n_jobs=n_jobs, verbose=verbose, cv=cv)
    else:
        # we should not do any cross-validation here as we waste time
        # the estimator is refitted, so it gives correct results
        search = RandomizedSearchCV(
            pipeline, {},
            scoring=scoring,
            n_jobs=n_jobs, verbose=verbose, n_iter=1, cv=2)
    
    %time search.fit(train_data, train_target)
    
    searches[s.name] = search

# Model evaluation

## Summary

Below you can see the best estimators that resulted from each procedure, along with their mean cross-validated scores (negative mean squared error, so values closer to zero are better).

In [None]:
# For every search, show the best cross-validation score and the
# hyperparameters that achieved it, separated by a blank line.
for name, search in searches.items():
    summary_parts = (
        name, '\n',
        'score:', search.best_score_, '\n',
        'parameters:\n', search.best_params_,
    )
    print(*summary_parts, end='\n\n')

## Evaluation set metrics

In [None]:
# regression
# Evaluate every fitted search on the held-out test set.

# One row per search; the metric columns are filled in below.
results = pd.DataFrame(index=list(searches.keys()))

for name, search in searches.items():
    # Predictions come from the best estimator found by the search.
    predictions = search.predict(test_data)

    mse = metrics.mean_squared_error(test_target, predictions)

    # Distribution of the raw predictions, labelled so that the figures
    # of the different models can be told apart.
    plt.hist(predictions)
    plt.title('{}: predicted helpfulness'.format(name))
    plt.xlabel('Predicted helpfulness')
    plt.ylabel('Number of reviews')
    plt.show()

    # Helpfulness is a ratio of votes, so clamp the predictions to [0, 1]
    # and measure the error again.
    predictions = np.clip(predictions, 0, 1)
    mse_bounded = metrics.mean_squared_error(test_target, predictions)

    results.loc[name, 'mse'] = mse
    results.loc[name, 'mse_bounded'] = mse_bounded

print(results)

# Side-by-side comparison of the raw and bounded errors for each model.
ax = results.plot(kind='bar')
ax.set_title('Test-set mean squared error')
ax.set_ylabel('MSE')
plt.show()

# Save best models

We will save the best fitted estimators of each type for future use.

In [None]:
# Map each search name to the pipeline refitted with its best hyperparameters.
best_estimators = {name: s.best_estimator_ for name, s in searches.items()}

# Persist all the fitted pipelines together in a single pickle file.
with open('../data/helpfulness_estimators.pickle', 'wb') as f:
    pickle.dump(best_estimators, f)

In [None]:
# TODO Add estimators and search parameters.

# TODO Save models in their own files.

# TODO Refit models using all the data.

# TODO Some text fields contain HTML. Add a pipeline step to keep only the content.

# TODO Use `linear_model.RidgeCV` instead of `linear_model.Ridge`.