In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.color_palette("muted")
import scipy.stats as stats
%matplotlib inline
import scipy.stats as stats
from datetime import datetime as dt
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
data = pd.read_csv('../data/data.csv', index_col=0)

threshold = 900000
data['viral'] = data['viewCount'] >= threshold
data['viral'] = data['viral'] * 1
print('There are', (len(data.viral[data.viral ==1])), 'viral videos is this data set.')

There are 26 viral videos is this data set.


### Training data

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import cross_validation as cv
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, mean_squared_error, roc_auc_score
from sklearn import decomposition
from sklearn.preprocessing import MinMaxScaler



In [4]:
y = data['viral']
x = data.iloc[:,2:36]

In [5]:
# 60% train and 40% test data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42, stratify=y)

### Baseline model

In [6]:
dumb_pred = 1 if y_train.mean() > 0.5 else 0
print("Baseline (super dumb model, always predicts '%s'):" % ('viral' if dumb_pred else 'not viral'))
print("  - Accuracy: %.2f%%" % (100 * np.mean(y_test == dumb_pred)))
print("  - MSE: %.4f" % np.mean((y_test - dumb_pred) ** 2))
print("  - AUC: 0.50")

Baseline (super dumb model, always predicts 'not viral'):
  - Accuracy: 91.74%
  - MSE: 0.0826
  - AUC: 0.50


### Real models

In [7]:
def score_all_the_ways(model):
    y_pred = tree.predict(X_test)
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy = %.2f%%" % (100 * accuracy_score(y_pred, y_test)))
    y_pred_proba = tree.predict_proba(X_test)[:, 1]
    print("Mean squared error = %.4f" % mean_squared_error(y_pred_proba, y_test))
    print("AUC = %.4f" % roc_auc_score(y_test, y_pred_proba))

In [8]:
SCORING_METRICS = ['accuracy', 'neg_mean_squared_error', 'roc_auc']

### Single decision tree, single fold, no parameters set

In [9]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
score_all_the_ways(tree)

Confusion matrix:
[[100  11]
 [  8   2]]
Accuracy = 84.30%
Mean squared error = 0.1632
AUC = 0.5396


### Single decision tree, grid search over `max_depth` parameter, using cross validation

In [10]:
for metric in SCORING_METRICS:
    print("\n**** Optimizing for:", metric)

    parameters = {'max_depth':range(1,20)}
    tree = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=4, scoring=metric, cv=5)
    tree.fit(X_train, y_train)
    tree_model = tree.best_estimator_

    print("Best model:", tree.best_params_)
    score_all_the_ways(tree)


**** Optimizing for: accuracy
Best model: {'max_depth': 1}
Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0783
AUC = 0.4658

**** Optimizing for: neg_mean_squared_error
Best model: {'max_depth': 1}
Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0783
AUC = 0.4658

**** Optimizing for: roc_auc
Best model: {'max_depth': 4}
Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0872
AUC = 0.5532


### Random forest, no configuration

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [12]:
forest = RandomForestClassifier(oob_score = True, random_state=42, n_estimators =100)
forest.fit(X_train, y_train)
score_all_the_ways(forest)

Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0872
AUC = 0.5532


### Random forest, grid search over `max_depth` and `n_estimators`

In [13]:
for metric in SCORING_METRICS:
    print("\n**** Optimizing for:", metric)

    parameters = {'max_depth': range(1, 10), 'n_estimators' : [1,10,100]}
    forest = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=4)
    forest.fit(X_train, y_train)
    print("Best model:", forest.best_params_)
    forest_model = forest.best_estimator_
    score_all_the_ways(forest_model)


**** Optimizing for: accuracy
Best model: {'max_depth': 1, 'n_estimators': 10}
Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0872
AUC = 0.5532

**** Optimizing for: neg_mean_squared_error
Best model: {'max_depth': 1, 'n_estimators': 1}
Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0872
AUC = 0.5532

**** Optimizing for: roc_auc
Best model: {'max_depth': 1, 'n_estimators': 1}
Confusion matrix:
[[111   0]
 [ 10   0]]
Accuracy = 91.74%
Mean squared error = 0.0872
AUC = 0.5532
