In [98]:
import numpy as np
import pandas as pd

import warnings
from matplotlib.colors import ListedColormap

warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics

In [99]:
novel_features = pd.read_csv("../clean data/novel_features.csv", index_col = 0)
novel_features.head()

Unnamed: 0,Writer Name,Novel Name,Genre,Final Word Count,Daily Average,Winner,Synopses,url,Novel Date,Excerpt,...,num uniques,num sentences,paragraphs,fk score,has excerpt,num words excerpt,num uniques excerpt,num sentences excerpt,paragraphs excerpt,fk score excerpt
0,Nicaless,Novel: Lauren's Birthday,Young Adult,24229,807,0,\n<p></p>\n,http://nanowrimo.org/participants/nicaless/nov...,November 2015,\n<p></p>\n,...,0,0,0,0.0,0,0,0,0,0,0.0
1,Nicaless,Novel: A Mystery in the Kingdom of Aermon,Fantasy,50919,1697,1,\n<p>Hitoshi is appointed the youngest Judge a...,http://nanowrimo.org/participants/nicaless/nov...,November 2014,"\n<p>This story, funnily enough, started out a...",...,42,3,1,65.73,1,132,96,13,7,78.25
2,Rachel B. Moore,Novel: Finding Fortunato,Literary,50603,1686,1,\n<p>Sam and Anna Gold and their newly adoptiv...,http://nanowrimo.org/participants/rachel-b-moo...,November 2015,\n<p></p>\n,...,109,7,4,58.62,0,0,0,0,0,0.0
3,Rachel B. Moore,Novel: The Residency,Literary,50425,1680,1,\n<p>It's every writer's dream - an all-expens...,http://nanowrimo.org/participants/rachel-b-moo...,November 2014,\n<p></p>\n,...,51,4,3,65.73,0,0,0,0,0,0.0
4,Rachel B. Moore,Novel: The Jew From Fortunato,Literary Fiction,41447,1381,0,\n<p>20-something Andre Levinsky is a fish out...,http://nanowrimo.org/participants/rachel-b-moo...,November 2013,\n<p></p>\n,...,93,4,1,56.93,0,0,0,0,0,0.0


In [100]:
del novel_features['Novel Name']
del novel_features['Genre']
del novel_features['Final Word Count']
del novel_features['Daily Average']
del novel_features['Synopses']
del novel_features['url']
del novel_features['Excerpt']
del novel_features['Writer Name']
novel_features.columns

Index([u'Winner', u'Novel Date', u'has genre', u'standard genre',
       u'has_synopses', u'num words', u'num uniques', u'num sentences',
       u'paragraphs', u'fk score', u'has excerpt', u'num words excerpt',
       u'num uniques excerpt', u'num sentences excerpt', u'paragraphs excerpt',
       u'fk score excerpt'],
      dtype='object')

In [101]:
nov2015 = novel_features[novel_features['Novel Date'] == "November 2015"]
not_nov2015 = novel_features[novel_features['Novel Date'] != "November 2015"]
print str(sum(nov2015['Winner'] / float(len(nov2015)))) + " is the fraction of winners in the test data"
print str(sum(not_nov2015['Winner'] / float(len(not_nov2015)))) + " is the fraction of winners in the train data"

0.437125748503 is the fraction of winners in the test data
0.686806411837 is the fraction of winners in the train data


In [102]:
y = novel_features['Winner'].values
del novel_features['Winner']

currentnovels = novel_features['Novel Date'] == "November 2015"
currentnovels.values

pastnovels = novel_features['Novel Date'] != "November 2015"
pastnovels.values

del novel_features['Novel Date']
y

array([0, 1, 1, ..., 1, 1, 1])

### Logistic Regression

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score

In [104]:
scaler = StandardScaler()
features_norm = scaler.fit_transform(novel_features)
features_norm

array([[ 0.26264592,  0.6768341 , -1.34469367, ..., -0.25943684,
        -0.12867299, -0.32128503],
       [ 0.26264592,  0.6768341 ,  0.7436638 , ...,  0.07515182,
         0.41195729,  1.23610056],
       [ 0.26264592,  0.6768341 ,  0.7436638 , ..., -0.25943684,
        -0.12867299, -0.32128503],
       ..., 
       [ 0.26264592,  0.6768341 ,  0.7436638 , ..., -0.25943684,
        -0.12867299, -0.32128503],
       [ 0.26264592,  0.6768341 ,  0.7436638 , ..., -0.25943684,
        -0.12867299, -0.32128503],
       [ 0.26264592,  0.6768341 , -1.34469367, ..., -0.25943684,
        -0.12867299, -0.32128503]])

In [105]:
testX = features_norm[currentnovels.values]
testy = y[currentnovels.values]

trainX = features_norm[pastnovels.values]
trainy = y[pastnovels.values]

As we saw above, the ratio of winners to non-winners in the train data is almost 70/30.  That's quite imbalanced.  Let's rebalance the train data close to 50/50 first.  We'll do this by oversampling the underrepresented class - the non-winners.  

In [106]:
from unbalanced_dataset import OverSampler, SMOTE

OS = OverSampler()
trainX,trainy = OS.fit_transform(trainX,trainy)

Determining classes statistics... 2 classes detected: {0: 508, 1: 1114}
Over-sampling performed: Counter({1: 1114, 0: 1016})


In [107]:
model_lr = LogisticRegression(C=1)
cross_val_score(model_lr,trainX,trainy,cv=10).mean()


0.52354078033119655

In [108]:
model_lr = LogisticRegression(C=5).fit(trainX,trainy)
model_lr.score(testX, testy)

0.49500998003992014

In [109]:
print classification_report(testy,model_lr.predict(testX))

             precision    recall  f1-score   support

          0       0.63      0.24      0.35       282
          1       0.46      0.82      0.59       219

avg / total       0.56      0.50      0.45       501



This is not much better than guessing.  Let's try applying PCA before running the Logistic Regression again.

### PCA

In [110]:
from sklearn.decomposition import PCA

In [111]:
pca = PCA()
pca_features = pca.fit(features_norm).transform(features_norm)
pca_features

array([[  2.06083002e+00,   8.25685196e-01,   7.86582157e-01, ...,
          7.43851687e-03,   6.45786219e-04,  -4.09595202e-03],
       [ -1.30623863e+00,   3.21347224e-01,   7.30156876e-01, ...,
         -1.10197638e-01,  -6.12617500e-02,  -1.01925944e-01],
       [ -1.53347198e+00,  -2.37513049e+00,   4.05354278e-01, ...,
         -1.14848059e-01,   1.22494431e-01,  -4.05779459e-02],
       ..., 
       [  6.04951760e-01,  -4.04439337e-01,   7.20642598e-01, ...,
          4.33604775e-02,  -5.67164632e-02,   1.15560538e-02],
       [ -1.83448544e-01,  -1.11917659e+00,   6.75031346e-01, ...,
          9.31620569e-02,  -1.56300438e-01,   6.00946958e-02],
       [  2.06083002e+00,   8.25685196e-01,   7.86582157e-01, ...,
          7.43851687e-03,   6.45786219e-04,  -4.09595202e-03]])

In [112]:
# Percentage of variance explained

pca.explained_variance_ratio_

array([ 0.36108244,  0.21830197,  0.09792561,  0.0926552 ,  0.07222622,
        0.04624275,  0.03658281,  0.03082816,  0.0238363 ,  0.00937815,
        0.00485473,  0.00252106,  0.0021151 ,  0.00144952])

### Try the model again

In [113]:
pca_testX = pca_features[currentnovels.values]
pca_testy = y[currentnovels.values]

pca_trainX = pca_features[pastnovels.values]
pca_trainy = y[pastnovels.values]

OS = OverSampler()
pca_trainX, pca_trainy = OS.fit_transform(pca_trainX, pca_trainy)

Determining classes statistics... 2 classes detected: {0: 508, 1: 1114}
Over-sampling performed: Counter({1: 1114, 0: 1016})


In [114]:
new_model_lr = LogisticRegression(C=1)

print cross_val_score(new_model_lr, pca_trainX, pca_trainy, cv=10).mean()

new_model_lr.fit(pca_trainX, pca_trainy)

print classification_report(pca_testy,new_model_lr.predict(pca_testX))
print new_model_lr.score(pca_testX, pca_testy) 

0.506643531914
             precision    recall  f1-score   support

          0       0.58      0.19      0.28       282
          1       0.44      0.82      0.57       219

avg / total       0.52      0.47      0.41       501

0.465069860279


Let's run other models to see if they do any better

### K Neighbors

In [115]:
from sklearn.neighbors import KNeighborsClassifier

In [125]:
model_knn = KNeighborsClassifier(4)

print cross_val_score(model_knn, pca_trainX, pca_trainy, cv=10).mean()

model_knn.fit(pca_trainX, pca_trainy)
print classification_report(pca_testy,model_knn.predict(pca_testX))
print model_knn.score(pca_testX, pca_testy)


print cross_val_score(model_knn, trainX, trainy, cv=10).mean()

model_knn.fit(trainX, trainy)
print classification_report(testy,model_knn.predict(testX))
print model_knn.score(testX, testy)


0.564827393799
             precision    recall  f1-score   support

          0       0.51      0.34      0.41       282
          1       0.41      0.58      0.48       219

avg / total       0.46      0.45      0.44       501

0.445109780439
0.573767636679
             precision    recall  f1-score   support

          0       0.54      0.33      0.41       282
          1       0.42      0.63      0.51       219

avg / total       0.49      0.46      0.45       501

0.463073852295


Surprisingly, K Neighbors is slightly better than Logistic Regression.

### Naive Bayes

In [117]:
from sklearn.naive_bayes import GaussianNB
#using Gaussian Naive Bayes because it allows negative inputs

In [126]:
model_nb = GaussianNB()
print cross_val_score(model_nb, pca_trainX, pca_trainy, cv=10).mean()

model_nb.fit(pca_trainX, pca_trainy)
print classification_report(pca_testy,model_nb.predict(pca_testX))
print model_nb.score(pca_testX, pca_testy)

print cross_val_score(model_nb, trainX, trainy, cv=10).mean()

model_nb.fit(trainX, trainy)
print classification_report(testy,model_nb.predict(testX))
print model_nb.score(testX, testy)

0.492984238234
             precision    recall  f1-score   support

          0       0.56      0.21      0.31       282
          1       0.44      0.79      0.56       219

avg / total       0.51      0.46      0.42       501

0.463073852295
0.497279979767
             precision    recall  f1-score   support

          0       0.60      0.53      0.56       282
          1       0.47      0.54      0.51       219

avg / total       0.54      0.53      0.54       501

0.534930139721


Also surprising, Naive Bayes doesn't do very well here.

### Decision Tree

In [119]:
from sklearn.tree import DecisionTreeClassifier

In [131]:
model_dt = DecisionTreeClassifier(max_depth=4, random_state=1)
print cross_val_score(model_dt, pca_trainX, pca_trainy, cv=10).mean()

model_dt.fit(pca_trainX, pca_trainy)
print classification_report(pca_testy,model_dt.predict(pca_testX))
print model_dt.score(pca_testX, pca_testy)

print cross_val_score(model_dt, trainX, trainy, cv=10).mean()

model_dt.fit(trainX, trainy)
print classification_report(testy,model_dt.predict(testX))
print model_dt.score(testX, testy)

0.514084714009
             precision    recall  f1-score   support

          0       0.55      0.02      0.04       282
          1       0.44      0.98      0.60       219

avg / total       0.50      0.44      0.29       501

0.439121756487
0.537070078142
             precision    recall  f1-score   support

          0       0.59      0.09      0.16       282
          1       0.44      0.92      0.59       219

avg / total       0.52      0.45      0.35       501

0.453093812375


This Decision Tree does pretty well compared to the others, but still a relatively low score.

### Random Forest

In [121]:
from sklearn.ensemble import RandomForestClassifier

In [132]:
model_rf = RandomForestClassifier(max_depth=4, n_estimators=100)
print cross_val_score(model_rf, pca_trainX, pca_trainy, cv=10).mean()

model_rf.fit(pca_trainX, pca_trainy)
print classification_report(pca_testy,model_rf.predict(pca_testX))
print model_rf.score(pca_testX, pca_testy)

print cross_val_score(model_rf, trainX, trainy, cv=10).mean()

model_rf.fit(trainX, trainy)
print classification_report(testy,model_rf.predict(testX))
print model_rf.score(testX, testy)

0.562585756708
             precision    recall  f1-score   support

          0       0.55      0.12      0.20       282
          1       0.44      0.87      0.58       219

avg / total       0.50      0.45      0.37       501

0.449101796407
0.552717557328
             precision    recall  f1-score   support

          0       0.56      0.13      0.21       282
          1       0.44      0.87      0.58       219

avg / total       0.51      0.45      0.37       501

0.453093812375


### Support Vector Machine

In [123]:
from sklearn.svm import SVC

In [124]:
model_svc = SVC(kernel="rbf",C=5)
print cross_val_score(model_svc, pca_trainX, pca_trainy, cv=10).mean()

model_svc.fit(pca_trainX, pca_trainy)
print classification_report(pca_testy,model_svc.predict(pca_testX))
print model_svc.score(pca_testX, pca_testy)

0.549389261567
             precision    recall  f1-score   support

          0       0.50      0.24      0.32       282
          1       0.42      0.70      0.52       219

avg / total       0.47      0.44      0.41       501

0.439121756487


So Decision Trees and Support Vector Machines aren't much better either.  
Ok, so maybe it doesn't make sense to predict if a novel wins just based on it's synopses or excerpt.  Don't judge a book by it's cover I guess.  