In [55]:
import numpy as np
import pandas as pd

import warnings
from matplotlib.colors import ListedColormap

warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Now that I've created a model to predict which writers will be winners based on their past NaNoWriMo performances, let's attempt to predict which novels will be winning novels based on what little we know about them: their genre, synopsis, and excerpt.

In [56]:
novel_features = pd.read_csv("../clean data/novel_features.csv", index_col = 0)
novel_features.head()

Unnamed: 0,Writer Name,Novel Name,Genre,Final Word Count,Daily Average,Winner,Synopses,url,Novel Date,Excerpt,...,num uniques,num sentences,paragraphs,fk score,has excerpt,num words excerpt,num uniques excerpt,num sentences excerpt,paragraphs excerpt,fk score excerpt
0,Nicaless,Novel: Lauren's Birthday,Young Adult,24229,807,0,\n<p></p>\n,http://nanowrimo.org/participants/nicaless/nov...,November 2015,\n<p></p>\n,...,0,0,0,0.0,0,0,0,0,0,0.0
1,Nicaless,Novel: A Mystery in the Kingdom of Aermon,Fantasy,50919,1697,1,\n<p>Hitoshi is appointed the youngest Judge a...,http://nanowrimo.org/participants/nicaless/nov...,November 2014,"\n<p>This story, funnily enough, started out a...",...,42,3,1,65.73,1,132,96,13,7,78.25
2,Rachel B. Moore,Novel: Finding Fortunato,Literary,50603,1686,1,\n<p>Sam and Anna Gold and their newly adoptiv...,http://nanowrimo.org/participants/rachel-b-moo...,November 2015,\n<p></p>\n,...,109,7,4,58.62,0,0,0,0,0,0.0
3,Rachel B. Moore,Novel: The Residency,Literary,50425,1680,1,\n<p>It's every writer's dream - an all-expens...,http://nanowrimo.org/participants/rachel-b-moo...,November 2014,\n<p></p>\n,...,51,4,3,65.73,0,0,0,0,0,0.0
4,Rachel B. Moore,Novel: The Jew From Fortunato,Literary Fiction,41447,1381,0,\n<p>20-something Andre Levinsky is a fish out...,http://nanowrimo.org/participants/rachel-b-moo...,November 2013,\n<p></p>\n,...,93,4,1,56.93,0,0,0,0,0,0.0


In [57]:
del novel_features['Novel Name']
del novel_features['Genre']
del novel_features['Final Word Count']
del novel_features['Daily Average']
del novel_features['Synopses']
del novel_features['url']
del novel_features['Excerpt']
del novel_features['Writer Name']
novel_features.columns

Index([u'Winner', u'Novel Date', u'has genre', u'standard genre',
       u'has_synopses', u'num words', u'num uniques', u'num sentences',
       u'paragraphs', u'fk score', u'has excerpt', u'num words excerpt',
       u'num uniques excerpt', u'num sentences excerpt', u'paragraphs excerpt',
       u'fk score excerpt'],
      dtype='object')

In [58]:
novel_features.head()

Unnamed: 0,Winner,Novel Date,has genre,standard genre,has_synopses,num words,num uniques,num sentences,paragraphs,fk score,has excerpt,num words excerpt,num uniques excerpt,num sentences excerpt,paragraphs excerpt,fk score excerpt
0,0,November 2015,1,1,0,0,0,0,0,0.0,0,0,0,0,0,0.0
1,1,November 2014,1,1,1,44,42,3,1,65.73,1,132,96,13,7,78.25
2,1,November 2015,1,1,1,153,109,7,4,58.62,0,0,0,0,0,0.0
3,1,November 2014,1,1,1,59,51,4,3,65.73,0,0,0,0,0,0.0
4,0,November 2013,1,0,1,124,93,4,1,56.93,0,0,0,0,0,0.0


In [59]:
print "The fraction of winning novels is " + str(sum(novel_features['Winner'] / float(len(novel_features['Winner']))))

The fraction of winning novels is 0.6278850683


In [60]:
y = novel_features['Winner'].values
del novel_features['Winner']
del novel_features['Novel Date']

### Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score

In [62]:
scaler = StandardScaler()
features_norm = scaler.fit_transform(novel_features)

trainX, testX, trainy, testy = train_test_split(features_norm, y, test_size=0.2, random_state=1)

In [63]:
model_lr = LogisticRegression(C=1)
cross_val_score(model_lr,trainX,trainy,cv=10).mean()

0.62544114084957148

In [64]:
model_lr = LogisticRegression(C=5).fit(trainX,trainy)
model_lr.score(testX, testy)

0.63058823529411767

In [65]:
print confusion_matrix(testy,model_lr.predict(testX))

[[  2 157]
 [  0 266]]


The Logistic Regression didn't do too well.  Let's try applying PCA before running the Logistic Regression again.

In [66]:
from sklearn.decomposition import PCA

In [67]:
pca = PCA()
pca_features = pca.fit(features_norm).transform(features_norm)

In [68]:
pca_trainX, pca_testX, pca_trainy, pca_testy = train_test_split(pca_features, y, test_size=0.2, random_state=1)

In [69]:
new_model_lr = LogisticRegression(C=1)

print cross_val_score(new_model_lr, pca_trainX, pca_trainy, cv=10).mean()

new_model_lr.fit(pca_trainX, pca_trainy)

print confusion_matrix(pca_testy,new_model_lr.predict(pca_testX))
print new_model_lr.score(pca_testX, pca_testy) 

0.62544114085
[[  1 158]
 [  0 266]]
0.628235294118


Let's run other models to see if they do any better.

### K Neighbors

In [70]:
from sklearn.neighbors import KNeighborsClassifier

In [73]:
model_knn = KNeighborsClassifier(4)

print cross_val_score(model_knn, pca_trainX, pca_trainy, cv=10).mean()

model_knn.fit(pca_trainX, pca_trainy)
print confusion_matrix(pca_testy,model_knn.predict(pca_testX))
print model_knn.score(pca_testX, pca_testy)


0.534716670432
[[ 81  78]
 [141 125]]
0.484705882353


Unurprisingly, K Neighbors performs worse than Logistic Regression.

### Naive Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB

In [74]:
model_nb = GaussianNB()
print cross_val_score(model_nb, pca_trainX, pca_trainy, cv=10).mean()

model_nb.fit(pca_trainX, pca_trainy)
print confusion_matrix(pca_testy,model_nb.predict(pca_testX))
print model_nb.score(pca_testX, pca_testy)



0.544747630185
[[ 37 122]
 [ 67 199]]
0.555294117647


Also surprising, Naive Bayes doesn't as well as Logistic Regression in this case.

### Decision Tree

In [75]:
from sklearn.tree import DecisionTreeClassifier

In [85]:
model_dt = DecisionTreeClassifier(max_depth=3, random_state=1)
print cross_val_score(model_dt, pca_trainX, pca_trainy, cv=10).mean()

model_dt.fit(pca_trainX, pca_trainy)
print confusion_matrix(pca_testy,model_dt.predict(pca_testX))
print model_dt.score(pca_testX, pca_testy)

0.607790438505
[[  2 157]
 [  2 264]]
0.625882352941


This Decision Tree does pretty well compared to the others, but still not an optimal score.

### Random Forest

In [86]:
from sklearn.ensemble import RandomForestClassifier

In [87]:
model_rf = RandomForestClassifier(max_depth=3, n_estimators=100)
print cross_val_score(model_rf, pca_trainX, pca_trainy, cv=10).mean()

model_rf.fit(pca_trainX, pca_trainy)
print confusion_matrix(pca_testy,model_rf.predict(pca_testX))
print model_rf.score(pca_testX, pca_testy)

0.628385838712
[[  0 159]
 [  0 266]]
0.625882352941


### Support Vector Machine

In [84]:
from sklearn.svm import SVC

In [88]:
model_svc = SVC(kernel="rbf",C=1)
print cross_val_score(model_svc, pca_trainX, pca_trainy, cv=10).mean()

model_svc.fit(pca_trainX, pca_trainy)
print confusion_matrix(pca_testy,model_svc.predict(pca_testX))
print model_svc.score(pca_testX, pca_testy)

0.621302650407
[[  0 159]
 [  0 266]]
0.625882352941


So Decision Trees and Support Vector Machines don't perform much better than guessing either.   
Maybe it just doesn't make sense to predict if a novel wins just based on it's synopses or excerpt.  Don't judge a book by it's cover I guess.  