In [1]:
import numpy as np
import pandas as pd

import warnings
from matplotlib.colors import ListedColormap

warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Now that I've created a model to predict which writers will be winners based on their past NaNoWriMo performances, let's attempt to predict which novels will be winning novels based on what little we know about them: their genre, synopsis, and excerpt.

In [2]:
novel_features = pd.read_csv("../clean data/novel_features.csv", index_col = 0)
novel_features.head()

Unnamed: 0,Writer Name,Novel Name,Genre,Final Word Count,Daily Average,Winner,Synopses,url,Novel Date,Excerpt,...,num uniques,num sentences,paragraphs,fk score,has excerpt,num words excerpt,num uniques excerpt,num sentences excerpt,paragraphs excerpt,fk score excerpt
0,Nicaless,Novel: Lauren's Birthday,Young Adult,24229,807,0,\n<p></p>\n,http://nanowrimo.org/participants/nicaless/nov...,November 2015,\n<p></p>\n,...,0,0,0,0.0,0,0,0,0,0,0.0
1,Nicaless,Novel: A Mystery in the Kingdom of Aermon,Fantasy,50919,1697,1,\n<p>Hitoshi is appointed the youngest Judge a...,http://nanowrimo.org/participants/nicaless/nov...,November 2014,"\n<p>This story, funnily enough, started out a...",...,42,3,1,65.73,1,132,96,13,7,78.25
2,Rachel B. Moore,Novel: Finding Fortunato,Literary,50603,1686,1,\n<p>Sam and Anna Gold and their newly adoptiv...,http://nanowrimo.org/participants/rachel-b-moo...,November 2015,\n<p></p>\n,...,109,7,4,58.62,0,0,0,0,0,0.0
3,Rachel B. Moore,Novel: The Residency,Literary,50425,1680,1,\n<p>It's every writer's dream - an all-expens...,http://nanowrimo.org/participants/rachel-b-moo...,November 2014,\n<p></p>\n,...,51,4,3,65.73,0,0,0,0,0,0.0
4,Rachel B. Moore,Novel: The Jew From Fortunato,Literary Fiction,41447,1381,0,\n<p>20-something Andre Levinsky is a fish out...,http://nanowrimo.org/participants/rachel-b-moo...,November 2013,\n<p></p>\n,...,93,4,1,56.93,0,0,0,0,0,0.0


In [3]:
del novel_features['Novel Name']
del novel_features['Genre']
del novel_features['Final Word Count']
del novel_features['Daily Average']
del novel_features['Synopses']
del novel_features['url']
del novel_features['Excerpt']
del novel_features['Writer Name']
novel_features.columns

Index([u'Winner', u'Novel Date', u'has genre', u'standard genre',
       u'has_synopses', u'num words', u'num uniques', u'num sentences',
       u'paragraphs', u'fk score', u'has excerpt', u'num words excerpt',
       u'num uniques excerpt', u'num sentences excerpt', u'paragraphs excerpt',
       u'fk score excerpt'],
      dtype='object')

In [4]:
novel_features.head()

Unnamed: 0,Winner,Novel Date,has genre,standard genre,has_synopses,num words,num uniques,num sentences,paragraphs,fk score,has excerpt,num words excerpt,num uniques excerpt,num sentences excerpt,paragraphs excerpt,fk score excerpt
0,0,November 2015,1,1,0,0,0,0,0,0.0,0,0,0,0,0,0.0
1,1,November 2014,1,1,1,44,42,3,1,65.73,1,132,96,13,7,78.25
2,1,November 2015,1,1,1,153,109,7,4,58.62,0,0,0,0,0,0.0
3,1,November 2014,1,1,1,59,51,4,3,65.73,0,0,0,0,0,0.0
4,0,November 2013,1,0,1,124,93,4,1,56.93,0,0,0,0,0,0.0


In [5]:
print "The fraction of winning novels is " + str(sum(novel_features['Winner'] / float(len(novel_features['Winner']))))

The fraction of winning novels is 0.6278850683


In [6]:
y = novel_features['Winner'].values
del novel_features['Winner']
del novel_features['Novel Date']

### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from bokeh.plotting import figure,show,output_notebook
from bokeh.models import Range1d
output_notebook()

In [8]:
def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1])
    
    roc_auc = auc(fpr, tpr)
    
    p = figure(title='Receiver Operating Characteristic')
    # Plot ROC curve
    p.line(x=fpr,y=tpr,legend='ROC curve (area = %0.3f)' % roc_auc)
    p.x_range=Range1d(0,1)
    p.y_range=Range1d(0,1)
    p.xaxis.axis_label='False Positive Rate or (1 - Specifity)'
    p.yaxis.axis_label='True Positive Rate or (Sensitivity)'
    p.legend.orientation = "bottom_right"
    show(p)
    
%matplotlib inline

In [9]:
scaler = StandardScaler()
features_norm = scaler.fit_transform(novel_features)

trainX, testX, trainy, testy = train_test_split(features_norm, y, test_size=0.2, random_state=1)

In [10]:
model_lr = LogisticRegression(C=1)
print cross_val_score(model_lr,trainX,trainy,cv=10).mean()

model_lr.fit(trainX,trainy)
print pd.DataFrame(confusion_matrix(testy,model_lr.predict(testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(testy,model_lr.predict(testX))
print model_lr.score(trainX,trainy)
plot_roc_curve(testy, model_lr.predict_proba(testX))

0.62544114085
                   Actual Class 0  Actual Class 1
Predicted Class 0               1             158
Predicted Class 1               0             266
             precision    recall  f1-score   support

          0       1.00      0.01      0.01       159
          1       0.63      1.00      0.77       266

avg / total       0.77      0.63      0.49       425

0.630153121319


  super(HasProps, self).__setattr__(name, value)


The Logistic Regression didn't do too well.  Let's try applying PCA before running the Logistic Regression again.

In [11]:
from sklearn.decomposition import PCA

In [12]:
pca = PCA()
pca_features = pca.fit(features_norm).transform(features_norm)

In [13]:
pca_trainX, pca_testX, pca_trainy, pca_testy = train_test_split(pca_features, y, test_size=0.2, random_state=1)

In [14]:
new_model_lr = LogisticRegression(C=1)

print cross_val_score(model_lr,pca_trainX,pca_trainy,cv=10).mean()

new_model_lr.fit(pca_trainX,pca_trainy)
print pd.DataFrame(confusion_matrix(pca_testy,new_model_lr.predict(pca_testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(pca_testy,new_model_lr.predict(pca_testX))
print new_model_lr.score(pca_trainX,pca_trainy)
plot_roc_curve(pca_testy, new_model_lr.predict_proba(pca_testX))

0.62544114085
                   Actual Class 0  Actual Class 1
Predicted Class 0               1             158
Predicted Class 1               0             266
             precision    recall  f1-score   support

          0       1.00      0.01      0.01       159
          1       0.63      1.00      0.77       266

avg / total       0.77      0.63      0.49       425

0.630153121319


  super(HasProps, self).__setattr__(name, value)


Let's run other models to see if they do any better.

### K Neighbors

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
model_knn = KNeighborsClassifier(4)

print cross_val_score(model_knn,pca_trainX,pca_trainy,cv=10).mean()

model_knn.fit(pca_trainX,pca_trainy)
print pd.DataFrame(confusion_matrix(pca_testy,model_knn.predict(pca_testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(pca_testy,model_knn.predict(pca_testX))
print model_knn.score(pca_trainX,pca_trainy)
plot_roc_curve(pca_testy, model_knn.predict_proba(pca_testX))

0.530626868831
                   Actual Class 0  Actual Class 1
Predicted Class 0              52             107
Predicted Class 1              81             185
             precision    recall  f1-score   support

          0       0.39      0.33      0.36       159
          1       0.63      0.70      0.66       266

avg / total       0.54      0.56      0.55       425

0.679623085984


  super(HasProps, self).__setattr__(name, value)


Unurprisingly, K Neighbors performs worse than Logistic Regression.

### Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB

In [18]:
model_nb = GaussianNB()
print cross_val_score(model_nb,pca_trainX,pca_trainy,cv=10).mean()

model_nb.fit(pca_trainX,pca_trainy)
print pd.DataFrame(confusion_matrix(testy,model_nb.predict(pca_testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(pca_testy,model_nb.predict(pca_testX))
print model_nb.score(pca_trainX,pca_trainy)
plot_roc_curve(pca_testy, model_nb.predict_proba(pca_testX))

0.544747630185
                   Actual Class 0  Actual Class 1
Predicted Class 0              37             122
Predicted Class 1              67             199
             precision    recall  f1-score   support

          0       0.36      0.23      0.28       159
          1       0.62      0.75      0.68       266

avg / total       0.52      0.56      0.53       425

0.587750294464


  super(HasProps, self).__setattr__(name, value)


Also surprising, Naive Bayes doesn't as well as Logistic Regression in this case.

### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
model_dt = DecisionTreeClassifier(max_depth=3, random_state=1)
print cross_val_score(model_dt,pca_trainX,pca_trainy,cv=10).mean()

model_dt.fit(pca_trainX,pca_trainy)
print pd.DataFrame(confusion_matrix(pca_testy,model_dt.predict(pca_testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(pca_testy,model_dt.predict(pca_testX))
print model_dt.score(pca_trainX,pca_trainy)
plot_roc_curve(pca_testy, model_dt.predict_proba(pca_testX))

0.607790438505
                   Actual Class 0  Actual Class 1
Predicted Class 0               2             157
Predicted Class 1               2             264
             precision    recall  f1-score   support

          0       0.50      0.01      0.02       159
          1       0.63      0.99      0.77       266

avg / total       0.58      0.63      0.49       425

0.633097762073


  super(HasProps, self).__setattr__(name, value)


This Decision Tree does only slightly better compared to the others, but still not an optimal score.

In [23]:
dt_importances = pd.DataFrame(zip(novel_features.columns, model_dt.feature_importances_))
dt_importances.sort_values(1, ascending=False).head() # most to least predictive  

Unnamed: 0,0,1
0,has genre,0.26146
7,fk score,0.189264
11,num sentences excerpt,0.130252
8,has excerpt,0.127461
9,num words excerpt,0.10641


### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
model_rf = RandomForestClassifier(max_depth=3, n_estimators=100)
print cross_val_score(model_rf,pca_trainX,pca_trainy,cv=10).mean()

model_rf.fit(pca_trainX,pca_trainy)
print pd.DataFrame(confusion_matrix(pca_testy,model_dt.predict(pca_testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(pca_testy,model_rf.predict(pca_testX))
print model_rf.score(pca_trainX,pca_trainy)
plot_roc_curve(pca_testy, model_rf.predict_proba(pca_testX))

0.628385838712
                   Actual Class 0  Actual Class 1
Predicted Class 0               2             157
Predicted Class 1               2             264
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       159
          1       0.63      1.00      0.77       266

avg / total       0.39      0.63      0.48       425

0.629564193168


  super(HasProps, self).__setattr__(name, value)


### Support Vector Machine

In [26]:
from sklearn.svm import SVC

In [27]:
model_svc = SVC(kernel="linear",C=1)
print cross_val_score(model_rf,pca_trainX,pca_trainy,cv=10).mean()

model_rf.fit(pca_trainX,pca_trainy)
print pd.DataFrame(confusion_matrix(pca_testy,model_dt.predict(pca_testX)), index=['Predicted Class 0', 'Predicted Class 1'], 
                     columns=['Actual Class 0', 'Actual Class 1'])
print classification_report(pca_testy,model_rf.predict(pca_testX))
print model_rf.score(pca_trainX,pca_trainy)
plot_roc_curve(pca_testy, model_rf.predict_proba(pca_testX))

0.628385838712
                   Actual Class 0  Actual Class 1
Predicted Class 0               2             157
Predicted Class 1               2             264
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       159
          1       0.63      1.00      0.77       266

avg / total       0.39      0.63      0.48       425

0.628975265018


  super(HasProps, self).__setattr__(name, value)


So Decision Trees and Support Vector Machines don't perform much better than guessing either.   
Maybe it just doesn't make sense to predict if a novel wins just based on it's synopses or excerpt.  Don't judge a book by it's cover I guess.  