# Predicting NaNoWriMo winners with Logistic Regression

In my first attempt at Logistic Regression I used all the numeric features, but now I want to exclude information from the contest that has already started.  

In [1]:
# import the data
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

writers = pd.read_csv("../clean data/user_summary_no2015.csv", index_col=0)
writers.head()

Unnamed: 0,Writer Name,Member Length,LifetimeWordCount,url,Age,Birthday,Favorite books or authors,Favorite noveling music,Hobbies,Location,...,Expected Max Submission,Expected Max Day,Expected Std Submissions,Expected Consec Subs,FW Total,FW Sub,FH Total,FH Sub,SH Total,SH Sub
0,Nicaless,2,50919,http://nanowrimo.org/participants/nicaless,24.0,December 20,"Ursula Le Guin, J.K.","Classical, Musicals","Reading, Video Games, Blogging, Learning","San Francisco, CA",...,24935.0,28.0,6235.712933,12.0,6689,6,12486,9,11743,3
1,Rachel B. Moore,10,478090,http://nanowrimo.org/participants/rachel-b-moore,,,"2666, Unaccustomed Earth, Exit Music, Crazy Lo...","Belle and Sebastian, Elliott Smith, PJ Harvey,...","Reading, volunteering, knitting, listening to ...",San Francisco,...,3809.0,9.0,1002.295167,6.8,16722,7,24086,14,26517,14
2,abookishbabe,1,0,http://nanowrimo.org/participants/abookishbabe,,April 2,"Colleen Hoover, Veronica Roth, Jennifer Niven,...",Tori Kelley,"Reading (DUH), Day dreaming, Going to Disneyla...","Sacramento, CA",...,,,,,28632,1,29299,2,0,0
3,alexabexis,11,475500,http://nanowrimo.org/participants/alexabexis,,,,Three Goddesses playlist Florence + the Machin...,"drawing, reading, movies & TV shows, comics, p...",New York City,...,2325.0,8.545455,570.626795,8.090909,25360,7,38034,12,40766,9
4,AllYellowFlowers,3,30428,http://nanowrimo.org/participants/AllYellowFlo...,,,"Lolita, Jesus' Son, Ask the",the sound of the coffeemaker,cryptozoology,Allston,...,2054.5,4.5,538.273315,21.0,1800,5,5300,10,5700,9


In [2]:
writers.columns

Index([u'Writer Name', u'Member Length', u'LifetimeWordCount', u'url', u'Age',
       u'Birthday', u'Favorite books or authors', u'Favorite noveling music',
       u'Hobbies', u'Location', u'Occupation', u'Primary Role',
       u'Sponsorship URL', u'Expected Final Word Count',
       u'Expected Daily Average', u'CURRENT WINNER', u'Current Donor', u'Wins',
       u'Donations', u'Participated', u'Consecutive Donor',
       u'Consecutive Wins', u'Consecutive Part', u'Part Years', u'Win Years',
       u'Donor Years', u'Num Novels', u'Expected Num Submissions',
       u'Expected Avg Submission', u'Expected Min Submission',
       u'Expected Min Day', u'Expected Max Submission', u'Expected Max Day',
       u'Expected Std Submissions', u'Expected Consec Subs', u'FW Total',
       u'FW Sub', u'FH Total', u'FH Sub', u'SH Total', u'SH Sub'],
      dtype='object')

In [3]:
# convert primary role and sponsorship url to binary vars
writers['Primary Role'][writers['Primary Role'] == 'Municipal Liaison'] = 1
writers['Primary Role'][writers['Primary Role'] != 1] = 0

writers['Sponsorship URL'].fillna(0, inplace=True)
writers['Sponsorship URL'][writers['Sponsorship URL'] != 0] = 1

In [2]:
# let's keep ALL NUMERIAL COLUMNS except the CURRENT WINNER column which we will use as response
features = writers._get_numeric_data()
features.columns

Index([u'Member Length', u'LifetimeWordCount', u'Age',
       u'Expected Final Word Count', u'Expected Daily Average',
       u'CURRENT WINNER', u'Current Donor', u'Wins', u'Donations',
       u'Participated', u'Consecutive Donor', u'Consecutive Wins',
       u'Consecutive Part', u'Num Novels', u'Expected Num Submissions',
       u'Expected Avg Submission', u'Expected Min Submission',
       u'Expected Min Day', u'Expected Max Submission', u'Expected Max Day',
       u'Expected Std Submissions', u'Expected Consec Subs', u'FW Total',
       u'FW Sub', u'FH Total', u'FH Sub', u'SH Total', u'SH Sub'],
      dtype='object')

In [3]:
del features['CURRENT WINNER']
# delete features that would only be collected after a contest starts
del features['Current Donor']
del features['FW Total']
del features['FW Sub']
del features['FH Total']
del features['FH Sub']
del features['SH Total']
del features['SH Sub']
features.head()

Unnamed: 0,Member Length,LifetimeWordCount,Age,Expected Final Word Count,Expected Daily Average,Wins,Donations,Participated,Consecutive Donor,Consecutive Wins,Consecutive Part,Num Novels,Expected Num Submissions,Expected Avg Submission,Expected Min Submission,Expected Min Day,Expected Max Submission,Expected Max Day,Expected Std Submissions,Expected Consec Subs
0,2,50919,24.0,50919.0,1697.3,1,1,1,1,1,1,1.0,14.0,3637.071429,299.0,2.0,24935.0,28.0,6235.712933,12.0
1,10,478090,,47809.0,1593.633333,8,8,10,8,7,10,10.0,8.3,918.057453,42.7,7.7,3809.0,9.0,1002.295167,6.8
2,1,0,,,,0,0,0,0,0,0,,,,,,,,,
3,11,475500,,43227.272727,1440.909091,7,7,11,4,4,11,11.0,9.272727,822.780595,36.0,6.727273,2325.0,8.545455,570.626795,8.090909
4,3,30428,,15214.0,507.133333,0,0,2,0,0,1,2.0,22.0,678.318083,50.0,10.5,2054.5,4.5,538.273315,21.0


In [4]:
y = writers['CURRENT WINNER'].values

In [5]:
# inputting 0 for users without prior data for daily avg, avg submission, num submissions etc. and so are marked NaN
features.fillna(0, inplace=True)
features.describe()

Unnamed: 0,Member Length,LifetimeWordCount,Age,Expected Final Word Count,Expected Daily Average,Wins,Donations,Participated,Consecutive Donor,Consecutive Wins,Consecutive Part,Num Novels,Expected Num Submissions,Expected Avg Submission,Expected Min Submission,Expected Min Day,Expected Max Submission,Expected Max Day,Expected Std Submissions,Expected Consec Subs
count,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0
mean,4.212575,172552.676647,8.596806,36428.312194,1214.277073,2.606786,1.421158,3.656687,1.047904,1.96008,3.057884,3.377246,10.826177,1708.026777,73.105821,6.1283,4764.389341,10.005534,1314.411102,9.573348
std,3.255209,329113.33183,14.463648,43782.218313,1459.407277,4.651782,3.044384,4.899582,1.760029,2.539764,2.946632,3.45129,8.520344,2053.622361,1566.761571,6.145692,5727.358954,8.406292,2011.241171,8.393503
min,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-21113.5,0.0,0.0,0.0,0.0,0.0
25%,1.0,9818.0,0.0,7443.25,248.108333,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,362.75,0.0,1.0,955.0,1.0,256.685927,0.0
50%,4.0,93385.0,0.0,37594.333333,1253.144444,1.0,0.0,2.0,0.0,1.0,2.0,2.0,10.25,1446.652778,85.666667,4.5,3546.5,9.333333,873.018486,8.5
75%,6.0,206482.0,20.0,50734.2,1691.14,3.0,2.0,5.0,1.0,3.0,4.0,5.0,17.333333,2213.52,291.5,10.0,6250.0,16.2,1516.145753,16.0
max,13.0,4562712.0,61.0,651816.0,21727.2,52.0,36.0,52.0,9.0,14.0,14.0,26.0,30.0,20869.236584,5000.0,27.666667,51238.0,30.0,23874.872328,30.0


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

### Normalize data

In [7]:
scaler = StandardScaler()
features_norm = scaler.fit_transform(features)
features_norm[1]

array([ 1.77967343,  0.92929298, -0.5949674 ,  0.26019839,  0.26019839,
        1.16054543,  2.1631367 ,  1.29595831,  3.95393831,  1.98638802,
        2.35830378,  1.92083955, -0.29678396, -0.38505565, -0.01942619,
        0.25599572, -0.16697823, -0.11973645, -0.15534084, -0.33074635])

### Apply Logistic Regression

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features_norm,y, test_size=0.2, random_state=0)

In [14]:
model_lr = LogisticRegression(C=5)
print cross_val_score(model_lr,X_train, y_train,cv=10).mean()

model_lr = LogisticRegression(C=5).fit(X_train, y_train)
print classification_report(y_test,model_lr.predict(X_test))
print model_lr.score(X_test,y_test)

0.682823639775
             precision    recall  f1-score   support

          0       0.68      0.87      0.76        55
          1       0.77      0.50      0.61        46

avg / total       0.72      0.70      0.69       101

0.70297029703


Not as accurate as when including current contest data.  Can assume then that activity in the first couple weeks of the contest is __very predictive of winning.__  

Still, let's try some other models and see how they do.



### Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

model_nb = GaussianNB()
print cross_val_score(model_nb, X_train, y_train, cv=10).mean()

model_nb.fit(X_train, y_train)
print classification_report(y_test,model_nb.predict(X_test))
print model_nb.score(X_test, y_test)

0.670064102564
             precision    recall  f1-score   support

          0       0.65      0.87      0.74        55
          1       0.74      0.43      0.55        46

avg / total       0.69      0.67      0.65       101

0.673267326733


### SVM

In [27]:
from sklearn.svm import SVC

model_svc = SVC(kernel="rbf",C=1)
print cross_val_score(model_svc, X_train, y_train, cv=10).mean()

model_svc.fit(X_train, y_train)
print classification_report(y_test,model_svc.predict(X_test))
print model_svc.score(X_test, y_test)

0.705644152595
             precision    recall  f1-score   support

          0       0.69      0.89      0.78        55
          1       0.80      0.52      0.63        46

avg / total       0.74      0.72      0.71       101

0.722772277228


### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
model_dt = DecisionTreeClassifier(max_depth=4)
print cross_val_score(model_dt, X_train, y_train, cv=10).mean()

model_dt.fit(X_train, y_train)
print classification_report(y_test,model_dt.predict(X_test))
print model_dt.score(X_test, y_test)

0.660243902439
             precision    recall  f1-score   support

          0       0.62      0.76      0.68        55
          1       0.61      0.43      0.51        46

avg / total       0.61      0.61      0.60       101

0.613861386139


In [19]:
dt_importances = pd.DataFrame(zip(features.columns, model_dt.feature_importances_))
dt_importances.sort_values(1, ascending=False).head() # most to least predictive of being 0 ??? because negative? 

Unnamed: 0,0,1
4,Expected Daily Average,0.4215
1,LifetimeWordCount,0.221274
0,Member Length,0.064714
14,Expected Min Submission,0.057052
17,Expected Max Day,0.055056


Without the data from the current contest, the most important features are Expected Daily Average and LifetimeWordCount, or a writer's average daily writing productivity and how much they've participated in the past

### Random Forests

In [24]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [25]:
model_rf = RandomForestClassifier(max_depth=4, n_estimators=100, max_features=2)
print cross_val_score(model_rf, X_train, y_train, cv=10).mean()

model_rf.fit(X_train, y_train)
print classification_report(y_test,model_rf.predict(X_test))
print model_rf.score(X_test, y_test)

0.690390869293
             precision    recall  f1-score   support

          0       0.72      0.89      0.80        55
          1       0.82      0.59      0.68        46

avg / total       0.77      0.75      0.75       101

0.752475247525


In [26]:
rf_importances = pd.DataFrame(zip(features.columns, model_rf.feature_importances_))
rf_importances.sort_values(1, ascending=False).head() # most to least predictive of being 0 ??? because negative? 

Unnamed: 0,0,1
4,Expected Daily Average,0.123783
1,LifetimeWordCount,0.110945
3,Expected Final Word Count,0.100577
9,Consecutive Wins,0.077699
5,Wins,0.059649


Random Forests and Support Vector Machines do best in predicting.  