In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
training_data = 'training_data.csv'
tournament_data = 'tournament_data.csv'

# Loading Data
1. Extracting features and targets from the data
2. Converting 'era' column to integer to input it to the model

In [3]:
train = pd.read_csv(training_data)

In [4]:
features = [f for f in list(train) if "feature" in f]
targets = ['target_bernie']
features.insert(0, 'era')

X = train[features]
Y = train[targets]

In [5]:
X['era'] = X['era'].apply(lambda x: re.sub('[era]', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [6]:
#X = pd.get_dummies(X)

# Shuffle & Split data

In [7]:
X, Y = shuffle(X, Y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

print "Train_x Shape :: ", X_train.shape
print "Train_y Shape :: ", y_train.shape
print "Test_x Shape :: ", X_test.shape
print "Test_y Shape :: ", y_test.shape

Train_x Shape ::  (336830, 51)
Train_y Shape ::  (336830, 1)
Test_x Shape ::  (165902, 51)
Test_y Shape ::  (165902, 1)


In [None]:
# clf = RandomForestClassifier(n_jobs=-1)
# clf.fit(X_train, y_train)

# Machine Learning 
1. Using GridSearchCV for hyper parameter optimization
2. RandomForestClassifier for classification

In [8]:
from sklearn.model_selection import GridSearchCV

parameters = {
        'n_estimators': [ 20,25 ],
        'random_state': [ 0 ],
        'max_features': [ 2 ],
        'min_samples_leaf': [150,200,250]
}

model = RandomForestClassifier(n_jobs=-1)
clf = GridSearchCV(estimator=model, param_grid=parameters)
clf.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [20, 25], 'max_features': [2], 'random_state': [0], 'min_samples_leaf': [150, 200, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

# Accuracy

In [9]:
predictions = clf.predict(X_test)
print "Train Accuracy :", accuracy_score(y_train, clf.predict(X_train))
print "Test Accuracy  :", accuracy_score(y_test, predictions)

Train Accuracy : 0.6080901344892082
Test Accuracy  : 0.5145929524659136


In [10]:
test = pd.read_csv(tournament_data)

In [11]:
test['era'] = test['era'].apply(lambda x: re.sub('[era]', '', x))
# test['era'] = test['era'].apply(lambda x: re.sub('[X]', '187', x))

In [12]:
test = test[test.data_type == 'validation']
x_prediction = test[features]
y_prediction = clf.predict_proba(x_prediction)
results = y_prediction[:, 1]

In [13]:
ids = test['id']

# Create your submission
results_df = pd.DataFrame(data={'probability': results})
joined = pd.DataFrame(ids).join(results_df)
print "joined:", joined.head()

# Save the predictions out to a CSV file.
joined.to_csv("bernie_submission2.csv", index=False)

joined:                  id  probability
0  n00183d2eed2c463     0.530454
1  n00187fa497c9d5b     0.526056
2  n001acd04f3617b7     0.473081
3  n0043a13252683ee     0.515409
4  n00535e274720abd     0.512697


# Saving Results

In [14]:
submission = pd.read_csv("bernie_submission2.csv")

# v = test[ test.data_type == 'validation' ].copy()
validate_set = test.copy()

validate_set = validate_set.merge( s, on = 'id', how = 'left' )

In [15]:
# test.data_type.value_counts()
v.columns

Index([u'id', u'era', u'data_type', u'feature1', u'feature2', u'feature3',
       u'feature4', u'feature5', u'feature6', u'feature7', u'feature8',
       u'feature9', u'feature10', u'feature11', u'feature12', u'feature13',
       u'feature14', u'feature15', u'feature16', u'feature17', u'feature18',
       u'feature19', u'feature20', u'feature21', u'feature22', u'feature23',
       u'feature24', u'feature25', u'feature26', u'feature27', u'feature28',
       u'feature29', u'feature30', u'feature31', u'feature32', u'feature33',
       u'feature34', u'feature35', u'feature36', u'feature37', u'feature38',
       u'feature39', u'feature40', u'feature41', u'feature42', u'feature43',
       u'feature44', u'feature45', u'feature46', u'feature47', u'feature48',
       u'feature49', u'feature50', u'target_bernie', u'target_elizabeth',
       u'target_jordan', u'target_ken', u'target_charles', u'target_frank',
       u'target_hillary', u'probability'],
      dtype='object')

# Calculating Logloss & Consistency

In [16]:
from sklearn import metrics
from math import log

eras = validate_set.era.unique()
print len(eras)

good_eras = 0

for era in eras:
    tmp = validate_set[ validate_set.era == era ]
    logloss = metrics.log_loss( tmp.target_bernie, tmp.probability )
    print logloss
    is_good = logloss < -log( 0.5 )

    if is_good:
        good_eras += 1

    print( "{} {} {:.2%} {}".format( era, len( tmp ), logloss, is_good ))

consistency = good_eras / float( len( eras ))
print( "\nconsistency: {:.1%} ({}/{})".format( consistency, good_eras, len( eras )))

logloss = metrics.log_loss( validate_set.target_bernie, validate_set.probability )
print( "log loss:    {:.2%}\n".format( logloss ))

12
0.6923081173738794
121 4540 69.23% True
0.6912390382114364
122 4626 69.12% True
0.6932699624682883
123 4582 69.33% False
0.6917412360787164
124 4604 69.17% True
0.6933232899066735
125 4673 69.33% False
0.6926990963416928
126 4663 69.27% True
0.6941678699649289
127 4675 69.42% False
0.6931102180697901
128 4632 69.31% True
0.6909974837907648
129 4713 69.10% True
0.6917604609580587
130 4752 69.18% True
0.6926105869193974
131 4817 69.26% True
0.694854072438013
132 4807 69.49% False

consistency: 66.7% (8/12)
log loss:    69.27%

