In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer, QuantileTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import log_loss 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from mlens.ensemble import SuperLearner
from mlens.preprocessing import Subset
from mlens.model_selection import Evaluator

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings
from sklearn.utils import shuffle
warnings.simplefilter("ignore")
%matplotlib inline 




# Loading Data

1. Extracting features and targets from the data
2. Converting 'era' column to integer to input it to the model



In [2]:
training_data = 'training_data.csv'
tournament_data = 'tournament_data.csv'

In [3]:
train = pd.read_csv(training_data)
tournament = pd.read_csv(tournament_data)

In [4]:
features = [f for f in list(train) if "feature" in f]
targets = ['target_bernie']
features.insert(0, 'era')

X = train[features]
Y = train[targets]

In [5]:
X['era'] = X['era'].apply(lambda x: re.sub('[era]', '', x))

# Shuffle & Split Data

In [6]:
X, Y = shuffle(X, Y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

print "Train_x Shape :: ", X_train.shape
print "Train_y Shape :: ", y_train.shape
print "Test_x Shape :: ", X_test.shape
print "Test_y Shape :: ", y_test.shape

Train_x Shape ::  (336830, 51)
Train_y Shape ::  (336830, 1)
Test_x Shape ::  (165902, 51)
Test_y Shape ::  (165902, 1)


# Model Transformer

In [8]:
#source: https://blomadam.github.io/

class ModelTransformer(BaseEstimator,TransformerMixin):

    def __init__(self, model=None):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return self.model.transform(X)
    

# Machine Learning
1. Parameter Optimization
2. SGD for prediction

In [11]:
from sklearn.model_selection import GridSearchCV

pipeline_sgd = Pipeline([
        ('scale', ModelTransformer()),
        ('fit', SGDClassifier(loss='log', random_state=1)),
])

params_sgd = {
    'scale__model': [MinMaxScaler()],
    'fit__penalty': ['l1', 'l2', 'elasticnet'],
    'fit__alpha':[0.0001, .001, .01],
    'fit__l1_ratio':[.05,.15,.25],
}
    
SGD = GridSearchCV(pipeline_sgd, param_grid=params_sgd, cv=3, scoring='neg_log_loss', verbose=1, n_jobs=-1)
SGD.fit(X_train,y_train)
print('best cv score', SGD.best_score_)
print('best paramas', SGD.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  3.1min finished


('best cv score', -0.6924584390021604)
('best paramas', {'fit__alpha': 0.001, 'scale__model': MinMaxScaler(copy=True, feature_range=(0, 1)), 'fit__penalty': 'elasticnet', 'fit__l1_ratio': 0.15})


# Accuracy

In [14]:
predictions = SGD.predict(X_test)
print "Train Accuracy :", accuracy_score(y_train, SGD.predict(X_train))
print "Test Accuracy  :", accuracy_score(y_test, predictions)

Train Accuracy : 0.516444497224119
Test Accuracy  : 0.513375366180034


In [15]:
tournament['era'] = tournament['era'].apply(lambda x: re.sub('[era]', '', x))

In [16]:
tournament = tournament[tournament.data_type == 'validation']
x_prediction = tournament[features]
y_prediction = SGD.predict_proba(x_prediction)
results = y_prediction[:, 1]

In [17]:
ids = tournament['id']

# Create your submission
results_df = pd.DataFrame(data={'probability': results})
joined = pd.DataFrame(ids).join(results_df)
print "joined:", joined.head()

# Save the predictions out to a CSV file.
joined.to_csv("bernie_submission3.csv", index=False)

joined:                  id  probability
0  n00183d2eed2c463     0.498673
1  n00187fa497c9d5b     0.504528
2  n001acd04f3617b7     0.469130
3  n0043a13252683ee     0.504412
4  n00535e274720abd     0.496509


# Saving Results

In [19]:
submission = pd.read_csv("bernie_submission3.csv")

# v = test[ test.data_type == 'validation' ].copy()
validate_set = tournament.copy()

validate_set = validate_set.merge( submission, on = 'id', how = 'left' )

In [20]:
# test.data_type.value_counts()
validate_set.columns

Index([u'id', u'era', u'data_type', u'feature1', u'feature2', u'feature3',
       u'feature4', u'feature5', u'feature6', u'feature7', u'feature8',
       u'feature9', u'feature10', u'feature11', u'feature12', u'feature13',
       u'feature14', u'feature15', u'feature16', u'feature17', u'feature18',
       u'feature19', u'feature20', u'feature21', u'feature22', u'feature23',
       u'feature24', u'feature25', u'feature26', u'feature27', u'feature28',
       u'feature29', u'feature30', u'feature31', u'feature32', u'feature33',
       u'feature34', u'feature35', u'feature36', u'feature37', u'feature38',
       u'feature39', u'feature40', u'feature41', u'feature42', u'feature43',
       u'feature44', u'feature45', u'feature46', u'feature47', u'feature48',
       u'feature49', u'feature50', u'target_bernie', u'target_elizabeth',
       u'target_jordan', u'target_ken', u'target_charles', u'target_frank',
       u'target_hillary', u'probability'],
      dtype='object')

# Logloss & Consistency

In [21]:
from sklearn import metrics
from math import log

eras = validate_set.era.unique()
print len(eras)

good_eras = 0

for era in eras:
    tmp = validate_set[ validate_set.era == era ]
    logloss = metrics.log_loss( tmp.target_bernie, tmp.probability )
    print logloss
    is_good = logloss < -log( 0.5 )

    if is_good:
        good_eras += 1

    print( "{} {} {:.2%} {}".format( era, len( tmp ), logloss, is_good ))

consistency = good_eras / float( len( eras ))
print( "\nconsistency: {:.1%} ({}/{})".format( consistency, good_eras, len( eras )))

logloss = metrics.log_loss( validate_set.target_bernie, validate_set.probability )
print( "log loss:    {:.2%}\n".format( logloss ))

12
0.692720388656414
121 4540 69.27% True
0.691205693145769
122 4626 69.12% True
0.6920470223984226
123 4582 69.20% True
0.6926536605515791
124 4604 69.27% True
0.6926924792479007
125 4673 69.27% True
0.6922338368682034
126 4663 69.22% True
0.6942990417297664
127 4675 69.43% False
0.6923515812126897
128 4632 69.24% True
0.6906388746936871
129 4713 69.06% True
0.6925646454970055
130 4752 69.26% True
0.6925314536910303
131 4817 69.25% True
0.6938763903022958
132 4807 69.39% False

consistency: 83.3% (10/12)
log loss:    69.25%

