# Intro

This notebook picks up where [my last one](./kobe-part1.ipynb) stopped, so I'll quickly repeat all the modifications I've already made and then continue with the new material.

In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('whitegrid')

# Load the raw shot log.
df = pd.read_csv('../../#data-sets/kobe_data.csv')

# Columns queued for removal once every derived feature has been built.
not_needed = []

not_needed.extend(['game_event_id', 'game_id'])

not_needed.extend(['lon', 'lat'])

# Merge the two clock columns into a single "seconds remaining" value.
df['time_remaining'] = 60 * df['minutes_remaining'] + df['seconds_remaining']
not_needed.extend(['minutes_remaining', 'seconds_remaining'])

# Keep only the first four characters of the season label and make it numeric.
df['season'] = pd.to_numeric(df['season'].str[:4])

# Distance recomputed from the court coordinates, next to the provided value.
dist = pd.DataFrame({'true_dist': np.sqrt((df['loc_x']/10)**2 + (df['loc_y']/10)**2),
                     'shot_dist': df['shot_distance']})
df['shot_distance_'] = dist['true_dist']
not_needed.append('shot_distance')

# 1 when the shot type mentions '3PT', otherwise 0.
df['3pt_goal'] = df['shot_type'].str.contains('3PT').astype('int')
not_needed.append('shot_type')

not_needed.append('shot_zone_range')

not_needed.extend(['team_id', 'team_name'])

# Split the game date into year / month / day-of-week (0 = Monday).
df['game_date'] = pd.to_datetime(df['game_date'])
df['game_year'] = df['game_date'].dt.year
df['game_month'] = df['game_date'].dt.month
df['game_day'] = df['game_date'].dt.dayofweek
not_needed.append('game_date')

# regex=False: match the literal text 'vs.'; as a regex the dot would match
# any character.
df['home_game'] = df['matchup'].str.contains('vs.', regex=False).astype(int)
not_needed.append('matchup')

df = df.set_index('shot_id')

df = df.drop(not_needed, axis=1)

# Seeded preview so the displayed rows are the same on every run.
random_sample = df.sample(n=10, random_state=2016)
random_sample

Unnamed: 0_level_0,action_type,combined_shot_type,loc_x,loc_y,period,playoffs,season,shot_made_flag,shot_zone_area,shot_zone_basic,opponent,time_remaining,shot_distance_,3pt_goal,game_year,game_month,game_day,home_game
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5983,Jump Shot,Jump Shot,177,176,3,0,2003,1.0,Right Side Center(RC),Above the Break 3,MIN,389,24.96097,1,2004,3,4,1
17953,Jump Shot,Jump Shot,43,67,3,0,2010,0.0,Center(C),In The Paint (Non-RA),UTA,353,7.961156,0,2011,4,1,1
24120,Layup Shot,Layup,34,35,2,0,1997,,Center(C),In The Paint (Non-RA),TOR,497,4.879549,0,1998,3,1,0
26628,Driving Dunk Shot,Dunk,0,0,2,1,2001,,Center(C),Restricted Area,SAS,432,0.0,0,2002,5,6,1
2082,Jump Shot,Jump Shot,144,45,3,0,2001,0.0,Right Side(R),Mid-Range,TOR,662,15.086749,0,2002,1,6,0
13091,Driving Layup Shot,Layup,0,0,4,0,2007,1.0,Center(C),Restricted Area,POR,54,0.0,0,2008,4,1,0
4827,Jump Shot,Jump Shot,66,160,2,0,2002,0.0,Right Side Center(RC),Mid-Range,SEA,621,17.307802,0,2003,3,6,0
29048,Jump Shot,Jump Shot,-190,190,4,1,2009,0.0,Left Side Center(LC),Above the Break 3,OKC,362,26.870058,1,2010,4,6,1
4030,Jump Shot,Jump Shot,89,63,1,0,2002,0.0,Right Side(R),Mid-Range,NJN,79,10.904128,0,2003,1,4,1
741,Jump Shot,Jump Shot,-107,99,2,0,2000,0.0,Left Side(L),Mid-Range,UTA,296,14.57738,0,2001,1,2,1


# New stuff

After we've explored the data [last time](https://www.kaggle.com/narimiran/kobe-bryant-shot-selection/beginners-first-time), it's time to analyze some more.

## Action types

In [3]:
# Number of shot attempts per action type, most frequent first.
df.loc[:, 'action_type'].value_counts()

Jump Shot                             18880
Layup Shot                             2567
Driving Layup Shot                     1978
Turnaround Jump Shot                   1057
Fadeaway Jump Shot                     1048
Running Jump Shot                       926
Pullup Jump shot                        476
Turnaround Fadeaway shot                439
Slam Dunk Shot                          411
Reverse Layup Shot                      395
Jump Bank Shot                          333
Driving Dunk Shot                       310
Dunk Shot                               262
Tip Shot                                182
Alley Oop Dunk Shot                     122
Step Back Jump shot                     118
Floating Jump shot                      114
Driving Reverse Layup Shot               97
Hook Shot                                84
Driving Finger Roll Shot                 82
Alley Oop Layup shot                     80
Reverse Dunk Shot                        75
Running Layup Shot              

There are too many (57) different action types, and many of them have only a few shots, so we'll keep the 25 most frequent action types and group all the others under a single `other` category.

In [4]:
# Everything ranked below the 25 most frequent action types is relabelled 'other'.
action_counts = df['action_type'].value_counts()
rare_action_types = action_counts.iloc[25:]
rare_actions = rare_action_types.index.values

is_rare = df['action_type'].isin(rare_actions)
df.loc[is_rare, 'action_type'] = 'other'
df['action_type'].value_counts()

Jump Shot                         18880
Layup Shot                         2567
Driving Layup Shot                 1978
Turnaround Jump Shot               1057
Fadeaway Jump Shot                 1048
Running Jump Shot                   926
Pullup Jump shot                    476
other                               449
Turnaround Fadeaway shot            439
Slam Dunk Shot                      411
Reverse Layup Shot                  395
Jump Bank Shot                      333
Driving Dunk Shot                   310
Dunk Shot                           262
Tip Shot                            182
Alley Oop Dunk Shot                 122
Step Back Jump shot                 118
Floating Jump shot                  114
Driving Reverse Layup Shot           97
Hook Shot                            84
Driving Finger Roll Shot             82
Alley Oop Layup shot                 80
Reverse Dunk Shot                    75
Running Layup Shot                   72
Turnaround Bank shot                 71


## Periods and overtime

In [5]:
# Number of shot attempts per period.
df.loc[:, 'period'].value_counts()

3    8296
1    8048
4    7260
2    6718
5     330
6      38
7       7
Name: period, dtype: int64

Fewer than 400 shot attempts (with similar accuracy) were made in overtime periods (periods 5, 6 and 7), so we'll combine them into one category: `overtime`.

In [6]:
overtime = np.array([5, 6, 7])

# Build the mask while the column is still numeric, then cast to object before
# writing the label: assigning a string into an int64 column relies on implicit
# upcasting, which recent pandas versions deprecate.
is_overtime = df['period'].isin(overtime)
df['period'] = df['period'].astype(object)
df.loc[is_overtime, 'period'] = 'overtime'
df['period'].value_counts()

3           8296
1           8048
4           7260
2           6718
overtime     375
Name: period, dtype: int64

## Playoffs

As we've seen earlier there's no difference in accuracy between regular season and playoffs, so column `playoffs` won't be needed.

In [7]:
# The playoffs flag is not used as a feature.
df = df.drop(['playoffs'], axis=1)

# Creating dummies for categorical features

We can't use categorical features so we'll convert them to dummies.

In [8]:
# Show each column's dtype to spot the non-numeric (`object`) columns.
df.dtypes

action_type            object
combined_shot_type     object
loc_x                   int64
loc_y                   int64
period                 object
season                  int64
shot_made_flag        float64
shot_zone_area         object
shot_zone_basic        object
opponent               object
time_remaining          int64
shot_distance_        float64
3pt_goal                int32
game_year               int64
game_month              int64
game_day                int64
home_game               int32
dtype: object

In [9]:
# Columns to one-hot encode; the numeric ones listed here (season, game_year,
# game_month, game_day) are deliberately treated as categories as well.
categorical = ['action_type', 'combined_shot_type', 'shot_zone_area', 'shot_zone_basic',
               'opponent', 'period', 'season', 'game_year', 'game_month', 'game_day']

# A single call replaces the per-column get_dummies / join / in-place drop loop:
# the untouched columns stay first and the '<column>_<value>' indicator columns
# follow in the order given by `categorical`.
df = pd.get_dummies(df, columns=categorical)

df.head()

Unnamed: 0_level_0,loc_x,loc_y,shot_made_flag,time_remaining,shot_distance_,3pt_goal,home_game,action_type_Alley Oop Dunk Shot,action_type_Alley Oop Layup shot,action_type_Driving Dunk Shot,...,game_month_10,game_month_11,game_month_12,game_day_0,game_day_1,game_day_2,game_day_3,game_day_4,game_day_5,game_day_6
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,167,72,,627,18.185984,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-157,0,0.0,622,15.7,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-101,135,1.0,465,16.860012,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,138,175,0.0,412,22.286543,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0,0,1.0,379,0.0,0,0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Separating the data

We split the data into two parts: one for training and evaluating our models, and the other (shots with an unknown outcome) for the submission.

In [10]:
# Rows with a missing target are the ones to predict for the submission.
unknown_shots = df['shot_made_flag'].isnull()

# `axis` is passed by keyword: the positional form drop(label, 1) is rejected
# by pandas 2.x.
submission_data = df[unknown_shots].drop('shot_made_flag', axis=1)
data = df[~unknown_shots]

# Feature matrix and target for the labelled shots.
X = data.drop('shot_made_flag', axis=1)
y = data['shot_made_flag']

# Feature selection

We have 146 features, but we would like to reduce that number and keep only the most important ones.

---

***Big THANK YOU goes to [Norbert Kozlowski](https://www.kaggle.com/khozzy) and [his script](https://www.kaggle.com/khozzy/kobe-bryant-shot-selection/kobe-shots-show-me-your-best-model/) which helped me a lot to make all of the code from now till the end of the notebook.***

---

In [11]:
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## Variance Threshold

In [12]:
threshold = 0.9
vt = VarianceThreshold().fit(X)

# Keep the columns whose variance exceeds threshold * (1 - threshold).
min_variance = threshold * (1 - threshold)
high_variance = vt.variances_ > min_variance
feat_var_threshold = X.columns[high_variance].values
feat_var_threshold

array(['loc_x', 'loc_y', 'time_remaining', 'shot_distance_', '3pt_goal',
       'home_game', 'action_type_Jump Shot',
       'combined_shot_type_Jump Shot', 'combined_shot_type_Layup',
       'shot_zone_area_Center(C)', 'shot_zone_area_Left Side Center(LC)',
       'shot_zone_area_Left Side(L)',
       'shot_zone_area_Right Side Center(RC)',
       'shot_zone_area_Right Side(R)', 'shot_zone_basic_Above the Break 3',
       'shot_zone_basic_In The Paint (Non-RA)',
       'shot_zone_basic_Mid-Range', 'shot_zone_basic_Restricted Area',
       'period_1', 'period_2', 'period_3', 'period_4', 'game_month_1',
       'game_month_2', 'game_month_3', 'game_month_4', 'game_month_11',
       'game_month_12', 'game_day_1', 'game_day_2', 'game_day_4',
       'game_day_6'], dtype=object)

## Random Forest Classifier

In [13]:
# Seed the forest so the importance ranking (and therefore the selected
# features) is reproducible between runs; 2016 matches the `seed` value used
# further down in the notebook.
model = RandomForestClassifier(random_state=2016)
model.fit(X, y)

# Rank features by importance and keep the names of the top 35.
feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])
feat_RFC = feature_imp.sort_values("importance", ascending=False).head(35)

feat_RFC = feat_RFC.index.values
feat_RFC

array(['time_remaining', 'shot_distance_', 'loc_x', 'loc_y',
       'action_type_Jump Shot', 'combined_shot_type_Dunk', 'home_game',
       'combined_shot_type_Jump Shot', 'period_3',
       'action_type_Layup Shot', 'period_2', 'period_1', 'game_day_6',
       'game_day_4', 'game_day_1', 'period_4',
       'action_type_Running Jump Shot', 'game_month_1', 'game_day_2',
       'game_month_2', 'game_month_3', 'game_month_11', 'game_month_4',
       'game_month_12', 'game_day_3', 'shot_zone_area_Center(C)',
       'game_day_0', 'opponent_SAC', 'game_month_5', 'opponent_SAS',
       'opponent_HOU', 'opponent_PHX', 'season_2008', 'opponent_DEN',
       'game_day_5'], dtype=object)

## Recursive feature elimination (RFE)

In [14]:
# Pass the feature count by keyword: current scikit-learn releases no longer
# accept n_features_to_select positionally.
rfe = RFE(LogisticRegression(), n_features_to_select=35)
rfe.fit(X, y)

feature_rfe_scoring = pd.DataFrame({'feature': X.columns, 'score': rfe.ranking_})

# Rank 1 marks the features RFE kept.
feat_rfe = feature_rfe_scoring[feature_rfe_scoring['score'] == 1]['feature'].values
feat_rfe

array(['action_type_Alley Oop Dunk Shot',
       'action_type_Alley Oop Layup shot', 'action_type_Driving Dunk Shot',
       'action_type_Driving Finger Roll Layup Shot',
       'action_type_Driving Finger Roll Shot',
       'action_type_Driving Layup Shot',
       'action_type_Driving Reverse Layup Shot', 'action_type_Dunk Shot',
       'action_type_Fadeaway Jump Shot', 'action_type_Hook Shot',
       'action_type_Jump Bank Shot', 'action_type_Jump Shot',
       'action_type_Layup Shot', 'action_type_Pullup Jump shot',
       'action_type_Running Layup Shot', 'action_type_Slam Dunk Shot',
       'action_type_Tip Shot', 'action_type_Turnaround Fadeaway shot',
       'action_type_Turnaround Jump Shot', 'combined_shot_type_Bank Shot',
       'combined_shot_type_Dunk', 'combined_shot_type_Layup',
       'combined_shot_type_Tip Shot', 'shot_zone_area_Back Court(BC)',
       'shot_zone_area_Center(C)', 'shot_zone_area_Left Side Center(LC)',
       'shot_zone_area_Left Side(L)',
       'shot

## Putting it all together

In [15]:
# Union of the three selections, de-duplicated and sorted by np.unique.
features = np.unique(np.concatenate([feat_var_threshold, feat_RFC, feat_rfe]))

print('Final features set:\n')
for name in features:
    print("-{}".format(name))

len(features)

Final features set:

-3pt_goal
-action_type_Alley Oop Dunk Shot
-action_type_Alley Oop Layup shot
-action_type_Driving Dunk Shot
-action_type_Driving Finger Roll Layup Shot
-action_type_Driving Finger Roll Shot
-action_type_Driving Layup Shot
-action_type_Driving Reverse Layup Shot
-action_type_Dunk Shot
-action_type_Fadeaway Jump Shot
-action_type_Hook Shot
-action_type_Jump Bank Shot
-action_type_Jump Shot
-action_type_Layup Shot
-action_type_Pullup Jump shot
-action_type_Running Jump Shot
-action_type_Running Layup Shot
-action_type_Slam Dunk Shot
-action_type_Tip Shot
-action_type_Turnaround Fadeaway shot
-action_type_Turnaround Jump Shot
-combined_shot_type_Bank Shot
-combined_shot_type_Dunk
-combined_shot_type_Jump Shot
-combined_shot_type_Layup
-combined_shot_type_Tip Shot
-game_day_0
-game_day_1
-game_day_2
-game_day_3
-game_day_4
-game_day_5
-game_day_6
-game_month_1
-game_month_11
-game_month_12
-game_month_2
-game_month_3
-game_month_4
-game_month_5
-home_game
-loc_x
-loc_y


71

We'll make new datasets with only those columns.

In [16]:
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based
# column selection.
submission_data = submission_data.loc[:, features]
data = data.loc[:, features]
X = X.loc[:, features]

# Testing different algorithms

In [17]:
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [18]:
seed = 2016
num_folds = 5
num_instances = len(X)
jobs = -1

scoring = 'log_loss'

kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)

## Logistic regression

In [19]:
# Baseline: logistic regression with default hyper-parameters.
model = LogisticRegression()

result = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=kfold)
print("({0:.4f}) +/- ({1:.4f})".format(np.mean(result), np.std(result)))

(-0.6135) +/- (0.0057)


## K-nearest neighbors

In [20]:
# Baseline: standardise the features, then 50-nearest-neighbours.
knn_steps = [
    ('std_sc', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=50, n_jobs=jobs)),
]
model = Pipeline(knn_steps)

result = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=kfold)
print("({0:.4f}) +/- ({1:.4f})".format(np.mean(result), np.std(result)))

(-0.6315) +/- (0.0051)


## Random forest

In [21]:
# Baseline: 200-tree random forest, seeded like the other ensemble baselines
# so the reported score is reproducible.
model = RandomForestClassifier(n_estimators=200, n_jobs=jobs, random_state=seed)

result = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print("({0:.4f}) +/- ({1:.4f})".format(result.mean(), result.std()))

(-0.6351) +/- (0.0049)


## Ada boost

In [22]:
# Baseline: AdaBoost with default hyper-parameters.
model = AdaBoostClassifier(random_state=seed)

results = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=kfold, n_jobs=jobs)
print("({0:.4f}) +/- ({1:.4f})".format(np.mean(results), np.std(results)))

(-0.6885) +/- (0.0007)


## Gradient Boosting

In [23]:
# Baseline: gradient boosting with default hyper-parameters.
model = GradientBoostingClassifier(random_state=seed)

results = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=kfold, n_jobs=jobs)
print("({0:.4f}) +/- ({1:.4f})".format(np.mean(results), np.std(results)))

(-0.6089) +/- (0.0050)


## Linear Discriminant Analysis (LDA)

In [24]:
# Baseline: linear discriminant analysis with default hyper-parameters.
model = LinearDiscriminantAnalysis()

results = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=kfold, n_jobs=jobs)
print("({0:.4f}) +/- ({1:.4f})".format(np.mean(results), np.std(results)))

(-0.6156) +/- (0.0061)


# Grid search

We'll use `GridSearchCV` to find the best parameters for each algorithm that we used above.

## Logistic regression

In [25]:
# The grid includes the 'l1' penalty, so pin the liblinear solver explicitly:
# it supports both penalties, whereas the default solver in current
# scikit-learn (lbfgs) cannot fit 'l1'.
lr_grid = GridSearchCV(estimator = LogisticRegression(solver='liblinear', random_state=seed),
                       param_grid = {'penalty': ['l1', 'l2'],
                                     'C': [0.001, 0.01, 0.1, 1, 10, 100]},
                       cv = kfold,
                       scoring = scoring)

lr_grid.fit(X, y)

print(lr_grid.best_score_)
print(lr_grid.best_params_)

-0.613491404182
{'C': 1, 'penalty': 'l1'}


## K-nearest neighbors

In [26]:
# Scale first, then classify; grid keys use the '<step>__<param>' form.
knn_pipeline = Pipeline([
    ('st_sc', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=jobs)),
])
knn_param_grid = {
    'knn__n_neighbors': [20, 50, 80],
    'knn__weights': ['uniform'],
    'knn__algorithm': ['ball_tree'],
    'knn__p': [1, 2],
}

knn_grid = GridSearchCV(estimator=knn_pipeline,
                        param_grid=knn_param_grid,
                        cv=kfold,
                        scoring=scoring)
knn_grid.fit(X, y)

print(knn_grid.best_score_)
print(knn_grid.best_params_)

-0.622649847237
{'knn__weights': 'uniform', 'knn__n_neighbors': 80, 'knn__algorithm': 'ball_tree', 'knn__p': 2}


## Random forest

In [27]:
# 'sqrt' replaces the removed max_features='auto' alias; for a classifier both
# mean sqrt(n_features), so the search space is unchanged.
rf_grid = GridSearchCV(estimator = RandomForestClassifier(warm_start=True, random_state=seed, n_jobs=jobs),
                       param_grid = {'n_estimators': [100, 200],
                                     'criterion': ['entropy'],
                                     'max_features': ['sqrt', 20],
                                     'max_depth': [None, 10]},
                       cv = kfold,
                       scoring = scoring)

rf_grid.fit(X, y)

print(rf_grid.best_score_)
print(rf_grid.best_params_)

-0.606956943641
{'n_estimators': 200, 'max_features': 20, 'max_depth': 10, 'criterion': 'entropy'}


## Ada boost

In [28]:
# Search over boosting variant, ensemble size and learning rate.
ada_param_grid = {
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators': [10, 25, 50, 100, 150],
    'learning_rate': [1e-3, 1e-2, 1e-1, 1],
}

ada_grid = GridSearchCV(estimator=AdaBoostClassifier(random_state=seed),
                        param_grid=ada_param_grid,
                        cv=kfold,
                        scoring=scoring,
                        n_jobs=jobs)
ada_grid.fit(X, y)

print(ada_grid.best_score_)
print(ada_grid.best_params_)

-0.641203422605
{'learning_rate': 0.001, 'n_estimators': 10, 'algorithm': 'SAMME.R'}


## Gradient Boosting

In [29]:
# 'sqrt' replaces the removed max_features='auto' alias; for the gradient
# boosting classifier both mean sqrt(n_features).
gbm_grid = GridSearchCV(estimator = GradientBoostingClassifier(warm_start=True, random_state=seed),
                        param_grid = {'n_estimators': [50, 100, 200],
                                      'max_depth': [3, 5],
                                      'max_features': ['sqrt', 'log2'],
                                      'learning_rate': [0.01, 0.1]},
                        cv = kfold,
                        scoring = scoring,
                        n_jobs = jobs)

gbm_grid.fit(X, y)

print(gbm_grid.best_score_)
print(gbm_grid.best_params_)

-0.608377981197
{'n_estimators': 200, 'max_features': 'log2', 'learning_rate': 0.1, 'max_depth': 3}


## LDA

In [30]:
# n_components only matters for LDA's transform() (dimensionality reduction),
# not for the probabilities scored here, and it may not exceed
# min(n_classes - 1, n_features), i.e. 1 for a made/missed target. The former
# values 2, 5 and 10 only repeated the same fit (and are rejected by recent
# scikit-learn), so only None is kept; the key stays so best_params_ keeps its
# shape.
lda_grid = GridSearchCV(estimator = LinearDiscriminantAnalysis(),
                        param_grid = {'solver': ['lsqr'],
                                      'shrinkage': [None, 'auto'],
                                      'n_components': [None]},
                        cv = kfold,
                        scoring = scoring,
                        n_jobs = jobs)

lda_grid.fit(X, y)

print(lda_grid.best_score_)
print(lda_grid.best_params_)

-0.615563850322
{'solver': 'lsqr', 'n_components': None, 'shrinkage': 'auto'}


## Grid search summary

In [31]:
# Best score and parameters of every search, separated by blank lines.
grid_results = [('lr', lr_grid), ('knn', knn_grid), ('rf', rf_grid),
                ('ada', ada_grid), ('gbm', gbm_grid), ('lda', lda_grid)]

for position, (label, grid) in enumerate(grid_results):
    if position:
        print()
    print(label, grid.best_score_)
    print(grid.best_params_)

lr -0.613491404182
{'C': 1, 'penalty': 'l1'}

knn -0.622649847237
{'knn__weights': 'uniform', 'knn__n_neighbors': 80, 'knn__algorithm': 'ball_tree', 'knn__p': 2}

rf -0.606956943641
{'n_estimators': 200, 'max_features': 20, 'max_depth': 10, 'criterion': 'entropy'}

ada -0.641203422605
{'learning_rate': 0.001, 'n_estimators': 10, 'algorithm': 'SAMME.R'}

gbm -0.608377981197
{'n_estimators': 200, 'max_features': 'log2', 'learning_rate': 0.1, 'max_depth': 3}

lda -0.615563850322
{'solver': 'lsqr', 'n_components': None, 'shrinkage': 'auto'}


# Voting classifier

After lots of trial-and-error, I decided not to use ADA (the algorithm with the worst score).

In [32]:
# (name, estimator) pairs for the soft-voting ensemble. The gradient boosting
# member is now seeded like the other stochastic members so the ensemble score
# is reproducible.
estimators = [('lr', LogisticRegression(C=10, penalty='l2', random_state=seed)),
              ('knn', Pipeline([('st_sc', StandardScaler()),
                                ('knn', KNeighborsClassifier(leaf_size=10, n_neighbors=80,
                                                             algorithm='ball_tree'))])),
              ('rf', RandomForestClassifier(warm_start=True, max_features=20, n_estimators=400,
                                            max_depth=10, criterion='entropy', random_state=seed)),
              ('gbm', GradientBoostingClassifier(max_depth=3, learning_rate=0.1, n_estimators=200,
                                                 max_features='log2', random_state=seed)),
              ('lda', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'))]

In [33]:
# Soft voting averages the predicted probabilities using the given weights
# (listed in the same order as `estimators`).
voters = VotingClassifier(estimators, voting='soft', weights=[4, 3, 5, 5, 4])

results = cross_val_score(voters, X, y, cv=kfold, scoring=scoring, n_jobs=jobs)
# '.4f' (fixed four decimals) for both numbers, matching every other score
# printed in this notebook; the previous '.4' meant four significant digits.
print("({0:.4f}) +/- ({1:.4f})".format(results.mean(), results.std()))

(-0.6089) +/- (0.0049)
