## VotingClassifier - [ BaggingClassifier, RandomForestClassifier, XGBClassifier ]

In [23]:
import pandas as pd
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier,OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier


In [24]:
training_data = pd.read_csv('train.csv')
training_data.columns

Index(['Won_Championship', 'Previous_SB_Wins', 'Number_Of_Wins_This_Season',
       'Number_Of_First_Round_Draft_Picks', 'Team_Value', 'Playing_Style',
       'Average_Player_Age', 'Number_Of_Injured_Players',
       'Coach_Experience_Level', 'ID'],
      dtype='object')

In [25]:
le_Team_Value = LabelEncoder()
training_data['Team_Value'] = le_Team_Value.fit_transform(training_data['Team_Value'])

le_Playing_Style = LabelEncoder()
training_data['Playing_Style'] = le_Playing_Style.fit_transform(training_data['Playing_Style'])

le_Number_Of_Injured_Players = LabelEncoder()
training_data['Number_Of_Injured_Players'] = le_Number_Of_Injured_Players.fit_transform(training_data['Number_Of_Injured_Players'])

le_Coach_Experience_Level = LabelEncoder()
training_data['Coach_Experience_Level'] = le_Coach_Experience_Level.fit_transform(training_data['Coach_Experience_Level'])

Unnamed: 0,Won_Championship,Previous_SB_Wins,Number_Of_Wins_This_Season,Number_Of_First_Round_Draft_Picks,Average_Player_Age,Number_Of_Injured_Players,Coach_Experience_Level
0,0,3,13,2,27,1,2
1,0,2,14,2,26,1,2
2,1,2,13,1,27,2,2
3,0,2,12,2,27,6,2
4,0,1,15,2,26,1,2


### Dropping columns -- [ 'ID','Team_Value','Playing_Style','Won_Championship' ]

In [26]:
y = training_data.Won_Championship
training_data = training_data.drop(columns=['ID','Team_Value','Playing_Style','Won_Championship'],axis=1)

In [27]:
x_train,x_test, y_train, y_test = train_test_split(training_data,y,test_size=0.2)

In [16]:
bag = BaggingClassifier(n_estimators=100,oob_score=True,bootstrap_features=True)
bag.fit(x_train,y_train)
#bag.fit(training_data,y)
prediction = bag.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

77.6871756856931

In [17]:
xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=10,reg_lambda=0.1,importance_type='total_gain')
xgb.fit(x_train,y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=0.1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
prediction = xgb.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

78.00149142431022

In [19]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
prediction = rfc.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

77.08649468892261

In [30]:
bag = BaggingClassifier(n_estimators=100,oob_score=True,bootstrap_features=True)
xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=10,reg_lambda=0.1,importance_type='total_gain')
rfc = RandomForestClassifier(oob_score=True)

voting = VotingClassifier(estimators=[
        ('bag', bag), ('rfc', rfc), ('xgb', xgb)], voting='hard')
voting.fit(training_data, y)

VotingClassifier(estimators=[('bag',
                              BaggingClassifier(base_estimator=None,
                                                bootstrap=True,
                                                bootstrap_features=True,
                                                max_features=1.0,
                                                max_samples=1.0,
                                                n_estimators=100, n_jobs=None,
                                                oob_score=True,
                                                random_state=None, verbose=0,
                                                warm_start=False)),
                             ('rfc',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                    

In [31]:
prediction = voting.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

78.51301115241634

In [35]:
cols = training_data.columns
test_data = pd.read_csv('test.csv')
event_id = test_data['ID']

print(test_data.shape)
test_data = test_data.drop(columns=['ID','Team_Value','Playing_Style'],axis=1)

test_data['Number_Of_Injured_Players'] = le_Number_Of_Injured_Players.fit_transform(test_data['Number_Of_Injured_Players'])

test_data['Coach_Experience_Level'] = le_Coach_Experience_Level.fit_transform(test_data['Coach_Experience_Level'])


predictions = voting.predict(test_data)
result_df = pd.DataFrame({'ID':event_id,'Won_Championship':predictions})
result_df.to_csv('Prediction.csv',index=False)

(3500, 9)


#### Online ACCURACY - 75.21, when local accuracy - 78.51 on whole data (VotingClassifier)