## VotingClassifier - [ BalancedBaggingClassifier, BalancedRandomForestClassifier, XGBClassifier ]

In [25]:
import pandas as pd
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression

In [38]:
training_data = pd.read_csv('new_appended.csv')
print(training_data.shape)
training_data.columns

(8250, 10)


Index(['Average_Player_Age', 'Coach_Experience_Level', 'ID',
       'Number_Of_First_Round_Draft_Picks', 'Number_Of_Injured_Players',
       'Number_Of_Wins_This_Season', 'Playing_Style', 'Previous_SB_Wins',
       'Team_Value', 'Won_Championship'],
      dtype='object')

### Dropping columns -- [ 'ID','Team_Value','Playing_Style','Won_Championship','Previous_SB_Wins' ]

In [39]:
y = training_data.Won_Championship
training_data = training_data.drop(columns=['Won_Championship','ID','Team_Value','Playing_Style','Previous_SB_Wins'],axis=1)

le_Number_Of_Injured_Players = LabelEncoder()
training_data['Number_Of_Injured_Players'] = le_Number_Of_Injured_Players.fit_transform(training_data['Number_Of_Injured_Players'])

le_Coach_Experience_Level = LabelEncoder()
training_data['Coach_Experience_Level'] = le_Coach_Experience_Level.fit_transform(training_data['Coach_Experience_Level'])
training_data.head()

Unnamed: 0,Average_Player_Age,Coach_Experience_Level,Number_Of_First_Round_Draft_Picks,Number_Of_Injured_Players,Number_Of_Wins_This_Season
0,27,2,2,1,13
1,26,2,2,1,14
2,27,2,1,2,13
3,27,2,2,6,12
4,26,2,2,1,15


### Performing OverSampling

In [40]:
from imblearn.over_sampling import SVMSMOTE

sm = SVMSMOTE()
X,Y = sm.fit_resample(training_data,y)
print(X.shape,'---',training_data.shape)
print(Y.shape,'---',len(y))

(8278, 5) --- (8250, 5)
(8278,) --- 8250


In [41]:
x_train,x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)

In [42]:
bags = BalancedBaggingClassifier(n_estimators=500,oob_score=True,bootstrap_features=True,
                                 replacement=True,sampling_strategy='all')
bags.fit(x_train,y_train)
#bags.fit(training_data,y)
prediction = bags.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

81.67664670658682

In [43]:
bal_rfc = BalancedRandomForestClassifier(class_weight='balanced_subsample',criterion='entropy',
                                         sampling_strategy='all',replacement=True,max_depth=50)
bal_rfc.fit(x_train,y_train)
#bal_rfc.fit(training_data,y)
prediction = bal_rfc.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

82.00238379022646

In [44]:
xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=50,reg_lambda=0.1,
                    importance_type='total_gain')
xgb.fit(x_train,y_train)
#xgb.fit(training_data,y)
prediction = xgb.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

81.78041543026707

In [45]:
dtc = DecisionTreeClassifier(criterion='entropy',class_weight='balanced')
dtc.fit(x_train,y_train)
#xgb.fit(training_data,y)
prediction = dtc.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

81.89349112426035

In [46]:
bag = BalancedBaggingClassifier(n_estimators=500,oob_score=True,bootstrap_features=True,
                                 replacement=True,sampling_strategy='all')
xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=10,reg_lambda=0.1,
                    importance_type='total_gain')
bal_rfc = BalancedRandomForestClassifier(class_weight='balanced_subsample',criterion='entropy',
                                         sampling_strategy='all',replacement=True,
                                         n_estimators=100)
dtc = DecisionTreeClassifier(criterion='entropy',class_weight='balanced')
voting = VotingClassifier(estimators=[
        ('bag', bag), ('rfc', bal_rfc), ('xgb', xgb),('dtc',dtc)], voting='soft')
voting.fit(X, Y)

VotingClassifier(estimators=[('bag',
                              BalancedBaggingClassifier(base_estimator=None,
                                                        bootstrap=True,
                                                        bootstrap_features=True,
                                                        max_features=1.0,
                                                        max_samples=1.0,
                                                        n_estimators=500,
                                                        n_jobs=None,
                                                        oob_score=True,
                                                        random_state=None,
                                                        replacement=True,
                                                        sampling_strategy='all',
                                                        verbose=0,
                                                        warm_start=False))

In [47]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_repeats=3,n_splits=5)
cross_val_score(voting,X,Y,cv=cv)

array([0.81219807, 0.81038647, 0.83635266, 0.82296073, 0.81148036,
       0.83574879, 0.82427536, 0.82125604, 0.80422961, 0.80483384,
       0.81099034, 0.8115942 , 0.82608696, 0.83685801, 0.8102719 ])

In [48]:
prediction = voting.predict(X)
acc = 100 * (f1_score(Y,prediction,average='binary'))
acc

82.31300763728937

In [49]:
#training_data = training_data.drop(columns=['Won_Championship'],axis=1)
cols = training_data.columns
test_data = pd.read_csv('test.csv')
event_id = test_data['ID']

test_data = test_data[cols]

print(test_data.shape)
#test_data = test_data.drop(columns=['ID','Team_Value','Playing_Style','Previous_SB_Wins'],axis=1)

test_data['Number_Of_Injured_Players'] = le_Number_Of_Injured_Players.fit_transform(test_data['Number_Of_Injured_Players'])

test_data['Coach_Experience_Level'] = le_Coach_Experience_Level.fit_transform(test_data['Coach_Experience_Level'])


predictions = voting.predict(test_data)
result_df = pd.DataFrame({'ID':event_id,'Won_Championship':predictions})
result_df.to_csv('Prediction.csv',index=False)

(3500, 5)


#### Online ACCURACY - 76.22, when local accuracy - 82.31 on whole data (VotingClassifier)