## VotingClassifier - [ BalancedBaggingClassifier, BalancedRandomForestClassifier, XGBClassifier ]

In [87]:
import pandas as pd
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier,OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier

In [88]:
training_data = pd.read_csv('train.csv')
print(training_data.shape)
training_data.columns

(6500, 10)


Index(['Won_Championship', 'Previous_SB_Wins', 'Number_Of_Wins_This_Season',
       'Number_Of_First_Round_Draft_Picks', 'Team_Value', 'Playing_Style',
       'Average_Player_Age', 'Number_Of_Injured_Players',
       'Coach_Experience_Level', 'ID'],
      dtype='object')

### Dropping columns -- [ 'ID','Team_Value','Playing_Style','Won_Championship','Previous_SB_Wins' ]

In [89]:
y = training_data.Won_Championship
training_data = training_data.drop(columns=['Won_Championship','ID','Team_Value','Playing_Style','Previous_SB_Wins'],axis=1)

le_Number_Of_Injured_Players = LabelEncoder()
training_data['Number_Of_Injured_Players'] = le_Number_Of_Injured_Players.fit_transform(training_data['Number_Of_Injured_Players'])

le_Coach_Experience_Level = LabelEncoder()
training_data['Coach_Experience_Level'] = le_Coach_Experience_Level.fit_transform(training_data['Coach_Experience_Level'])
training_data.head()

Unnamed: 0,Number_Of_Wins_This_Season,Number_Of_First_Round_Draft_Picks,Average_Player_Age,Number_Of_Injured_Players,Coach_Experience_Level
0,13,2,27,1,2
1,14,2,26,1,2
2,13,1,27,2,2
3,12,2,27,6,2
4,15,2,26,1,2


### Performing OverSampling

In [90]:
from imblearn.over_sampling import SVMSMOTE

sm = SVMSMOTE()
X,Y = sm.fit_resample(training_data,y)
print(X.shape,'---',training_data.shape)
print(Y.shape,'---',len(y))

(6510, 5) --- (6500, 5)
(6510,) --- 6500


In [91]:
x_train,x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)

In [92]:
bags = BalancedBaggingClassifier(n_estimators=100,oob_score=True,bootstrap_features=True,replacement=True)
bags.fit(x_train,y_train)
#bags.fit(training_data,y)
prediction = bags.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

77.4145616641902

In [93]:
bal_rfc = BalancedRandomForestClassifier(class_weight='balanced_subsample',criterion='entropy')
bal_rfc.fit(x_train,y_train)
#bal_rfc.fit(training_data,y)
prediction = bal_rfc.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

77.73695811903013

In [94]:
xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=10,reg_lambda=0.1,importance_type='total_gain')
xgb.fit(x_train,y_train)
#xgb.fit(training_data,y)
prediction = xgb.predict(x_test)
acc = 100 * (f1_score(y_test,prediction,average='binary'))
acc

77.62289068231841

In [95]:
bag = BalancedBaggingClassifier(n_estimators=100,oob_score=True,bootstrap_features=True,replacement=True)
xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=10,reg_lambda=0.1,importance_type='total_gain')
bal_rfc = BalancedRandomForestClassifier(class_weight='balanced_subsample',criterion='entropy')

voting = VotingClassifier(estimators=[
        ('bag', bag), ('rfc', bal_rfc), ('xgb', xgb)], voting='hard')
voting.fit(training_data, y)

VotingClassifier(estimators=[('bag',
                              BalancedBaggingClassifier(base_estimator=None,
                                                        bootstrap=True,
                                                        bootstrap_features=True,
                                                        max_features=1.0,
                                                        max_samples=1.0,
                                                        n_estimators=100,
                                                        n_jobs=None,
                                                        oob_score=True,
                                                        random_state=None,
                                                        replacement=True,
                                                        sampling_strategy='auto',
                                                        verbose=0,
                                                        warm_start=False)

In [96]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_repeats=3,n_splits=5)
cross_val_score(voting,X,Y,cv=cv)

array([0.76728111, 0.76190476, 0.73502304, 0.77112135, 0.75345622,
       0.76574501, 0.7688172 , 0.75729647, 0.76574501, 0.75729647,
       0.75883257, 0.7703533 , 0.75038402, 0.75652842, 0.76728111])

In [98]:
prediction = voting.predict(X)
acc = 100 * (f1_score(Y,prediction,average='binary'))
acc

78.17365269461078

In [99]:
cols = training_data.columns
test_data = pd.read_csv('test.csv')
event_id = test_data['ID']

print(test_data.shape)
test_data = test_data.drop(columns=['ID','Team_Value','Playing_Style','Previous_SB_Wins'],axis=1)

test_data['Number_Of_Injured_Players'] = le_Number_Of_Injured_Players.fit_transform(test_data['Number_Of_Injured_Players'])

test_data['Coach_Experience_Level'] = le_Coach_Experience_Level.fit_transform(test_data['Coach_Experience_Level'])


predictions = voting.predict(test_data)
result_df = pd.DataFrame({'ID':event_id,'Won_Championship':predictions})
result_df.to_csv('Prediction.csv',index=False)

(3500, 9)


#### Online ACCURACY - 76.64, when local accuracy - 78.17 on whole data (VotingClassifier)