In [78]:
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification
#from sklearn.pipeline import Pipeline # Not using this as this pipeline does not support classes without Transform() functions. 
from imblearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [5]:
X,y = make_classification(n_samples=1000, n_features=15, n_informative=10, weights=[0.99])

In [7]:
X.shape, y.shape

((1000, 15), (1000,))

In [22]:
dict_data={}
for i in range(0, X.shape[1]):
    dict_data['feat'+'_'+str(i+1)] = X[:,i]
dict_data['Y'] = y[:]    

In [23]:
df = pd.DataFrame(dict_data)

In [24]:
df

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,feat_11,feat_12,feat_13,feat_14,feat_15,Y
0,1.709973,-0.195591,1.714398,0.750192,-0.851940,0.926182,-2.684918,-1.273003,-1.051296,3.684143,-0.243603,-1.111625,-0.047019,2.901631,1.876122,0
1,-2.759841,-0.583854,0.387975,2.855471,-0.039044,-1.563238,-1.666938,0.431128,5.720992,0.673643,-0.252324,0.787213,-2.482214,4.755951,4.024606,0
2,-1.538421,-0.425489,-0.646267,0.521383,-2.390072,-0.444277,-1.109288,1.429944,2.110861,2.320231,-0.595890,2.382898,-1.467904,3.538439,0.931834,0
3,-1.103696,0.133364,-1.131278,-0.635004,1.281757,-0.621743,2.060690,-0.049886,-0.269309,-2.702886,0.619486,-0.584078,-1.273077,-3.366141,-2.501734,0
4,-2.639442,2.350113,2.528675,2.363665,-0.083600,-0.070294,-3.193496,2.169469,6.243715,-1.117417,-0.226523,2.464127,-1.232254,5.784181,2.752593,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-1.217588,2.198200,2.983337,0.724352,1.936552,0.194821,-0.628608,1.848306,4.515912,-2.395325,-0.947246,1.994453,0.854945,2.129205,1.613043,0
996,-0.626084,0.365962,-1.836680,0.233187,1.288341,0.988322,-1.017203,-0.854905,-2.094329,1.491003,-0.291425,-0.082091,0.241055,2.165136,1.065779,0
997,-2.747581,3.459579,-2.682334,-1.569196,-0.132736,0.881176,3.104522,-2.630561,-10.654411,4.416106,-0.754269,-3.870711,1.155179,4.949322,1.099779,0
998,-2.086308,0.361986,-3.201621,0.193524,-0.713614,-0.738688,1.589677,-0.807938,-3.202965,-2.592910,-0.022557,-2.985357,0.991312,-0.384208,-0.294564,0


## Baselines with Decision Tree and Random Forest

In [70]:
dt_baseline = DecisionTreeClassifier()
cv_baseline = cross_val_score(dt_baseline, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring = 'f1_weighted')
cvf_baseline = cross_validate(dt_baseline, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_weighted')
print("Baseline classification : " , cv_baseline.mean())
print("Baseline classification on train and test : ", cvf_baseline['train_score'].mean(), cvf_baseline['test_score'].mean())

Baseline classification :  0.9761947787485512
Baseline classification on train and test :  1.0 0.9745665728314725


In [71]:
dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2, min_samples_split=4)
cv_ = cross_val_score(dt, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_weighted')
cvf_ = cross_validate(dt, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_weighted')
print("First iter tuning :", cv_.mean())
print("First iter results on train and test : ", cvf_['train_score'].mean(), cvf_['test_score'].mean())

First iter tuning : 0.9782618831834243
First iter results on train and test :  0.9948986065141593 0.9778571716434609


In [72]:
rf_baseline = RandomForestClassifier()
cvrf_baseline = cross_val_score(rf_baseline, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_weighted')
cvrff_baseline = cross_validate(rf_baseline, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_weighted')
print("Second iter tuning :", cvrf_baseline.mean())
print("Second iter results on train and test : ", cvrff_baseline['train_score'].mean(), cvrff_baseline['test_score'].mean())

Second iter tuning : 0.9791130897391961
Second iter results on train and test :  0.9956510683872286 0.9791130897391961


In [73]:
rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=2, min_samples_split=4)
cvrf_ = cross_val_score(rf, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_micro')
cvrff_ = cross_validate(rf, df.iloc[:, 0:15], df.iloc[:,15], cv=10, scoring='f1_micro')
print("Second iter tuning :", cvrf_.mean())
print("Second iter results on train and test : ", cvrff_['train_score'].mean(), cvrff_['test_score'].mean())

Second iter tuning : 0.986038803880388
Second iter results on train and test :  0.9891112240056265 0.986038803880388


# Random Over and Under Sampling with Random Forest

In [86]:
ros = RandomOverSampler(ratio='minority')
pipe = Pipeline([('sampler', ros), ('clf', rf)])
rskf_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv_ros = cross_validate(pipe, df.iloc[:, 0:15], df.iloc[:,15], cv=rskf_cv, scoring='f1_micro')
print("Over sampling results on train and test : ", cv_ros['train_score'].mean(), cv_ros['test_score'].mean())

Over sampling results on train and test :  1.0 0.9877189385605226


In [88]:
rus = RandomUnderSampler(ratio='majority')
pipe = Pipeline([('sampler', rus), ('clf', rf)])
rskf_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv_rus = cross_validate(pipe, df.iloc[:, 0:15], df.iloc[:,15], cv=rskf_cv, scoring='f1_micro')
print("Over sampling results on train and test : ", cv_rus['train_score'].mean(), cv_rus['test_score'].mean())

Over sampling results on train and test :  0.6929947618591022 0.6870719071907191


In [89]:
rus = RandomUnderSampler(ratio='majority')
ros = RandomOverSampler(ratio='minority')
pipe = Pipeline([('sampler1', rus), ('sampler2', ros), ('clf', rf)])
rskf_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv_rus_ros = cross_validate(pipe, df.iloc[:, 0:15], df.iloc[:,15], cv=rskf_cv, scoring='f1_micro')
print("Over sampling results on train and test : ", cv_rus_ros['train_score'].mean(), cv_rus_ros['test_score'].mean())

Over sampling results on train and test :  0.7077314751600288 0.7094229422942292


# BEST RESULT : OVER SAMPLER : 0.9877