In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import f1_score, make_scorer

In [16]:
df = pd.read_csv('./data/phpOJxGL9.csv')
print(df.shape, '\n\n')
print(df.describe(), '\n\n')

(583, 11) 


               V1          V3          V4           V5           V6  \
count  583.000000  583.000000  583.000000   583.000000   583.000000   
mean    44.746141    3.298799    1.486106   290.576329    80.713551   
std     16.189833    6.209522    2.808498   242.937989   182.620356   
min      4.000000    0.400000    0.100000    63.000000    10.000000   
25%     33.000000    0.800000    0.200000   175.500000    23.000000   
50%     45.000000    1.000000    0.300000   208.000000    35.000000   
75%     58.000000    2.600000    1.300000   298.000000    60.500000   
max     90.000000   75.000000   19.700000  2110.000000  2000.000000   

                V7          V8          V9         V10       Class  
count   583.000000  583.000000  583.000000  583.000000  583.000000  
mean    109.910806    6.483190    3.141852    0.947064    1.286449  
std     288.918529    1.085451    0.795519    0.318492    0.452490  
min      10.000000    2.700000    0.900000    0.300000    1.000000  
25

In [5]:
print(df.isna().sum())

V1       0
V2       0
V3       0
V4       0
V5       0
V6       0
V7       0
V8       0
V9       0
V10      0
Class    0
dtype: int64


In [32]:
classe = {'Female':0,'Male':1}

df['V2'] = df['V2'].map(classe)

In [33]:
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,Class
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [19]:
df['Class']

0      1
1      1
2      1
3      1
4      1
      ..
578    2
579    1
580    1
581    1
582    2
Name: Class, Length: 583, dtype: int64

In [56]:
svc = SVC()
param = {
    'kernel':['sigmoid','poly','rbf'],
    'C':uniform(loc=0, scale=10),
}
f1 = make_scorer(f1_score)
label = df['Class']
data = df.drop('Class', axis=1)
cv = StratifiedKFold(n_splits=10)

In [57]:
model_svc = RandomizedSearchCV(svc,param_distributions=param, n_iter=5, cv=cv,
                               scoring=f1,random_state=5762, n_jobs=-1)

model_svc.fit(data,label)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
                   estimator=SVC(), n_iter=5, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f1bb3543340>,
                                        'kernel': ['sigmoid', 'poly', 'rbf']},
                   random_state=5762, scoring=make_scorer(f1_score))

In [58]:
print(model_svc.best_score_)
print(model_svc.best_params_)
print(model_svc.best_estimator_)

0.832818081808181
{'C': 2.996689355015553, 'kernel': 'poly'}
SVC(C=2.996689355015553, kernel='poly')


In [60]:
rfc = RandomForestClassifier(random_state=5762)

param = {
    'n_estimators': randint(10,1000),
    'bootstrap': [True, False],
    'criterion': ['gini','entropy'],  
}

In [62]:
model_rfc = RandomizedSearchCV(rfc, param_distributions=param, n_iter=5, cv=cv,
                               scoring=f1, random_state=5762)
model_rfc.fit(data,label)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
                   estimator=RandomForestClassifier(random_state=5762),
                   n_iter=5,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f1bb2f61bb0>},
                   random_state=5762, scoring=make_scorer(f1_score))

In [63]:
print(model_rfc.best_score_)
print(model_rfc.best_params_)
print(model_rfc.best_estimator_)

0.7984322518413547
{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 488}
RandomForestClassifier(n_estimators=488, random_state=5762)
