In [32]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [33]:
df = pd.read_csv("modified_data.csv")

In [34]:
df.head()

Unnamed: 0,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,max_glu_serum_Norm,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Injury,diag_1_Musculoskeletal,diag_1_Neoplasms,diag_1_Other,diag_1_Respiratory
0,0,15,1,7,0,59,18,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,25,1,7,0,11,13,2,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,35,1,7,0,44,16,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,45,1,7,0,51,8,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,55,1,2,0,31,16,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [35]:
data = df.copy()
y = data["readmitted"].copy()
X = data.drop("readmitted", axis = 1)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

# Uczenie przy defaultowych parametrach

In [37]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
proba = rfc.predict_proba(X_test)[:, 1]
pred = rfc.predict(X_test)

In [38]:
roc = "{:.4f}".format(roc_auc_score(y_test, proba))
f1 = "{:.4f}".format(f1_score(y_test, pred))
print(f"spółczynnik roc_auc wynosi: {roc}")
print(f"spółczynnik f1 wynosi: {f1}")

spółczynnik roc_auc wynosi: 0.6756
spółczynnik f1 wynosi: 0.5681


# Uczenie przy wybranych przeze mnie parametrach

In [39]:
rfc = RandomForestClassifier(max_depth = 12, 
                             min_samples_split = 15,
                             min_samples_leaf = 8,
                             n_estimators = 150,
                            criterion = "gini")
rfc.fit(X_train, y_train)
proba = rfc.predict_proba(X_test)[:, 1]
pred = rfc.predict(X_test)

In [40]:
roc = "{:.4f}".format(roc_auc_score(y_test, proba))
f1 = "{:.4f}".format(f1_score(y_test, pred))
print(f"spółczynnik roc_auc wynosi: {roc}")
print(f"spółczynnik f1 wynosi: {f1}")

spółczynnik roc_auc wynosi: 0.6897
spółczynnik f1 wynosi: 0.5554


# Uczenie przy parametrach wybranych metodą random search

In [41]:
params = {"max_depth" : [i for i in range(4, 15)],
          "max_features" : ["auto", "sqrt"],
          'criterion':['gini','entropy'],
          "min_samples_split" : [i for i in range(2, 20, 3)],
          "min_samples_leaf" : [i for i in range(1, 20, 3)],
          "n_estimators" : [i for i in range(150, 500, 50)]}

In [42]:
rfc = RandomForestClassifier()
clf = RandomizedSearchCV(rfc, params, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf.fit(X_train, y_train)
clf.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 11.7min finished


{'n_estimators': 400,
 'min_samples_split': 8,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 14,
 'criterion': 'gini'}

In [43]:
proba = clf.predict_proba(X_test)[:, 1]
pred = clf.predict(X_test)

In [44]:
roc = "{:.4f}".format(roc_auc_score(y_test, proba))
f1 = "{:.4f}".format(f1_score(y_test, pred))
print(f"spółczynnik roc_auc wynosi: {roc}")
print(f"spółczynnik f1 wynosi: {f1}")

spółczynnik roc_auc wynosi: 0.6919
spółczynnik f1 wynosi: 0.5604


# Uczenie przy parametrach wybranych metodą Bayesa

In [45]:
rfc = RandomForestClassifier()
opt = BayesSearchCV(
        estimator=rfc,
        search_spaces=params,
        scoring='roc_auc',
        n_jobs=4,
        n_iter=10,
        verbose=0,
        random_state=1,
        cv=3
    )
opt.fit(X_train, y_train)
opt.best_params_

OrderedDict([('criterion', 'entropy'),
             ('max_depth', 11),
             ('max_features', 'sqrt'),
             ('min_samples_leaf', 13),
             ('min_samples_split', 14),
             ('n_estimators', 250)])

In [46]:
proba = opt.predict_proba(X_test)[:, 1]
pred = opt.predict(X_test)

In [47]:
roc = "{:.4f}".format(roc_auc_score(y_test, proba))
f1 = "{:.4f}".format(f1_score(y_test, pred))
print(f"spółczynnik roc_auc wynosi: {roc}")
print(f"spółczynnik f1 wynosi: {f1}")

spółczynnik roc_auc wynosi: 0.6887
spółczynnik f1 wynosi: 0.5509


# Uczenie przy parametrach wybranych metodą grid search

In [48]:
params_for_grid = {"max_depth" : [i for i in range(12, 14)],
                   "bootstrap" : [True],
                   'criterion':['gini','entropy'],
                   "min_samples_split" : [i for i in range(11, 12, 13)],
                   "min_samples_leaf" : [5, 6, 7],
                   "n_estimators" : [i for i in range(300, 350, 400)]}

In [49]:
rfc = RandomForestClassifier()
gsr = GridSearchCV(rfc, params_for_grid, scoring = 'roc_auc', cv = 3, n_jobs = -1, verbose = 2)
gsr.fit(X_train, y_train)
gsr.best_params_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  36 | elapsed:  1.9min remaining:   14.0s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  1.9min finished


{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 13,
 'min_samples_leaf': 6,
 'min_samples_split': 11,
 'n_estimators': 300}

In [50]:
proba = gsr.predict_proba(X_test)[:, 1]
pred = gsr.predict(X_test)

In [51]:
roc = "{:.4f}".format(roc_auc_score(y_test, proba))
f1 = "{:.4f}".format(f1_score(y_test, pred))
print(f"spółczynnik roc_auc wynosi: {roc}")
print(f"spółczynnik f1 wynosi: {f1}")

spółczynnik roc_auc wynosi: 0.6919
spółczynnik f1 wynosi: 0.5587
