# Praca domowa 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('diabetic_data_2.csv')

In [3]:
data.head(10)

Unnamed: 0,gender,age,discharge_disposition_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,diag_1_Other,diag_1_Respiratory,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm
0,0,15,1,3,59,0,18,0,0,0,...,1,0,0,0,1,0,0,0,1,0
1,0,25,1,2,11,5,13,2,0,1,...,1,0,0,0,1,0,0,0,1,0
2,1,35,1,2,44,1,16,0,0,0,...,1,0,0,0,1,0,0,0,1,0
3,1,45,1,1,51,0,8,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,1,55,1,3,31,6,16,0,0,0,...,0,0,0,0,1,0,0,0,1,0
5,1,65,1,4,70,1,21,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,1,75,1,5,73,0,12,0,0,0,...,0,0,0,0,1,0,0,0,1,0
7,0,85,1,13,68,2,28,0,0,0,...,0,0,0,0,1,0,0,0,1,0
8,0,95,0,12,33,3,18,0,0,0,...,0,0,0,0,1,0,0,0,1,0
9,0,45,1,9,47,2,17,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
data.shape

(98052, 56)

W pierwszej pracy domowej pozbyłem się rekordów które były kolejnymi wizytami tych pacjentów zostawiając tylko pierwszą z wizyt. Usunąłem także osoby które zmarły. W wyniku tych działań zmniejszyłem liczbę rekordów do około 68 tys. Jednak jak zauważyłem wpłynęło to negatywnie na mój model dlatego zdecydowałem się przywrócić tych pacjentów.

In [5]:
df = data.copy()

y = df['readmitted'].copy()
df.drop('readmitted', axis=1, inplace=True)
X = df.copy()

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21, stratify = y)

## Trening z domyślnymi parametrami (default)

In [7]:
from sklearn.ensemble import ExtraTreesClassifier

In [15]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
clf.fit(X_train, y_train)

ExtraTreesClassifier()

In [16]:
proba = clf.predict_proba(X_test)[:, 1]
pred = clf.predict(X_test)

In [17]:
roc = "{:.5f}".format(roc_auc_score(y_test, proba))
print(f"Współczynnik roc_auc wynosi: {roc}")

Współczynnik roc_auc wynosi: 0.64493


In [18]:
acc = "{:.5f}".format(accuracy_score(y_test, pred))
print(f"Współczynnik accuracy wynosi: {acc}")

Współczynnik accuracy wynosi: 0.60915


## Trening z ręcznie zmienianymi parametrami

In [38]:
clf = ExtraTreesClassifier(criterion = 'gini', n_estimators = 200, max_features = 40, min_samples_leaf=100)
clf.fit(X_train, y_train)

ExtraTreesClassifier(max_features=40, min_samples_leaf=100, n_estimators=200)

In [39]:
proba = clf.predict_proba(X_test)[:, 1]
pred = clf.predict(X_test)

In [40]:
roc = "{:.5f}".format(roc_auc_score(y_test, proba))
print(f"Współczynnik roc_auc wynosi: {roc}")

Współczynnik roc_auc wynosi: 0.67791


In [41]:
acc = "{:.5f}".format(accuracy_score(y_test, pred))
print(f"Współczynnik accuracy wynosi: {acc}")

Współczynnik accuracy wynosi: 0.63000


## Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV

In [None]:
clf = ExtraTreesClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 200, 500],
    'min_samples_leaf': [5, 50, 100],
    'max_features' : [40, 'auto']
}

gs = GridSearchCV(estimator = clf, param_grid = params, n_jobs = -1, verbose = 2, cv = 3,
                  scoring='roc_auc')
gs.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
gs.best_params_

In [None]:
gs = gs.best_estimator_

In [None]:
pred = gs.predict(X_test)
proba = gs.predict_proba(X_test)[:, 1]

In [None]:
roc = "{:.5f}".format(roc_auc_score(y_test, proba))
print(f"Współczynnik roc_auc wynosi: {roc}")

In [None]:
acc = "{:.5f}".format(accuracy_score(y_test, pred))
print(f"Współczynnik accuracy wynosi: {acc}")

## Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
clf = ExtraTreesClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': np.arange(100, 501, 50, dtype=int),
    'min_samples_leaf': np.arange(5, 101, 10, dtype=int),
    'max_features' : np.arange(5, 50, 5, dtype=int)
}
rs = RandomizedSearchCV(estimator = clf, param_distributions = params, verbose=10, 
                               cv = 3, n_iter = 2, n_jobs = -1, scoring='roc_auc')
rs.fit(X_train, y_train)

In [10]:
rs.best_params_

{'n_estimators': 450,
 'min_samples_leaf': 95,
 'max_features': 45,
 'criterion': 'gini'}

In [11]:
rs = rs.best_estimator_

In [16]:
pred = rs.predict(X_test)
proba = rs.predict_proba(X_test)[:, 1]

In [17]:
roc = "{:.5f}".format(roc_auc_score(y_test, proba))
print(f"Współczynnik roc_auc wynosi: {roc}")

Współczynnik roc_auc wynosi: 0.67826


In [15]:
acc = "{:.5f}".format(accuracy_score(y_test, pred))
print(f"Współczynnik accuracy wynosi: {acc}")

Współczynnik accuracy wynosi: 0.62954


## Optymalizacja Bayesowska

In [None]:
from skopt import BayesSearchCV

In [43]:
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': np.arange(100, 501, 50, dtype=int),
    'min_samples_leaf': np.arange(5, 101, 10, dtype=int),
    'max_features' : np.arange(5, 50, 5, dtype=int)
}

In [None]:
clf = ExtraTreesClassifier()
bayes = BayesSearchCV(
        estimator=clf,
        search_spaces=params,
        scoring='roc_auc',
        n_jobs=-1,
        n_iter=20,
        verbose=2,
        random_state=21,
        cv=3
    )
bayes.fit(X_train, y_train)
bayes.best_params_

In [None]:
bayes.best_params_

In [None]:
bayes = bayes.best_estimator_

In [None]:
pred = bayes.predict(X_test)
proba = bayes.predict_proba(X_test)[:, 1]

In [None]:
roc = "{:.5f}".format(roc_auc_score(y_test, proba))
print(f"Współczynnik roc_auc wynosi: {roc}")

In [None]:
acc = "{:.5f}".format(accuracy_score(y_test, pred))
print(f"Współczynnik accuracy wynosi: {acc}")