In [143]:
import pandas as pd
import matplotlib.pyplot as plt

In [144]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [145]:
string_features = []
for col in X_train.columns:
    if X_train[col].dtype == 'O':
        string_features.append(col)

In [146]:
string_features

['race',
 'gender',
 'age',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult']

In [147]:
lbl_encode = []
onehot_encode = []
for features in string_features:
    if X_train[features].nunique() > 15:
        lbl_encode.append(features)
    else:
        onehot_encode.append(features)

In [148]:
print(lbl_encode)
print(onehot_encode)

['diag_1', 'diag_2', 'diag_3']
['race', 'gender', 'age', 'max_glu_serum', 'A1Cresult']


In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV

import pickle

In [150]:
lbl_transformer = LabelEncoder()
onehot_transformer = OneHotEncoder(drop='first')

In [151]:
X_train

Unnamed: 0,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult
0,Caucasian,Male,[60-70),1,74,0,9,2,0,0,786,414,401,7,,>8
1,Caucasian,Female,[80-90),4,29,0,11,0,0,0,722,401,530,9,,
2,Caucasian,Male,[70-80),3,3,1,8,5,2,0,996,E878,V43,5,,
3,Caucasian,Male,[80-90),1,28,0,6,2,0,2,494,428,427,9,,
4,AfricanAmerican,Male,[50-60),4,48,1,25,0,0,0,824,401,V58,9,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22781,Caucasian,Male,[60-70),3,24,1,25,0,0,0,715,424,V45,7,,
22782,Caucasian,Female,[60-70),1,1,2,5,0,0,0,426,426,426,8,,
22783,AfricanAmerican,Female,[40-50),1,27,0,8,0,1,0,786,250,402,3,,
22784,Caucasian,Male,[90-100),3,33,0,8,0,0,0,250.8,599,428,5,,


In [152]:
# Label Encoding
X_train[lbl_encode] = X_train[lbl_encode].apply(lbl_transformer.fit_transform)
X_test[lbl_encode] = X_test[lbl_encode].apply(lbl_transformer.fit_transform)

In [153]:
# One hot encoding
# X_train[onehot_encode] = X_train[onehot_encode].apply(onehot_transformer.fit_transform)
# X_test[onehot_encode] = X_test[onehot_encode].apply(onehot_transformer.fit_transform)

onehot_transformer.fit(X_train[onehot_encode])
data = pd.DataFrame(onehot_transformer.transform(X_train[onehot_encode]).toarray())

In [154]:
data_train = pd.get_dummies(X_train[onehot_encode],drop_first=True)
data_test = pd.get_dummies(X_train[onehot_encode],drop_first=True)

In [155]:
X_train.drop(onehot_encode,inplace=True,axis=1)
X_test.drop(onehot_encode,inplace=True,axis=1)

X_train = X_train.join(data_train)
X_test = X_test.join(data_test)

In [157]:
RF = RandomForestClassifier(random_state=0)
LR = LogisticRegression(random_state=0)
DC = DecisionTreeClassifier(random_state=0)

In [158]:
RF_param = {}
RF_param['n_estimators'] = [10, 50, 100, 250]
RF_param['max_depth'] = [5, 10, 20]
RF_param['class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]

LR_param = {}
LR_param['C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
LR_param['penalty'] = ['l1', 'l2']
LR_param['class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]

DC_param = {}
DC_param['max_depth'] = [5,10,25,None]
DC_param['min_samples_split'] = [2,5,10]
DC_param['class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]

In [161]:
lR_grid = GridSearchCV(LogisticRegression(),LR_param,scoring='accuracy',cv=10, n_jobs=-1).fit(X_train, y_train)
DC_grid = GridSearchCV(DecisionTreeClassifier(),DC_param,scoring='accuracy',cv=10, n_jobs=-1).fit(X_train, y_train)
RF_grid = GridSearchCV(RandomForestClassifier(),RF_param,scoring='roc_auc',cv=10, n_jobs=-1).fit(X_train, y_train)

        nan        nan        nan 0.61375332        nan        nan
        nan        nan        nan        nan        nan 0.61296305
        nan        nan        nan        nan        nan        nan
        nan 0.61344615        nan        nan        nan        nan
        nan        nan        nan 0.6140163         nan        nan
        nan        nan        nan        nan]
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 0.5955414  0.59426857 0.5970771  0.5942681  0.5929956  0.59650643
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan    

In [162]:
print(lR_grid.best_estimator_)
print(DC_grid.best_estimator_)
print(RF_grid.best_estimator_)

LogisticRegression(C=0.01)
DecisionTreeClassifier(max_depth=10)
RandomForestClassifier(max_depth=20, n_estimators=250)


In [164]:
print('LR :',lR_grid.score(X_test,y_test))
print('DC :',DC_grid.score(X_test,y_test))
print('RF :',RF_grid.score(X_test,y_test))

LR : 0.5855713533438652
DC : 0.5222046691241004
RF : 0.599381517239042
