# Logistic Regression
# Resample — SMOTE
### Import Libraries

conda install -c glemaitre imbalanced-learn 


conda update scikit-learn

In [58]:
# Print the installed scikit-learn version so readers can see which release
# produced the outputs recorded in this notebook.
from sklearn import __version__

version_banner = 'The scikit-learn version is {}.'
print(version_banner.format(__version__))

The scikit-learn version is 0.19.1.


In [59]:
import pandas as pd, numpy as np #basic packages

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE

from sklearn.metrics import accuracy_score, precision_score, log_loss, confusion_matrix

from imblearn.over_sampling import SMOTE

# Import Data and Define Global Variables

In [60]:
# Load the dataset; the first CSV column is used as the row index.
DATA_PATH = "data_folder/df2_diag_drop.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0,AfricanAmerican,Asian,Caucasian,Hispanic,Other,Female,Male,Unknown/Invalid,[0-10),[10-20),...,Yes,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted_tran
0,0,0,1,0,0,1,0,0,1,0,...,0,1,41,0,1,0,0,0,1,0
1,0,0,1,0,0,1,0,0,0,1,...,1,3,59,0,18,0,0,0,9,0
2,1,0,0,0,0,1,0,0,0,0,...,1,2,11,5,13,2,0,1,6,0
3,0,0,1,0,0,0,1,0,0,0,...,1,2,44,1,16,0,0,0,7,0
4,0,0,1,0,0,0,1,0,0,0,...,1,1,51,0,8,0,0,0,5,0


In [61]:
# The last column (readmitted_tran) is the target; every column before it is a feature.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]
# Total number of columns in the frame (features + target).
df.shape[1]

237

# Imbalanced Data

In [62]:
# Class distribution of the target; `y` is already a pandas Series.
y.value_counts()

0    88324
1    11169
Name: readmitted_tran, dtype: int64

# Resample with SMOTE

In [63]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified. Given the class imbalance shown
# above, consider stratify=y — confirm first, since it changes every downstream number.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [64]:
# Inspect the container type of the held-out target; the resampled training
# data is converted back to pandas objects further down so both splits match.
type(y_test)

pandas.core.series.Series

In [65]:
# Oversample the minority class with SMOTE. Only the training split is passed
# in, so the test split keeps its original class distribution.
sm = SMOTE(random_state=42)

# `fit_sample` was renamed to `fit_resample` in later imbalanced-learn releases
# and eventually removed; use whichever method this installation provides.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample

# Capture the feature names before X_train is rebound: older imbalanced-learn
# versions return bare NumPy arrays, which would otherwise leave X_train with
# integer column labels while X_test keeps the named ones.
feature_names = X_train.columns
X_train, y_train = resample(X_train, y_train)

X_train = pd.DataFrame(X_train, columns=feature_names)
y_train = pd.Series(y_train)

In [66]:
# Sanity check: after resampling, the train and test splits must use the same
# container types for both targets and features.
same_target_type = type(y_test) is type(y_train)
same_feature_type = type(X_test) is type(X_train)
same_target_type and same_feature_type

True

In [67]:
# Class counts of the training target after SMOTE; both classes should now be
# equally represented.
y_train.value_counts()

1    61722
0    61722
dtype: int64

# Wrapper Method Feature Selection

In [68]:
def top_40(model, X, y, n_features=40):
    """Select features by recursive feature elimination (RFE).

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``RFE`` to rank the features.
    X : array-like
        Feature matrix that the selector is fit on.
    y : array-like
        Target values aligned with the rows of ``X``.
    n_features : int, default 40
        Number of features to retain.

    Returns
    -------
    list of int
        Positional column indices of the retained features, in ascending order.
    """
    # Pass the count by keyword: recent scikit-learn releases no longer accept
    # n_features_to_select as a positional argument.
    rfe = RFE(model, n_features_to_select=n_features)
    rfe.fit(X, y)

    # support_ is a boolean mask over the columns of X (True = kept).
    return [idx for idx, kept in enumerate(rfe.support_) if kept]

# Logistic Regression Algorithm

In [None]:
def lr(X_train, y_train, X_test):
    """Fit a default ``LogisticRegression`` and predict labels for ``X_test``.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels aligned with ``X_train``.
    X_test : array-like
        Features to predict on; must have the same columns as ``X_train``.

    Returns
    -------
    Predicted class labels for each row of ``X_test``.
    """
    classifier = LogisticRegression().fit(X_train, y_train)
    return classifier.predict(X_test)

In [None]:
# Run RFE with a logistic-regression estimator on the resampled training data;
# the result is the list of positional indices of the retained feature columns.
lr_keep = top_40(LogisticRegression(), X_train, y_train)

In [None]:
# Restrict both splits to the RFE-selected columns (by position), then fit on
# the training subset and predict the test subset.
X_train_selected = X_train.iloc[:, lr_keep]
X_test_selected = X_test.iloc[:, lr_keep]
lr_pred = lr(X_train_selected, y_train, X_test_selected)

# Results

In [None]:
# Score the predictions against the untouched (non-resampled) test labels.
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred)
print("Accuracy:" , lr_accuracy," Precision:" , lr_precision)

In [None]:
# Confusion matrix of true vs. predicted labels, flattened row by row so the
# entries line up with the tn / fp / fn / tp labels below.
cf = confusion_matrix(y_test, lr_pred)
label = ["tn", "fp", "fn", "tp"]
cfa = [count for row in cf[:2] for count in row]

In [None]:
# Print each confusion-matrix cell next to its label.
for cell_name, cell_count in zip(label, cfa):
    print(cell_name, cell_count)