# Logistic Regression
# Resample — SMOTE
### Import Libraries

Install the dependencies from a terminal (not from a notebook cell) before running:

    conda install -c conda-forge imbalanced-learn

    conda update scikit-learn

In [1]:
# Record the installed scikit-learn release alongside the results.
from sklearn import __version__

version_message = 'The scikit-learn version is {}.'.format(__version__)
print(version_message)

The scikit-learn version is 0.19.1.


In [2]:
import pandas as pd, numpy as np #basic packages

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE

from sklearn.metrics import accuracy_score, precision_score, log_loss, confusion_matrix

from imblearn.over_sampling import SMOTE

# Import Data and Define Global Variables

In [3]:
# Load the preprocessed table; the first CSV column is used as the row index.
# NOTE(review): the preview below lists an "Unnamed: 0" column — confirm it is
# not a stray row id that ends up among the features.
df = pd.read_csv("df1_diag_as_single_column.csv", index_col=0)

# Positional split: every column except the last is a feature, and the last
# column is used as the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,diag_1,diag_2,diag_3,AfricanAmerican,Asian,Caucasian,Hispanic,Other,Female,Male,...,Yes,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted_tran
0,250.0,434.952844,409.650469,0,0,1,0,0,1,0,...,0,-1.137746,-0.10523,-0.786854,-1.850615,-0.292413,-0.214198,-0.505835,-3.3432,0
1,276.0,250.0,255.0,0,0,1,0,0,1,0,...,1,-0.468172,0.808668,-0.786854,0.243035,-0.292413,-0.214198,-0.505835,0.810349,0
2,648.0,250.0,27.0,1,0,0,0,0,1,0,...,1,-0.802959,-1.628393,2.147906,-0.372744,1.274004,-0.214198,0.280721,-0.747232,0
3,8.0,250.0,403.0,0,0,1,0,0,0,1,...,1,-0.802959,0.047087,-0.199902,-0.003277,-0.292413,-0.214198,-0.505835,-0.228039,0
4,197.0,157.0,250.0,0,0,1,0,0,0,1,...,1,-1.137746,0.402491,-0.786854,-0.988524,-0.292413,-0.214198,-0.505835,-1.266426,0


# Imbalanced Data

In [5]:
# Class distribution of the target; y is already a pandas Series, so it can be
# counted directly.
y.value_counts()

0    88324
1    11169
Name: readmitted_tran, dtype: int64

# Resample with SMOTE

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Check the container type of the held-out labels.
type(y_test)

pandas.core.series.Series

In [8]:
# Oversample the training split only; X_test / y_test are not passed to SMOTE.
sm = SMOTE(random_state=42)

# Remember the column labels: older imbalanced-learn releases return bare
# NumPy arrays, which would otherwise lose them.
feature_names = X_train.columns

# `fit_sample` was renamed to `fit_resample` and later removed. Prefer the new
# name and fall back to the old one so the cell runs on either release.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample
X_train, y_train = resample(X_train, y_train)

# Re-wrap as pandas objects so the training data has the same types (and the
# same column labels) as X_test / y_test.
X_train = pd.DataFrame(X_train, columns=feature_names)
y_train = pd.Series(y_train)

In [9]:
# Sanity check: the resampled training objects should have the same types as
# the corresponding test objects.
same_target_type = type(y_test) is type(y_train)
same_feature_type = type(X_test) is type(X_train)
same_target_type and same_feature_type

True

In [10]:
# Class counts of the training labels after SMOTE resampling.
y_train.value_counts()

1    61722
0    61722
dtype: int64

# Wrapper Method Feature Selection

In [11]:
def top_40(model, X, y, n_features=40):
    """Select features by recursive feature elimination (RFE).

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``RFE`` to rank the features.
    X : array-like
        Feature matrix passed to ``RFE.fit``.
    y : array-like
        Target values passed to ``RFE.fit``.
    n_features : int, default 40
        Number of features to retain; the default keeps the original
        "top 40" behaviour.

    Returns
    -------
    list of int
        Positional column indices of the retained features, in ascending
        order, suitable for ``DataFrame.iloc[:, keep]``.
    """
    # Pass the count by keyword: newer scikit-learn releases no longer accept
    # it positionally, while older ones accept both spellings.
    selector = RFE(model, n_features_to_select=n_features)
    selector.fit(X, y)

    # support_ is a boolean mask over the columns; keep the True positions.
    return [idx for idx, kept in enumerate(selector.support_) if kept]

# Logistic Regression Algorithm

In [12]:
def lr(X_train, y_train, X_test):
    """Fit a default ``LogisticRegression`` and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the model.
    X_test
        Features to generate predictions for.

    Returns
    -------
    The output of the fitted model's ``predict`` on ``X_test``.
    """
    # Named `classifier` so the local does not shadow this function's own name.
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)

In [13]:
# Column indices retained by RFE when a logistic regression ranks the features.
ranking_model = LogisticRegression()
lr_keep = top_40(ranking_model, X_train, y_train)

In [14]:
# Restrict both splits to the selected columns (by position), then fit on the
# training split and predict the held-out split.
X_train_selected = X_train.iloc[:, lr_keep]
X_test_selected = X_test.iloc[:, lr_keep]
lr_pred = lr(X_train_selected, y_train, X_test_selected)

# Results

In [15]:
# Score the held-out predictions.
accuracy = accuracy_score(y_test, lr_pred)
precision = precision_score(y_test, lr_pred)
print("Accuracy:", accuracy, " Precision:", precision)

Accuracy: 0.880461002412  Precision: 0.276388888889


In [16]:
# Flatten the first two rows of the confusion matrix, row by row, so each
# entry lines up with the name at the same position in `label`.
cf = confusion_matrix(y_test, lr_pred)
first_row, second_row = cf[0], cf[1]
cfa = list(first_row)
cfa.extend(second_row)
label = ["tn", "fp", "fn", "tp"]

In [17]:
# Print each confusion-matrix cell next to its name.
for name, count in zip(label, cfa):
    print(name, count)

tn 26081
fp 521
fn 3047
tp 199
