# Logistic Regression
# Resample — SMOTE
### Import Libraries

conda install -c glemaitre imbalanced-learn 


conda update scikit-learn

In [1]:
from sklearn import __version__

# Record which scikit-learn release produced the outputs shown in this notebook.
version_line = 'The scikit-learn version is {}.'.format(__version__)
print(version_line)

The scikit-learn version is 0.19.1.


In [2]:
import pandas as pd, numpy as np #basic packages

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE

from sklearn.metrics import accuracy_score, precision_score, log_loss, confusion_matrix

from imblearn.over_sampling import SMOTE

# Import Data and Define Global Variables

In [3]:
# Load the preprocessed table; the first CSV column is used as the row index.
df = pd.read_csv("pp_df1.csv", index_col=0)

# Every column except the last is treated as a feature; the last one is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Imbalanced Data

In [4]:
# Show how many rows fall into each target class (y is already a Series).
y.value_counts()

0    90409
1    11357
Name: readmitted_tran, dtype: int64

# Resample with SMOTE

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Inspect the container type of the held-out target before resampling the training split.
type(y_test)

pandas.core.series.Series

In [8]:
# Oversample only the training split; the test split is left untouched.
sm = SMOTE(random_state=42)

# `fit_sample` was renamed to `fit_resample` in later imbalanced-learn releases
# and eventually removed; use whichever method this installation provides.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample
X_resampled, y_resampled = resample(X_train, y_train)

# The resampler may hand back bare arrays, so wrap the results again. Re-attach
# the original feature names so X_train and X_test keep identical column labels.
X_train = pd.DataFrame(X_resampled, columns=X_train.columns)
y_train = pd.Series(y_resampled)

In [10]:
# Train and test containers should be the same types after resampling.
type(y_test) == type(y_train) and type(X_test) == type(X_train)

True

In [11]:
# Check the class balance of the resampled training target.
y_train.value_counts()

1    63304
0    63304
dtype: int64

# Wrapper Method Feature Selection

In [13]:
def top_40(model, X, y, n_features=40):
    """Select features with recursive feature elimination (RFE).

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``RFE`` to rank the features.
    X : array-like
        Feature matrix the selector is fitted on.
    y : array-like
        Target values aligned with the rows of ``X``.
    n_features : int, default 40
        Number of features to keep.

    Returns
    -------
    list of int
        Positional column indices of the kept features, in ascending order.
    """
    # Pass the count by keyword: recent scikit-learn releases no longer accept
    # it as a positional argument.
    selector = RFE(model, n_features_to_select=n_features)
    selector.fit(X, y)

    # support_ is a boolean mask over the columns of X; keep the True positions.
    return [idx for idx, kept in enumerate(selector.support_) if kept]

# Logistic Regression Algorithm

In [12]:
def lr(X_train, y_train, X_test):
    """Fit a default ``LogisticRegression`` and predict labels for ``X_test``.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels aligned with ``X_train``.
    X_test : array-like
        Feature matrix to predict on.

    Returns
    -------
    The value returned by the fitted model's ``predict(X_test)``.
    """
    # fit() returns the fitted estimator, so training and prediction can be chained.
    return LogisticRegression().fit(X_train, y_train).predict(X_test)

In [14]:
# Rank features with a logistic-regression estimator on the resampled training data.
lr_keep = top_40(model=LogisticRegression(), X=X_train, y=y_train)

In [15]:
# Train and predict using only the columns RFE kept, selected by position.
lr_pred = lr(
    X_train=X_train.iloc[:, lr_keep],
    y_train=y_train,
    X_test=X_test.iloc[:, lr_keep],
)

# Results

In [16]:
# Score the predictions against the untouched test labels.
accuracy = accuracy_score(y_test, lr_pred)
precision = precision_score(y_test, lr_pred)
print("Accuracy:", accuracy, " Precision:", precision)

Accuracy: 0.876482148706  Precision: 0.282663316583


In [18]:
cf = confusion_matrix(y_test, lr_pred)
# Flatten the first two rows in row-major order so each count lines up with its label.
cfa = [count for row in (cf[0], cf[1]) for count in row]
label = ["tn", "fp", "fn", "tp"]

In [19]:
# Print each confusion-matrix cell next to its label.
for name, count in zip(label, cfa):
    print(name, count)

tn 26534
fp 571
fn 3200
tp 225
