In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Print the installed scikit-learn version so the results below can be tied
# to a specific library release.
import sklearn
print(sklearn.__version__)


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Load the heart-failure clinical records CSV directly from its GitHub URL.
DATA_URL = 'https://raw.githubusercontent.com/gauravjain2/stage-f-07-heart-failure/master/data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# Summary statistics of the loaded data (rendered as the cell output).
df.describe()

In [None]:
# Preview the first rows of the loaded data (rendered as the cell output).
df.head()

In [None]:
# Print each column's dtype, then the number of missing values per column.
schema_reports = (
    ("\t Data Types of Columns", df.dtypes),
    ("\n\t Null Values in the Dataset", df.isna().sum()),
)
for heading, report in schema_reports:
    print(heading)
    print(report)

In [None]:
# Separate the predictor columns (X) from the outcome column (Y).
target_col = 'DEATH_EVENT'
X = df.drop(columns=target_col)
# Double brackets keep Y a one-column DataFrame rather than a Series.
Y = df[[target_col]]

In [None]:
def get_data(smote=False, pca_val=False, split=0.2, random_state=1):
    """Build scaled (optionally balanced and PCA-reduced) train/test sets.

    Reads the module-level feature frame ``X`` and the one-column target
    frame ``Y``. The steps run in this order:

    1. Hold out part of the rows as the test set.
    2. If ``smote`` is truthy, oversample the *training* rows with SMOTE.
    3. Fit a ``MinMaxScaler`` on the training rows and apply it to both sets.
    4. If ``pca_val`` is truthy, fit PCA on the training rows and project
       both sets onto the retained components.

    Parameters
    ----------
    smote : bool, default False
        Whether to balance the training set with SMOTE.
    pca_val : int, float or False, default False
        Forwarded to ``PCA(n_components=...)``; any falsy value skips PCA.
    split : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 1
        Seed used for both the train/test split and SMOTE.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, Y_train, Y_test)``, each with a fresh 0..n-1 index.
        With PCA enabled the feature columns are named ``col0``, ``col1``, ...
    """
    # Split BEFORE oversampling: SMOTE synthesises points by interpolating
    # between existing rows, so resampling the full data set first would put
    # synthetic rows derived from training rows into the test set and
    # inflate the reported scores.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=split, random_state=random_state
    )
    feature_cols = X_train.columns
    target_cols = Y_train.columns

    if smote:
        sampler = SMOTE(random_state=random_state)
        # fit_resample replaces fit_sample, which newer imbalanced-learn
        # releases no longer provide. The target is flattened to 1-D, which
        # assumes Y holds exactly one column.
        X_res, Y_res = sampler.fit_resample(X_train, Y_train.values.ravel())
        X_train = pd.DataFrame(X_res, columns=feature_cols)
        Y_train = pd.DataFrame(np.asarray(Y_res).reshape(-1, 1), columns=target_cols)

    # Min-max scaling: the scaler is fitted on the training rows only and
    # then reused unchanged on the test rows.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_cols)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_cols)

    # Optional PCA, likewise fitted on the training rows only.
    if pca_val:
        pca = PCA(n_components=pca_val)
        train_proj = pca.fit_transform(X_train)
        test_proj = pca.transform(X_test)
        pc_cols = ["col" + str(i) for i in range(train_proj.shape[1])]
        X_train = pd.DataFrame(train_proj, columns=pc_cols)
        X_test = pd.DataFrame(test_proj, columns=pc_cols)

    # The split shuffles rows; give every frame a clean positional index so
    # features and targets line up row by row.
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    Y_train = Y_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)

    return (X_train, X_test, Y_train, Y_test)

In [None]:
def score_pred(Y_test, Y_pred, model_name = "this model"):
    """Print the confusion matrix and four summary scores for a set of predictions.

    Y_test     -- true labels
    Y_pred     -- predicted labels
    model_name -- label inserted into each printed score line

    Nothing is returned; all results go to standard output.
    """
    print(confusion_matrix(Y_test, Y_pred))

    # (line prefix, scoring function) pairs, in the order they are reported:
    #   accuracy  = (tp + tn) / (p + n)
    #   precision = tp / (tp + fp)
    #   recall    = tp / (tp + fn)
    #   f1        = 2 tp / (2 tp + fp + fn)
    reported_metrics = (
        ('\nAccuracy of ', accuracy_score),
        ('Precision of ', precision_score),
        ('Recall of ', recall_score),
        ('F1 score of ', f1_score),
    )
    for prefix, metric_fn in reported_metrics:
        value = metric_fn(Y_test, Y_pred)
        print(prefix + model_name + ' is : %f' % value)

In [None]:
# Logistic Regression without SMOTE
# (the model is LogisticRegression, so it is labelled as such in the report)

X_train, X_test, Y_train, Y_test = get_data(pca_val=0.99)
lr = LogisticRegression()
# Y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect, which avoids a DataConversionWarning.
lr.fit(X_train, Y_train.values.ravel())
Y_pred = lr.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Logistic Regression without SMOTE")

In [None]:
# Logistic Regression with SMOTE
# (the model is LogisticRegression, so it is labelled as such in the report)

X_train, X_test, Y_train, Y_test = get_data(smote = True, pca_val=0.99)
lr2 = LogisticRegression()
# Y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect, which avoids a DataConversionWarning.
lr2.fit(X_train, Y_train.values.ravel())
Y_pred = lr2.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Logistic Regression with SMOTE")

In [None]:
# Random Forest Classifier without SMOTE

X_train, X_test, Y_train, Y_test = get_data(smote = False, pca_val = 0.95)
# random_state pins the forest's bootstrap / feature sampling so the scores
# printed below are the same on every Restart & Run All.
rf = RandomForestClassifier(max_depth=9, n_estimators=50, min_samples_leaf=1,
                            min_samples_split=3, random_state=1)
# Y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect, which avoids a DataConversionWarning.
rf.fit(X_train, Y_train.values.ravel())
Y_pred = rf.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Random Forest without SMOTE")

In [None]:
# Random Forest Classifier with SMOTE

X_train, X_test, Y_train, Y_test = get_data(smote = True, pca_val = False)
# random_state pins the forest's bootstrap / feature sampling so the scores
# printed below are the same on every Restart & Run All.
rf = RandomForestClassifier(random_state=1)
# Y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect, which avoids a DataConversionWarning.
rf.fit(X_train, Y_train.values.ravel())
Y_pred = rf.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Random Forest with SMOTE")

In [None]:
# Show the hyperparameters of `rf`, i.e. the most recently assigned random
# forest (the name is rebound by each random-forest cell above).
rf.get_params()