In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [17]:
import sklearn
print(sklearn.__version__)


1.0.2


In [3]:
from imblearn.over_sampling import SMOTE

In [44]:
#Github URL
url = 'https://raw.githubusercontent.com/Gloriaihuoma/stage-f-07-heart-failure/master/data/heart_failure_clinical_records_dataset.csv'

In [45]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [46]:
#Importing our dataset
df = pd.read_csv(url)

In [47]:
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [20]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [21]:
print("\t Data Types of Columns")
print(df.dtypes)
print("\n\t Null Values in the Dataset")
print(df.isna().sum())

	 Data Types of Columns
age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object

	 Null Values in the Dataset
age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64


In [22]:
X = df.drop('DEATH_EVENT', axis = 1)
Y = df[['DEATH_EVENT']]

In [23]:
def get_data(smote=False, pca_val=False, split=0.2):
    
    # SMOTE
    if(smote == True):
        smote = SMOTE(random_state=1)
        X_bal, Y_bal = smote.fit_sample(X, Y)
        X_bal = pd.DataFrame(X_bal, columns = X.columns)
        
    else:
        X_bal = X
        Y_bal = Y
        
    # Train-Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(X_bal, Y_bal, test_size=split, random_state=1)
    
    # MinMax Scaling
    stdscl = MinMaxScaler()
    cols = X.columns
    X_train = pd.DataFrame(stdscl.fit_transform(X_train), columns = cols)
    X_test = pd.DataFrame(stdscl.transform(X_test), columns = cols)
    
    # Applying PCA
    if(pca_val):
        pca = PCA(pca_val)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        columns = []
        for i in range(X_train.shape[1]):
            columns.append("col" + str(i))
        X_train = pd.DataFrame(X_train, columns = columns)
        X_test = pd.DataFrame(X_test, columns = columns)
        
    # Reset Index due to Shuffling of Data
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    Y_train = Y_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)
    
    return (X_train, X_test, Y_train, Y_test)

In [24]:
def score_pred(Y_test, Y_pred, model_name = "this model"):

    # confusion matrix
    matrix = confusion_matrix(Y_test, Y_pred)
    print(matrix)
    
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('\nAccuracy of ' + model_name + ' is : %f' % accuracy)

    # precision tp / (tp + fp)
    precision = precision_score(Y_test, Y_pred)
    print('Precision of ' + model_name + ' is : %f' % precision)

    # recall: tp / (tp + fn)
    recall = recall_score(Y_test, Y_pred)
    print('Recall of ' + model_name + ' is : %f' % recall)

    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(Y_test, Y_pred)
    print('F1 score of ' + model_name + ' is : %f' % f1)

In [25]:
# Linear Regression without smote

X_train, X_test, Y_train, Y_test = get_data(pca_val=0.99)
lr = LogisticRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Linear Regression without SMOTE")

[[42  4]
 [ 4 10]]

Accuracy of Linear Regression without SMOTE is : 0.866667
Precision of Linear Regression without SMOTE is : 0.714286
Recall of Linear Regression without SMOTE is : 0.714286
F1 score of Linear Regression without SMOTE is : 0.714286


  y = column_or_1d(y, warn=True)


In [42]:
from imblearn.over_sampling import SMOTE
import pandas as pd
# Other necessary imports (e.g., for loading data, splitting datasets)

def get_data(smote=False, pca_val=0.99):
    # Load your dataset
    # X, Y = load_your_data()

    # Split your dataset into training and testing sets
    # This part assumes you already have X (features) and Y (target) loaded
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    if smote:
        smote = SMOTE(random_state=1)
        # Use fit_resample instead of fit_sample to balance the training dataset
        X_train_bal, Y_train_bal = smote.fit_resample(X_train, Y_train)
        # Optionally, convert the balanced arrays back into a DataFrame if needed
        X_train_bal = pd.DataFrame(X_train_bal, columns=X_train.columns)
        # Use the balanced dataset for training
        return X_train_bal, X_test, Y_train_bal, Y_test
    else:
        # Return the original dataset if SMOTE is not applied
        return X_train, X_test, Y_train, Y_test


In [43]:
# Linear Regression with SMOTE

X_train, X_test, Y_train, Y_test = get_data(smote = True, pca_val=0.99)
lr2 = LogisticRegression()
lr2.fit(X_train, Y_train)
Y_pred = lr2.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Linear Regression with SMOTE")


[[28  7]
 [ 9 16]]

Accuracy of Linear Regression with SMOTE is : 0.733333
Precision of Linear Regression with SMOTE is : 0.695652
Recall of Linear Regression with SMOTE is : 0.640000
F1 score of Linear Regression with SMOTE is : 0.666667


  y = column_or_1d(y, warn=True)


In [36]:
# Random Forest Classifier without SMOTE

X_train, X_test, Y_train, Y_test = get_data(smote = False, pca_val = 0.95)
rf = RandomForestClassifier(max_depth=9, n_estimators=50, min_samples_leaf=1, min_samples_split=3)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Random Forest without SMOTE")

[[44  2]
 [ 5  9]]

Accuracy of Random Forest without SMOTE is : 0.883333
Precision of Random Forest without SMOTE is : 0.818182
Recall of Random Forest without SMOTE is : 0.642857
F1 score of Random Forest without SMOTE is : 0.720000


  rf.fit(X_train, Y_train)


In [None]:
# Random Forest Classifier with SMOTE

X_train, X_test, Y_train, Y_test = get_data(smote = True, pca_val = False)
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
score_pred(Y_test, Y_pred, model_name = "Random Forest with SMOTE")

In [None]:
rf.get_params()