# Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTEN, SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline 
from collections import Counter

# import the dataset

In [8]:
# Read the heart-disease table into a DataFrame (the path is relative to the
# notebook's working directory).
# Optional sanity check for missing values: df.isnull().sum()
df = pd.read_csv(
    '../heart_disease/heart_disease.csv'
)

In [None]:
# Base classifier reused by the resampling experiments in this notebook.
# Features are randomly permuted at each split, so ties between equally good
# splits can resolve differently from run to run; pinning random_state makes
# the fitted tree (and the reported metrics) reproducible.
model = DecisionTreeClassifier(random_state=0)

# RandomOverSampler

In [4]:
# Separate the predictors from the target column.
X = df.drop(labels=['HeartDiseaseorAttack'], axis=1)
y = df['HeartDiseaseorAttack']

# Hold out 30% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Randomly duplicate minority-class rows until minority/majority == 0.3.
# random_state is pinned so the same rows are duplicated on every run and the
# scores below can be reproduced.
over = RandomOverSampler(sampling_strategy=0.3, random_state=0)

# The imblearn Pipeline applies the sampler while fitting only, so predictions
# on X_test are made against the original (un-resampled) class distribution.
pipeline = Pipeline([('over', over), ('model', model)])
pipeline.fit(X_train, y_train)

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.91      0.27      0.92      0.50      0.26     68840
        1.0       0.24      0.27      0.91      0.26      0.50      0.23      7264

avg / total       0.86      0.85      0.33      0.85      0.50      0.26     76104



# SMOTEN

In [7]:
# Separate the predictors from the target column.
X = df.drop(labels=['HeartDiseaseorAttack'], axis=1)
y = df['HeartDiseaseorAttack']

# Hold out 30% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# SMOTEN synthesises new minority samples, treating every feature as nominal,
# until minority/majority == 0.3. random_state is pinned so the synthetic rows
# (and therefore the scores) are reproducible.
# NOTE(review): n_jobs is deprecated in recent imbalanced-learn releases --
# confirm against the installed version and drop it if it raises/warns.
overSMOTEN = SMOTEN(sampling_strategy=0.3, random_state=0, n_jobs=-1)

# The imblearn Pipeline applies the sampler while fitting only, so predictions
# on X_test are made against the original (un-resampled) class distribution.
pipeline = Pipeline([('over', overSMOTEN), ('model', model)])
pipeline.fit(X_train, y_train)

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.89      0.31      0.91      0.53      0.29     68840
        1.0       0.23      0.31      0.89      0.26      0.53      0.26      7264

avg / total       0.86      0.83      0.37      0.85      0.53      0.29     76104



# SMOTENC

In [9]:
# Separate the predictors from the target column.
X = df.drop(labels=['HeartDiseaseorAttack'], axis=1)
y = df['HeartDiseaseorAttack']

# Hold out 30% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Positional indices (within X, i.e. after the target is dropped) of the
# columns SMOTENC should treat as categorical: 0..20 except index 2.
# NOTE(review): index 2 is the only column handled as continuous -- confirm
# that this position really is the intended numeric feature.
categorical_idx = [idx for idx in range(21) if idx != 2]

# SMOTENC synthesises minority samples for mixed categorical/continuous data
# until minority/majority == 0.3. random_state is pinned so the synthetic rows
# (and therefore the scores) are reproducible.
# NOTE(review): n_jobs is deprecated in recent imbalanced-learn releases --
# confirm against the installed version and drop it if it raises/warns.
overSMOTENC = SMOTENC(
    sampling_strategy=0.3,
    random_state=0,
    n_jobs=-1,
    categorical_features=categorical_idx,
)

# The imblearn Pipeline applies the sampler while fitting only, so predictions
# on X_test are made against the original (un-resampled) class distribution.
pipeline = Pipeline([('over', overSMOTENC), ('model', model)])
pipeline.fit(X_train, y_train)

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.93      0.88      0.36      0.90      0.56      0.33     68840
        1.0       0.23      0.36      0.88      0.28      0.56      0.30      7264

avg / total       0.86      0.83      0.40      0.84      0.56      0.32     76104



# Manual resampling (SMOTEN + RandomUnderSampler) for testing

In [6]:
# Separate the predictors from the target column.
X = df.drop(labels=['HeartDiseaseorAttack'], axis=1)
y = df['HeartDiseaseorAttack']

print(f'Original dataset shape {Counter(y)}')

# Hold out 30% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Both samplers are seeded so the class counts printed below -- and any model
# trained on the balanced data -- are reproducible across runs.
# NOTE(review): n_jobs is deprecated in recent imbalanced-learn releases --
# confirm against the installed version and drop it if it raises/warns.
overSMOTEN = SMOTEN(sampling_strategy=0.3, random_state=0, n_jobs=-1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)

print(f'Y train {Counter(y_train)}')
print(f'Y test {Counter(y_test)}')

# Step 1: over-sample the minority class up to minority/majority == 0.3.
# Step 2: under-sample the majority class down to minority/majority == 0.5.
# Only the training split is resampled; the test split is left untouched.
X_over, y_over = overSMOTEN.fit_resample(X_train, y_train)
X_balanced, y_balanced = under.fit_resample(X_over, y_over)

print(f'Y balanced {Counter(y_balanced)}')

Original dataset shape Counter({0.0: 229787, 1.0: 23893})
Y train Counter({0.0: 160947, 1.0: 16629})
Y test Counter({0.0: 68840, 1.0: 7264})
Y balanced Counter({0.0: 96568, 1.0: 48284})


In [20]:
# Train the shared tree on the manually balanced training data and predict
# labels for the hold-out split in one chained call (fit returns the estimator).
y_pred = model.fit(X_balanced, y_balanced).predict(X_test)

# Per-class precision/recall/specificity/etc. for the imbalanced hold-out set.
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.91      0.28      0.92      0.50      0.27     68840
        1.0       0.24      0.28      0.91      0.26      0.50      0.24      7264

avg / total       0.86      0.85      0.34      0.85      0.50      0.27     76104

