# import libaries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours, NearMiss, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, InstanceHardnessThreshold
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline 
from collections import Counter

# import the dataset

In [3]:
df=pd.read_csv('../heart_disease/heart_disease.csv')
#df.isnull().sum() #check for null values 
#df.columns

In [None]:
model = DecisionTreeClassifier()

# Random under-sampling

In [20]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#over = RandomOverSampler(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.5)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.77      0.55      0.85      0.65      0.43     68840
        1.0       0.20      0.55      0.77      0.29      0.65      0.41      7264

avg / total       0.87      0.75      0.57      0.79      0.65      0.43     76104



# Nearmiss (not a fit, terrible f1 Score)

In [4]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#test with over and without over
#over = RandomOverSampler(sampling_strategy=0.3)
under = NearMiss(sampling_strategy=0.5, version=1, n_jobs= -1)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.41      0.77      0.58      0.56      0.31     68840
        1.0       0.12      0.77      0.41      0.21      0.56      0.33      7264

avg / total       0.87      0.45      0.74      0.54      0.56      0.31     76104



# EditedNearestNeighbours

In [13]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#test with over and without over
#over = RandomOverSampler(sampling_strategy=0.3)

#kind_sel -all yelds slightly better results than -mode
under = EditedNearestNeighbours(sampling_strategy='majority', kind_sel='all', n_jobs= -1)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.86      0.43      0.90      0.61      0.39     68840
        1.0       0.25      0.43      0.86      0.32      0.61      0.36      7264

avg / total       0.87      0.82      0.47      0.84      0.61      0.39     76104



# RepeatedEditedNearestNeighbours

In [4]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#test with over and without over
#over = RandomOverSampler(sampling_strategy=0.3)
under = RepeatedEditedNearestNeighbours(sampling_strategy='majority',max_iter=15, kind_sel='all', n_jobs= -1)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.84      0.51      0.89      0.65      0.44     68840
        1.0       0.25      0.51      0.84      0.33      0.65      0.41      7264

avg / total       0.88      0.81      0.54      0.83      0.65      0.43     76104



# AllKNN

In [6]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#test with over-sampling and without over
over = RandomOverSampler(sampling_strategy=0.3)
under = EditedNearestNeighbours(sampling_strategy='majority', kind_sel='all', n_jobs=-1)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.86      0.43      0.90      0.61      0.39     68840
        1.0       0.25      0.43      0.86      0.32      0.61      0.36      7264

avg / total       0.87      0.82      0.47      0.84      0.61      0.39     76104



# Tomek's Links

In [7]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#test with over-sampling and without over
#over = RandomOverSampler(sampling_strategy=0.3)
under = TomekLinks(sampling_strategy='majority', n_jobs=-1)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.91      0.29      0.91      0.51      0.28     68840
        1.0       0.25      0.29      0.91      0.27      0.51      0.25      7264

avg / total       0.86      0.85      0.35      0.85      0.51      0.28     76104



# InstanceHardnessThreshold (bad f1 Score)

In [17]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1).astype(int)
y = df['HeartDiseaseorAttack'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#test with over-sampling and without over
#over = RandomOverSampler(sampling_strategy=0.3)
under = InstanceHardnessThreshold(sampling_strategy=0.5, n_jobs=-1)

pipeline = Pipeline([('under', under), ('model', model)])

pipeline.fit(X_train, y_train) 

y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.58      0.88      0.73      0.71      0.49     68840
          1       0.18      0.88      0.58      0.30      0.71      0.52      7264

avg / total       0.90      0.61      0.85      0.69      0.71      0.50     76104



# without pipeline (for testing)

In [18]:
from collections import Counter

print(f'Original dataset shape {Counter(y)}')

X_train, X_test, y_train , y_test = train_test_split(X,y,test_size=0.3, random_state=0) 

print(f'Y train {Counter(y_train)}')
print(f'Y test {Counter(y_test)}')

resample = InstanceHardnessThreshold(sampling_strategy=0.5, n_jobs=-1)

X_balanced, y_balanced = resample.fit_resample(X_train, y_train)

print(f'Y balanced {Counter(y_balanced)}')

Original dataset shape Counter({0: 229787, 1: 23893})
Y train Counter({0: 160947, 1: 16629})
Y test Counter({0: 68840, 1: 7264})
Y balanced Counter({0: 54219, 1: 16629})


In [19]:
model.fit(X_balanced, y_balanced)
y_pred = model.predict(X_test)

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.57      0.89      0.72      0.71      0.49     68840
          1       0.18      0.89      0.57      0.30      0.71      0.53      7264

avg / total       0.90      0.60      0.86      0.68      0.71      0.50     76104

