# import libaries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline 
from imblearn.combine import SMOTETomek, SMOTEENN 
from imblearn.under_sampling import TomekLinks, RepeatedEditedNearestNeighbours, EditedNearestNeighbours, RandomUnderSampler
from imblearn.over_sampling import SMOTENC, SMOTEN, RandomOverSampler
from sklearn.preprocessing import StandardScaler
from collections import Counter

# import the dataset

In [2]:
# Read the heart-disease table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    filepath_or_buffer='../heart_disease/heart_disease.csv',
)
# Optional sanity checks, left disabled:
#   df.isnull().sum()   -> per-column count of null values
#   df.columns          -> column names

FileNotFoundError: [Errno 2] No such file or directory: '../heart_disease/heart_disease.csv'

In [None]:
# Base classifier referenced by the resampling pipelines in this notebook.
# The seed is fixed because scikit-learn's decision tree permutes features at
# every split; without it, two runs on identical data can grow different trees
# and the sampler comparisons would not be repeatable.
model = DecisionTreeClassifier(random_state=0)

# Combined: RandomOverSampler-RandomUnderSampler

In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=['HeartDiseaseorAttack'])
y = df['HeartDiseaseorAttack']

# Stratified 70/30 hold-out split so both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Both samplers draw rows at random, so they are seeded to make the resampled
# training set (and therefore the report below) reproducible.
# A float sampling_strategy is the target minority/majority ratio after the step.
over = RandomOverSampler(sampling_strategy=0.3, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)

# imblearn's Pipeline applies the samplers while fitting only; predict() sends
# the test rows through the scaler and the classifier without resampling them.
pipeline = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('over', over),
    ('under', under),
    ('model', model),
])

pipeline.fit(X_train, y_train)

# Evaluate on the untouched hold-out set.
y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# Combined: SmoteNC-RandomUnderSampler

In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=['HeartDiseaseorAttack'])
y = df['HeartDiseaseorAttack']

# Stratified 70/30 hold-out split, consistent with the first experiment, so
# both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# NOTE(review): index 2 is the only position left out of the categorical list,
# i.e. it is the only column treated as continuous — confirm this matches the
# column order of X.
CATEGORICAL_IDX = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

# `n_jobs` is no longer passed to SMOTENC: it was deprecated in
# imbalanced-learn 0.10 and is rejected by later releases. Both samplers are
# seeded so the synthetic/removed rows are the same on every run.
over = SMOTENC(
    categorical_features=CATEGORICAL_IDX,
    sampling_strategy=0.3,
    random_state=0,
)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)

# NOTE(review): the scaler runs before SMOTENC, so the columns flagged as
# categorical are standardized as well — confirm this is intended.
pipeline = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('over', over),
    ('under', under),
    ('model', model),
])

pipeline.fit(X_train, y_train)

# Evaluate on the untouched hold-out set.
y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# Combined: RandomOverSampler-RepeatedENN

In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=['HeartDiseaseorAttack'])
y = df['HeartDiseaseorAttack']

# Stratified 70/30 hold-out split, consistent with the first experiment, so
# both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# The over-sampler duplicates rows at random, so it is seeded for repeatable
# results. RepeatedEditedNearestNeighbours takes no seed.
over = RandomOverSampler(sampling_strategy=0.3, random_state=0)
under = RepeatedEditedNearestNeighbours(
    sampling_strategy='majority',  # only clean the majority class
    max_iter=15,
    kind_sel='all',
    n_jobs=-1,
)

pipeline = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('over', over),
    ('under', under),
    ('model', model),
])

pipeline.fit(X_train, y_train)

# Evaluate on the untouched hold-out set.
y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# Combined: SmoteNCRepeatedEditedNearestNeighbours

In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=['HeartDiseaseorAttack'])
y = df['HeartDiseaseorAttack']

# Stratified 70/30 hold-out split, consistent with the first experiment, so
# both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# NOTE(review): index 2 is the only position left out of the categorical list,
# i.e. it is the only column treated as continuous — confirm this matches the
# column order of X.
CATEGORICAL_IDX = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

# `n_jobs` is no longer passed to SMOTENC: it was deprecated in
# imbalanced-learn 0.10 and is rejected by later releases. The over-sampler is
# seeded so the synthetic rows are the same on every run;
# RepeatedEditedNearestNeighbours takes no seed.
over = SMOTENC(
    categorical_features=CATEGORICAL_IDX,
    sampling_strategy=0.3,
    random_state=0,
)
under = RepeatedEditedNearestNeighbours(
    sampling_strategy='majority',  # only clean the majority class
    max_iter=15,
    kind_sel='all',
    n_jobs=-1,
)

# NOTE(review): the scaler runs before SMOTENC, so the columns flagged as
# categorical are standardized as well — confirm this is intended.
pipeline = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('over', over),
    ('under', under),
    ('model', model),
])

pipeline.fit(X_train, y_train)

# Evaluate on the untouched hold-out set.
y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# Combined: SMOTEN-EditedNearestNeighbours

In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=['HeartDiseaseorAttack'])
y = df['HeartDiseaseorAttack']

# Stratified 70/30 hold-out split, consistent with the first experiment, so
# both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# `n_jobs` is no longer passed to SMOTEN: it was deprecated in
# imbalanced-learn 0.10 and is rejected by later releases. The over-sampler is
# seeded so the synthetic rows are the same on every run;
# EditedNearestNeighbours takes no seed.
over = SMOTEN(sampling_strategy=0.3, random_state=0)
under = EditedNearestNeighbours(
    sampling_strategy='majority',  # only clean the majority class
    kind_sel='all',
    n_jobs=-1,
)

# No scaler in this pipeline: SMOTEN treats every feature as categorical.
pipeline = Pipeline([
    ('over', over),
    ('under', under),
    ('model', model),
])

pipeline.fit(X_train, y_train)

# Evaluate on the untouched hold-out set.
y_hat = pipeline.predict(X_test)
print(classification_report_imbalanced(y_test, y_hat))

# without pipeline (for testing)

In [None]:
# Manual version of the SMOTEN -> EditedNearestNeighbours experiment, printing
# the class counts at each stage. Uses the X and y built by the previous cells.
print(f'Original dataset shape {Counter(y)}')

# Stratified 70/30 hold-out split, consistent with the first experiment, so
# both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

print(f'Y train {Counter(y_train)}')
print(f'Y test {Counter(y_test)}')

# `n_jobs` is no longer passed to SMOTEN: it was deprecated in
# imbalanced-learn 0.10 and is rejected by later releases. The over-sampler is
# seeded so the printed counts are the same on every run.
over = SMOTEN(sampling_strategy=0.3, random_state=0)
under = EditedNearestNeighbours(
    sampling_strategy='majority',  # only clean the majority class
    kind_sel='all',
    n_jobs=-1,
)

# Resample the training part only; the test part stays untouched.
X_balanced, y_balanced = over.fit_resample(X_train, y_train)
X_balanced, y_balanced = under.fit_resample(X_balanced, y_balanced)

print(f'Y balanced {Counter(y_balanced)}')

In [None]:
# Train a fresh tree on the manually balanced training data.
# The seed is fixed because scikit-learn's decision tree permutes features at
# every split; without it, repeated runs can yield different reports.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_balanced, y_balanced)

# Score on the original (unresampled) hold-out rows.
y_pred = model.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))