# Imbalance Data

In [1]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import random
import seaborn as SB
import matplotlib.pyplot as PLT

In [2]:
n = 10000
skip = sorted(random.sample(range(1,319795),319795-n))
DF = pd.read_csv('../../datasets/heart_2020_cleaned.csv',skiprows=skip).reset_index(drop=True)

In [3]:
# Drop rows with out of rage BMI
# BMI_Out_Of_Range = DF[DF['BMI']>50]
# DF = DF.drop(BMI_Out_Of_Range.index,errors='ignore')

# Drop rows with out of rage SleepTime
# DF = DF.drop(DF[DF['SleepTime']>16]['SleepTime'].index,axis=0)

In [4]:
CatCols = [
    'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','AgeCategory','Race','Diabetic',
    'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
]
NumCols = [
    'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'
]

In [6]:
X_cols = [*NumCols,*CatCols]
Y_col = 'HeartDisease'
X_train,X_test,Y_train,Y_test = train_test_split(DF[X_cols],DF[Y_col],random_state=3020)

# <font color="Green">One Hot Encode Categorical Label :</font>

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def OHC_transformer_single_column(DF,columns):
    merged = pd.DataFrame()
    for column_name in columns:
        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(DF[column_name])
        labels =pd.Series(label_encoder.classes_)

        clmt = ColumnTransformer([(column_name,OneHotEncoder(),[0])], remainder='passthrough')
        clmt_fited_data = clmt.fit_transform(DF[[column_name]]) 
        clmt_fited_data = clmt_fited_data if type(clmt_fited_data)== np.ndarray else clmt_fited_data.toarray()
        NDF = pd.DataFrame(clmt_fited_data,columns=[f'{column_name}_{x}' for x in labels]).set_index(DF.index)
        merged = pd.concat([merged,NDF],axis=1)
    DF = DF.drop(columns=columns)
    DF = DF.merge(merged, left_index=True,right_index=True)

    return DF

In [9]:
Y_label_encoder = LabelEncoder()
Y_train = Y_label_encoder.fit_transform(Y_train)
X_train = OHC_transformer_single_column(X_train,CatCols) 

 # <font color="Green">Normalize  :</font>

In [10]:
if (len(NumCols)):
    ss = StandardScaler()
    X_train[NumCols] = ss.fit_transform(X_train[NumCols])

# <font color="Green">Imbalance Data  :</font>

In [11]:
from imblearn.combine import SMOTEENN  
randOvrSmpl = SMOTEENN(random_state=3020)
X_train, Y_train = randOvrSmpl.fit_resample(X_train,Y_train)

In [12]:
# from imblearn.combine import SMOTETomek
# randOvrSmpl = SMOTETomek(random_state=3020)
# X_train, Y_train = randOvrSmpl.fit_resample(X_train,Y_train)

In [13]:
X_train.shape

(11652, 50)

 # <font color="Green">Testing Data  :</font>

In [14]:
Y_label_encoder = LabelEncoder()
Y_test = Y_label_encoder.fit_transform(Y_test)
X_test  = OHC_transformer_single_column(X_test,CatCols) 


if (len(NumCols)):
    ss = StandardScaler()
    X_test[NumCols] = ss.fit_transform(X_test[NumCols])

 # <font color="Green">RandomForest Modeling  :</font>

In [None]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(
    random_state=3020,
    n_estimators=5000,
    verbose=True
)

random_forest_classifier.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

Y_predicted = random_forest_classifier.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


 # <font color="Green">CatBoostClassifier Modeling  :</font>

In [None]:
from catboost import Pool, CatBoostClassifier

In [None]:
catboostCLF = CatBoostClassifier(
    iterations=1500,learning_rate=.01,depth=1,eval_metric='Accuracy',random_state=3020,
    verbose=True
)

catboostCLF.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

Y_predicted = catboostCLF.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

 # <font color="Green">DecisionTreeClassifier Modeling  :</font>

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
D3CLF = DecisionTreeClassifier(
    random_state=3020,min_samples_split=8,min_samples_leaf=2,class_weight={0:100,1:1},
)

D3CLF.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

Y_predicted = D3CLF.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

 # <font color="Green">KNeighborsClassifier Modeling  :</font>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knnCLF = KNeighborsClassifier(n_neighbors=2 ,weights='distance',p=1)

knnCLF.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

Y_predicted = knnCLF.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

 # <font color="Green">xgboost Modeling  :</font>

In [None]:
import xgboost as xgb

In [None]:
data_matrix = xgb.DMatrix(data=X_train,label=Y_train)

xgbCLF = xgb.XGBRFClassifier(colsample_bytree=.9,max_depth=5000,n_estimators=3000,)

xgbCLF.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

Y_predicted = xgbCLF.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

In [None]:
fig, ax = PLT.subplots(figsize=(12, 20))
xgb.plot_importance(xgbCLF,ax=ax)