# Imbalance Data

In [14]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import random
import seaborn as SB

In [2]:
n = 10000
skip = sorted(random.sample(range(1,319795),319795-n))
DF = pd.read_csv('../../datasets/heart_2020_cleaned.csv',skiprows=skip).reset_index(drop=True)

In [3]:
# Drop rows with out of rage BMI
BMI_Out_Of_Range = DF[DF['BMI']>50]
DF = DF.drop(BMI_Out_Of_Range.index,errors='ignore')

# Drop rows with out of rage SleepTime
DF = DF.drop(DF[DF['SleepTime']>16]['SleepTime'].index,axis=0)

In [27]:
X_cols = DF.columns.difference(['HeartDisease'])
Y_col = 'HeartDisease'
X_train,X_test,Y_train,Y_test = train_test_split(DF[X_cols],DF[Y_col],random_state=3020)

In [28]:
CatCols = [
    'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','AgeCategory','Race','Diabetic',
    'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
]
NumCols = [
    'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'
]

# <font color="Green">One Hot Encode Categorical Label :</font>

In [29]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def OHC_transformer_single_column(DF,columns):
    merged = pd.DataFrame()
    for column_name in columns:
        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(DF[column_name])
        labels =pd.Series(label_encoder.classes_)

        clmt = ColumnTransformer([(column_name,OneHotEncoder(),[0])], remainder='passthrough')
        clmt_fited_data = clmt.fit_transform(DF[[column_name]]) 
        clmt_fited_data = clmt_fited_data if type(clmt_fited_data)== np.ndarray else clmt_fited_data.toarray()
        NDF = pd.DataFrame(clmt_fited_data,columns=[f'{column_name}_{x}' for x in labels]).set_index(DF.index)
        merged = pd.concat([merged,NDF],axis=1)
    DF = DF.drop(columns=columns)
    DF = DF.merge(merged, left_index=True,right_index=True)

    return DF

In [30]:
Y_label_encoder = LabelEncoder()
Y_train = Y_label_encoder.fit_transform(Y_train)
X_train = OHC_transformer_single_column(X_train,CatCols) 

 # <font color="Green">Normalize  :</font>

In [31]:
ss = StandardScaler()
X_train[NumCols] = ss.fit_transform(X_train[NumCols])

# <font color="Green">Imbalance Data  :</font>

In [25]:
from imblearn.over_sampling import RandomOverSampler
randOvrSmpl = RandomOverSampler(random_state=3020)
X_train, Y_train = randOvrSmpl.fit_resample(X_train,Y_train)

 # <font color="Green">Modeling  :</font>

In [35]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(
    random_state=3020,
    n_estimators=5000,
    verbose=True
)

random_forest_classifier.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:   51.6s finished


RandomForestClassifier(n_estimators=5000, random_state=3020, verbose=True)

In [33]:
Y_label_encoder = LabelEncoder()
Y_test = Y_label_encoder.fit_transform(Y_test)
X_test  = OHC_transformer_single_column(X_test,CatCols) 
# Y_test = Y_test[X_test.index]


ss = StandardScaler()
X_test[NumCols] = ss.fit_transform(X_test[NumCols])



# randOvrSmpl = RandomOverSampler(random_state=3020)
# X_test, Y_test = randOvrSmpl.fit_resample(X_test,Y_test)


In [36]:
Y_predicted = random_forest_classifier.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           0       0.92      0.99      0.95      2268
           1       0.37      0.05      0.09       208

    accuracy                           0.91      2476
   macro avg       0.64      0.52      0.52      2476
weighted avg       0.87      0.91      0.88      2476

[[2249   19]
 [ 197   11]]


[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    5.1s finished
