In [74]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the cleaned heart-disease CSV and discard its 'Unnamed: 0' column
# in a single chained expression, then preview ten randomly chosen rows.
df = pd.read_csv('heart_clean.csv').drop(columns = 'Unnamed: 0')
df.sample(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
35677,No,19.58,No,Yes,No,0.0,5.0,No,Female,50-54,White,No,Yes,Excellent,7.0,No,No,Yes
93678,No,32.72,No,No,No,0.0,0.0,No,Male,40-44,White,No,Yes,Very good,6.0,No,No,No
123757,No,25.06,No,No,No,0.0,0.0,No,Male,45-49,Other,No,No,Very good,6.0,No,No,No
40795,No,33.45,No,No,No,0.0,10.0,No,Female,35-39,White,No,Yes,Fair,5.0,No,No,No
298045,No,22.5,Yes,No,No,0.0,0.0,No,Male,45-49,Hispanic,No,No,Excellent,7.0,No,No,No
133660,No,25.96,No,No,No,5.0,15.0,Yes,Female,80 or older,White,No,Yes,Good,10.0,Yes,No,Yes
204084,No,22.96,No,No,No,3.0,5.0,No,Female,18-24,White,No,Yes,Very good,6.0,No,No,No
43514,Yes,34.95,Yes,No,Yes,0.0,20.0,Yes,Female,55-59,Other,Yes,No,Good,4.0,No,No,No
259698,No,23.06,Yes,No,No,3.0,0.0,No,Male,75-79,White,No,Yes,Very good,7.0,No,No,Yes
171316,No,25.1,No,No,No,0.0,0.0,Yes,Male,80 or older,White,No,Yes,Good,7.0,Yes,No,No


In [33]:
# Work on a copy so `df` keeps its original 'No'/'Yes' labels; in the copy the
# SkinCancer target is recoded numerically (No -> 0, Yes -> 1).
temp = df.copy()
temp['SkinCancer'] = temp['SkinCancer'].replace({'No': 0, 'Yes': 1})

In [34]:
# Partition the rows by target label:
#   pos_df -> rows whose SkinCancer is 'Yes'
#   neg_df -> rows whose SkinCancer is 'No'
pos_df = df.loc[df['SkinCancer'] == 'Yes']
neg_df = df.loc[df['SkinCancer'] == 'No']

In [35]:
# Number of rows per SkinCancer label, to inspect how balanced the target is.
skin_cancer_counts = df['SkinCancer'].value_counts()
print(skin_cancer_counts)

No     272425
Yes     29292
Name: SkinCancer, dtype: int64


In [69]:
# Classify every column by how many distinct values it holds: columns with at
# most 13 distinct values are treated as categorical, all others as continuous.
n_distinct = {column: len(df[column].unique()) for column in df.columns}

categorical_val = [column for column, count in n_distinct.items() if count <= 13]
continous_val = [column for column, count in n_distinct.items() if count > 13]

print(categorical_val)
print(continous_val)

['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']
['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']


In [56]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for `clf`.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : features / labels of the training split.
    X_test, y_test : features / labels of the held-out split.
    train : if truthy, report on the training split; any falsy value
        (``False``, ``0``, ``None``) reports on the test split.

    Returns
    -------
    None. The report is written to stdout.
    """
    # Pick the split once so the reporting code is not duplicated per branch.
    # A plain `else` is used instead of `elif train == False`, which silently
    # printed nothing for falsy flags such as None.
    if train:
        label, X, y = "Train", X_train, y_train
    else:
        label, X, y = "Test", X_test, y_test

    pred = clf.predict(X)
    clf_report = pd.DataFrame(classification_report(y, pred, output_dict=True))
    print(f"{label} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y, pred)}\n")


In [76]:
# Under-sample the 'No' rows down to the number of 'Yes' rows so both labels
# are equally represented in `test_under`.
# Counts are looked up by label rather than unpacked positionally, so the
# result does not depend on value_counts() ordering or on the number of labels.
class_counts = df['SkinCancer'].value_counts()
count_neg, count_pos = class_counts['No'], class_counts['Yes']

# A fixed seed makes the drawn subset identical on every Restart & Run All.
neg_under = neg_df.sample(count_pos, random_state = 42)
test_under = pd.concat([neg_under, pos_df], axis = 0)

print(test_under['SkinCancer'].value_counts())

No     29292
Yes    29292
Name: SkinCancer, dtype: int64


In [77]:
# Over-sample the 'Yes' rows (drawing with replacement) up to the number of
# 'No' rows so both labels are equally represented in `test_over`.
# A fixed seed makes the drawn rows identical on every Restart & Run All.
# NOTE(review): the duplicated 'Yes' rows are created before the train/test
# split further down, so copies of one row can land in both the train and the
# test partition -- scores on the over-sampled data are likely optimistic.
pos_over = pos_df.sample(count_neg, replace = True, random_state = 42)
test_over = pd.concat([pos_over, neg_df], axis = 0)

print(test_over['SkinCancer'].value_counts())

Yes    272425
No     272425
Name: SkinCancer, dtype: int64


In [81]:
# One-hot encode every categorical feature of the under-sampled frame; the
# target itself is excluded from the encoding.
cat_copy = list(categorical_val)
cat_copy.remove('SkinCancer')
dummy = pd.get_dummies(data = test_under, columns = cat_copy)

# Swap the 'No'/'Yes' target for the 0/1 version held in `temp`. The join
# aligns on the row index, which the sampled rows still carry from `df`.
ddf = dummy.drop(columns = ['SkinCancer'])
target = temp['SkinCancer']
df_under = ddf.join(target)
df_under.sample(10)

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_No,HeartDisease_Yes,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,...,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer
46295,38.62,10.0,20.0,9.0,1,0,0,1,1,0,...,0,0,0,1,0,1,0,1,0,0
269549,28.25,25.0,0.0,7.0,1,0,0,1,1,0,...,0,0,1,0,0,1,0,1,0,1
87986,33.28,7.0,2.0,8.0,1,0,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0
192866,23.67,0.0,0.0,8.0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,1,0,1
197161,28.37,0.0,0.0,7.0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,1,0,1
109920,36.49,0.0,0.0,8.0,1,0,0,1,1,0,...,0,0,0,0,1,1,0,1,0,0
175588,23.87,0.0,0.0,7.0,0,1,1,0,1,0,...,0,0,0,0,1,1,0,1,0,1
70724,22.96,4.0,0.0,8.0,1,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
94345,30.18,7.0,7.0,6.0,0,1,0,1,1,0,...,0,0,1,0,0,1,0,1,0,1
136858,29.53,0.0,10.0,7.0,1,0,1,0,1,0,...,1,0,0,0,0,1,0,1,0,0


In [83]:
# One-hot encode every categorical feature of the over-sampled frame; the
# target itself is excluded from the encoding.
cat_copy = list(categorical_val)
cat_copy.remove('SkinCancer')
dummy = pd.get_dummies(data = test_over, columns = cat_copy)

# Swap the 'No'/'Yes' target for the 0/1 version held in `temp`. The join
# aligns on the row index; rows drawn more than once share an index label and
# each of them receives the same target value.
ddf = dummy.drop(columns = ['SkinCancer'])
target = temp['SkinCancer']
df_over = ddf.join(target)
df_over.sample(10)

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_No,HeartDisease_Yes,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,...,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer
24966,21.79,0.0,4.0,6.0,1,0,1,0,0,1,...,0,0,1,0,0,1,0,1,0,0
137978,26.22,10.0,12.0,7.0,1,0,1,0,1,0,...,1,0,0,0,0,1,0,1,0,0
202823,28.35,0.0,0.0,8.0,1,0,0,1,1,0,...,0,0,1,0,0,1,0,1,0,1
27664,22.81,0.0,5.0,6.0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,1,0,0
20809,15.11,0.0,0.0,6.0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,0,1
48028,23.44,0.0,0.0,3.0,1,0,0,1,1,0,...,0,0,0,0,1,1,0,0,1,1
2863,23.73,0.0,8.0,8.0,0,1,0,1,1,0,...,0,0,1,0,0,1,0,1,0,1
7747,34.21,21.0,14.0,7.0,1,0,0,1,1,0,...,0,0,0,1,0,1,0,1,0,1
150782,28.32,0.0,0.0,6.0,1,0,0,1,1,0,...,0,0,0,0,1,1,0,1,0,0
141170,39.46,1.0,0.0,7.0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,1,0,1


In [84]:
from sklearn.preprocessing import StandardScaler

# Standardise the four numeric columns of the under-sampled frame in place.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split below, so test rows contribute to the fitted statistics -- consider
# fitting on the training partition only.
col_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
sc = StandardScaler()
df_under[col_to_scale] = sc.fit_transform(X = df_under[col_to_scale])
df_under.sample(10)

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_No,HeartDisease_Yes,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,...,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer
69987,-0.976768,-0.461436,-0.469269,0.568255,1,0,1,0,1,0,...,0,0,0,0,1,0,1,1,0,0
54221,-0.122247,-0.461436,-0.469269,1.255582,1,0,1,0,1,0,...,1,0,0,0,0,1,0,1,0,1
129612,-0.345308,-0.230373,-0.469269,0.568255,1,0,1,0,1,0,...,0,0,0,0,1,1,0,0,1,1
260129,0.397683,-0.461436,-0.469269,-1.493725,1,0,1,0,1,0,...,0,0,1,0,0,1,0,1,0,0
153823,0.528895,-0.461436,-0.469269,-0.119071,1,0,0,1,1,0,...,0,0,1,0,0,0,1,1,0,0
161361,-0.592971,-0.461436,-0.469269,0.568255,1,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,1
116724,0.15658,-0.345905,-0.21377,-0.806398,1,0,0,1,1,0,...,0,0,0,0,1,1,0,1,0,0
137351,-0.604453,0.69388,-0.469269,0.568255,0,1,0,1,0,1,...,0,0,1,0,0,1,0,1,0,0
268217,0.528895,-0.461436,-0.469269,-0.806398,1,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,0
149890,0.68635,-0.230373,-0.34152,-0.119071,0,1,0,1,1,0,...,0,0,1,0,0,1,0,0,1,1


In [85]:
from sklearn.preprocessing import StandardScaler

# Standardise the four numeric columns of the over-sampled frame in place,
# using a fresh scaler (this rebinds `sc`, replacing any earlier scaler).
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split below, so test rows contribute to the fitted statistics -- consider
# fitting on the training partition only.
col_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
sc = StandardScaler()
df_over[col_to_scale] = sc.fit_transform(X = df_over[col_to_scale])
df_over.sample(10)

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_No,HeartDisease_Yes,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,...,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer
133067,-0.406807,-0.459196,-0.469748,-0.118272,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,0,1
33063,-1.358183,-0.459196,-0.469748,1.945234,1,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,0
258189,0.567454,-0.459196,-0.469748,-0.806107,1,0,0,1,1,0,...,0,1,0,0,0,1,0,1,0,1
246105,0.590339,-0.459196,-0.469748,0.569563,1,0,0,1,1,0,...,0,0,0,0,1,1,0,1,0,0
37861,-0.086412,0.584978,2.081077,-0.806107,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,0,1
26088,-0.249879,-0.459196,-0.469748,0.569563,1,0,1,0,1,0,...,0,0,0,0,1,0,1,1,0,0
199214,1.535177,-0.227157,-0.214665,-0.118272,0,1,1,0,1,0,...,0,0,1,0,0,0,1,0,1,1
24775,-0.714124,-0.459196,-0.469748,0.569563,1,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,0
38498,0.155518,-0.459196,-0.469748,0.569563,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,0,0
171901,0.338601,1.165074,-0.469748,0.569563,1,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,0


In [94]:
from sklearn.model_selection import train_test_split

# Separate features from the SkinCancer target for both balanced frames
# (U = under-sampled, O = over-sampled).
XU, yU = df_under.drop(columns = 'SkinCancer'), df_under['SkinCancer']
XO, yO = df_over.drop(columns = 'SkinCancer'), df_over['SkinCancer']

# 70/30 train/test split of each frame, seeded so the partition is repeatable.
XU_train, XU_test, yU_train, yU_test = train_test_split(
    XU, yU, test_size = 0.3, random_state = 42
)
XO_train, XO_test, yO_train, yO_test = train_test_split(
    XO, yO, test_size = 0.3, random_state = 42
)

## Modeling

#### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 10-nearest-neighbours classifier, fitted and scored on the under-sampled split.
# The notebook never defines X_train / y_train / X_test / y_test -- only the
# XU_* (under-sampled) and XO_* (over-sampled) partitions exist -- so the
# unprefixed names raised NameError on a fresh kernel.
# NOTE(review): the under-sampled split is assumed to be the intended one;
# swap in XO_train / yO_train / XO_test / yO_test to evaluate the over-sampled data.
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(XU_train, yU_train)
y_pred1 = knn.predict(XU_test)
print(accuracy_score(yU_test, y_pred1))