In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import random
import plotly.graph_objects as go
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, KMeansSMOTE

random.seed(1)

## Notebook Purpose
I used this notebook to test the accuracy of different machine-learning algorithms on the dataset

In [None]:
df = pd.read_csv('Health_data_cleaned.csv')
df.drop(["Unnamed: 0"], axis=1, inplace=True)
df.columns

In [19]:
X = df.drop("Angina/Heart_Disease",axis=1)
y = df["Angina/Heart_Disease"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
KNNmodel = KNeighborsClassifier()
KNNmodel.fit(X_train, y_train)

In [23]:
y_pred = KNNmodel.predict(X_test)
accuracy_score(y_pred, y_test)

0.941546967624259

In [25]:
confusion_matrix(y_pred, y_test)

array([[65930,  3732],
       [  370,   144]], dtype=int64)

In [27]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [29]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9500398996808026

In [31]:
confusion_matrix(y_pred, y_test)

array([[65491,  2697],
       [  809,  1179]], dtype=int64)

In [35]:
counts = y.value_counts()
counts

Angina/Heart_Disease
0.0    265195
1.0     15508
Name: count, dtype: int64

In [41]:
counts[1]/len(y)

0.055247004841416016

In [43]:
RandomOver = RandomOverSampler()
X_train_RO, y_train_RO = RandomOver.fit_resample(X_train, y_train)
xgb_model.fit(X_train_RO, y_train_RO)

In [45]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.8519009347925217

In [47]:
len(X_train), len(X_train_RO)

(210527, 397790)

In [49]:
confusion_matrix(y_pred, y_test)

array([[56945,  1038],
       [ 9355,  2838]], dtype=int64)

In [51]:
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
xgb_model.fit(X_train_smote, y_train_smote)

In [53]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9508948928408573

In [55]:
len(X_train), len(X_train_smote)

(210527, 397790)

In [57]:
confusion_matrix(y_pred, y_test)

array([[65493,  2639],
       [  807,  1237]], dtype=int64)

# Stroke

In [63]:
X = df.drop("Stroke",axis=1)
y = df.Stroke

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [65]:
xgb_model.fit(X_train, y_train)

In [67]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9625370497036023

In [69]:
confusion_matrix(y_pred, y_test)

array([[67462,  2476],
       [  153,    85]], dtype=int64)

In [71]:
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
xgb_model.fit(X_train_smote, y_train_smote)

In [73]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9626937984496124

In [75]:
confusion_matrix(y_pred, y_test)

array([[67474,  2477],
       [  141,    84]], dtype=int64)

In [77]:
df.columns

Index(['Sex', 'Education_Level', 'Household_Income', 'Veteran', 'Age',
       'High_Blood_Pressure', 'Blood_Pressure_Meds', 'High_Cholesterol',
       'Heart_Attack', 'Angina/Heart_Disease', 'Stroke', 'Asthma_(ever)',
       'Asthma_(currently)', 'Skin_Cancer', 'Cancer_(not skin)',
       'COPD_emphasema_chronic_bronchitis',
       'Arthritis_RA_gout_lupus_fibromyalgia', 'Depression', 'Kidney_Disease',
       'Diabetes', 'Smoker', 'Days_at_least_1_drink_recently',
       'Binge_Drinking_last_30_days', 'Chewing_tobacco', 'Any_Excercise',
       'Strength_Excercise', 'Aerobic_Excercise', 'Daily_Fruit_Juice',
       'Daily_Fruits', 'Daily_Beans', 'Daily_Green_Veg', 'Daily_Orange_Veg',
       'Daily_Other_Veg', 'BMI', 'Has_Health_Coverage',
       'Could_not_afford_to_see_doctor', 'General_Health_Self_Rating',
       'Poor_Mental_Health', 'Poor_Physical_Health',
       'Kept_from_doing_activities', 'Limited_Activities',
       'Uses_Special_Equipment', 'Difficulty_Concentrating',
       'D

# Heart Attack

In [84]:
X = df.drop("Heart_Attack",axis=1)
y = df.Heart_Attack

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [86]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9521488828089375

In [88]:
confusion_matrix(y_pred, y_test)

array([[65821,  2564],
       [  794,   997]], dtype=int64)

# Cancer (Not Skin)

In [97]:
X = df.drop("Cancer_(not skin)",axis=1)
y = df["Cancer_(not skin)"]

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [99]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.906079001367989

In [101]:
confusion_matrix(y_pred, y_test)

array([[63457,  6413],
       [  178,   128]], dtype=int64)

# Skin Cancer

In [104]:
X = df.drop("Skin_Cancer",axis=1)
y = df.Skin_Cancer

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [106]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9043547651618787

In [108]:
confusion_matrix(y_pred, y_test)

array([[63280,  6456],
       [  256,   184]], dtype=int64)

# COPD_emphasema_chronic_bronchitis

In [111]:
X = df.drop("COPD_emphasema_chronic_bronchitis",axis=1)
y = df.COPD_emphasema_chronic_bronchitis

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [113]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9315862973096215

In [115]:
confusion_matrix(y_pred, y_test)

array([[64070,  3994],
       [  807,  1305]], dtype=int64)

In [117]:
df.columns

Index(['Sex', 'Education_Level', 'Household_Income', 'Veteran', 'Age',
       'High_Blood_Pressure', 'Blood_Pressure_Meds', 'High_Cholesterol',
       'Heart_Attack', 'Angina/Heart_Disease', 'Stroke', 'Asthma_(ever)',
       'Asthma_(currently)', 'Skin_Cancer', 'Cancer_(not skin)',
       'COPD_emphasema_chronic_bronchitis',
       'Arthritis_RA_gout_lupus_fibromyalgia', 'Depression', 'Kidney_Disease',
       'Diabetes', 'Smoker', 'Days_at_least_1_drink_recently',
       'Binge_Drinking_last_30_days', 'Chewing_tobacco', 'Any_Excercise',
       'Strength_Excercise', 'Aerobic_Excercise', 'Daily_Fruit_Juice',
       'Daily_Fruits', 'Daily_Beans', 'Daily_Green_Veg', 'Daily_Orange_Veg',
       'Daily_Other_Veg', 'BMI', 'Has_Health_Coverage',
       'Could_not_afford_to_see_doctor', 'General_Health_Self_Rating',
       'Poor_Mental_Health', 'Poor_Physical_Health',
       'Kept_from_doing_activities', 'Limited_Activities',
       'Uses_Special_Equipment', 'Difficulty_Concentrating',
       'D

# Kidney_Disease

In [120]:
X = df.drop("Kidney_Disease",axis=1)
y = df.Kidney_Disease

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [122]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9675245098039216

In [126]:
confusion_matrix(y_pred, y_test)

array([[67875,  2212],
       [   67,    22]], dtype=int64)

In [128]:
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
xgb_model.fit(X_train_smote, y_train_smote)

In [130]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9678522571819426

In [132]:
confusion_matrix(y_pred, y_test)

array([[67897,  2211],
       [   45,    23]], dtype=int64)

In [159]:
from imblearn.under_sampling import RandomUnderSampler, OneSidedSelection

In [138]:
RUS = RandomUnderSampler(random_state=1)
X_train_RUS, y_train_RUS = RUS.fit_resample(X_train, y_train)
xgb_model.fit(X_train_RUS, y_train_RUS)

In [140]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.7164415184678523

In [163]:
one = OneSidedSelection(random_state=1)
X_train_one, y_train_one = one.fit_resample(X_train, y_train)
xgb_model.fit(X_train_one, y_train_one)

In [165]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.9676670086639307

In [167]:
confusion_matrix(y_pred, y_test)

array([[67878,  2205],
       [   64,    29]], dtype=int64)

# Depression

In [170]:
X = df.drop("Depression",axis=1)
y = df.Depression

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [172]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.851843935248518

In [174]:
confusion_matrix(y_pred, y_test)

array([[54411,  7922],
       [ 2475,  5368]], dtype=int64)

In [176]:
df.columns

Index(['Sex', 'Education_Level', 'Household_Income', 'Veteran', 'Age',
       'High_Blood_Pressure', 'Blood_Pressure_Meds', 'High_Cholesterol',
       'Heart_Attack', 'Angina/Heart_Disease', 'Stroke', 'Asthma_(ever)',
       'Asthma_(currently)', 'Skin_Cancer', 'Cancer_(not skin)',
       'COPD_emphasema_chronic_bronchitis',
       'Arthritis_RA_gout_lupus_fibromyalgia', 'Depression', 'Kidney_Disease',
       'Diabetes', 'Smoker', 'Days_at_least_1_drink_recently',
       'Binge_Drinking_last_30_days', 'Chewing_tobacco', 'Any_Excercise',
       'Strength_Excercise', 'Aerobic_Excercise', 'Daily_Fruit_Juice',
       'Daily_Fruits', 'Daily_Beans', 'Daily_Green_Veg', 'Daily_Orange_Veg',
       'Daily_Other_Veg', 'BMI', 'Has_Health_Coverage',
       'Could_not_afford_to_see_doctor', 'General_Health_Self_Rating',
       'Poor_Mental_Health', 'Poor_Physical_Health',
       'Kept_from_doing_activities', 'Limited_Activities',
       'Uses_Special_Equipment', 'Difficulty_Concentrating',
       'D

# Arthritis, etc.

In [179]:
X = df.drop("Arthritis_RA_gout_lupus_fibromyalgia",axis=1)
y = df.Arthritis_RA_gout_lupus_fibromyalgia

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [181]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.7679406064751482

In [183]:
confusion_matrix(y_pred, y_test)

array([[41657, 10366],
       [ 5919, 12234]], dtype=int64)

# Asthma (Ever)

In [192]:
X = df.drop(["Asthma_(currently)", "Asthma_(ever)"],axis=1)
y = df["Asthma_(ever)"]

X_train, X_test, y_train, y_test = train_test_split(X, y)
xgb_model.fit(X_train, y_train)

In [194]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.8732757637938896

In [196]:
confusion_matrix(y_pred, y_test)

array([[60426,  8123],
       [  770,   857]], dtype=int64)

# Heart Attack

In [205]:
sgd = SGDClassifier()
X = df.drop(["Heart_Attack"],axis=1)
y = df.Heart_Attack

X_train, X_test, y_train, y_test = train_test_split(X, y)
sgd.fit(X_train, y_train)

In [207]:
y_pred = sgd.predict(X_test)
accuracy_score(y_pred, y_test)

0.935405266757866

In [215]:
from sklearn.kernel_approximation import AdditiveChi2Sampler, PolynomialCountSketch

In [211]:
Add = AdditiveChi2Sampler()
X = df.drop(["Heart_Attack"],axis=1)
y = df.Heart_Attack

X_train, X_test, y_train, y_test = train_test_split(X, y)
Add.fit(X_train, y_train)

In [213]:
y_pred = sgd.predict(X_test)
accuracy_score(y_pred, y_test)

0.9361035111719106

In [266]:
poly = PolynomialCountSketch()
X = df.drop(["Heart_Attack"],axis=1)
y = df.Heart_Attack

X_train, X_test, y_train, y_test = train_test_split(X, y)
poly.fit(X_train, y_train)

In [270]:
y_pred = poly.predict(X_test)
accuracy_score(y_pred, y_test)

AttributeError: 'PolynomialCountSketch' object has no attribute 'predict'

In [221]:
from sklearn.svm import LinearSVC, SVC

In [223]:
Lsvc = LinearSVC()
Lsvc.fit(X_train, y_train)



# Ensemble Classifiers

In [251]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier

In [230]:
X = df.drop(["Heart_Attack"],axis=1)
y = df.Heart_Attack

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [232]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

In [234]:
y_pred = sgd.predict(X_test)
accuracy_score(y_pred, y_test)

0.9355905152758778

In [239]:
hist = HistGradientBoostingClassifier()
hist.fit(X_train, y_train)

In [241]:
y_pred = sgd.predict(X_test)
accuracy_score(y_pred, y_test)

0.9355905152758778

In [243]:
confusion_matrix(y_pred, y_test)

array([[64136,  2222],
       [ 2298,  1520]], dtype=int64)

In [245]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)

In [255]:
y_pred = gbc.predict(X_test)
accuracy_score(y_pred, y_test)

0.9525336297309621

In [257]:
confusion_matrix(y_pred, y_test)

array([[65736,  2633],
       [  698,  1109]], dtype=int64)

In [253]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [259]:
y_pred = rf.predict(X_test)
accuracy_score(y_pred, y_test)

0.9507951436388509

In [261]:
confusion_matrix(y_pred, y_test)

array([[65967,  2986],
       [  467,   756]], dtype=int64)