<a href="https://colab.research.google.com/github/moosenasser/ipynb-samples/blob/main/machine-learning/classification/binary_classification_with_cardiovascular_disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Note: You are currently reading this using Google Colaboratory, which is a cloud-hosted version of Jupyter Notebook. This is a document containing both text cells for documentation and runnable code cells. If you are unfamiliar with Jupyter Notebook, watch this 3-minute introduction before starting this challenge: https://www.youtube.com/watch?v=inN8seMm7UI*

---

The goal of this notebook is to predict cardiovascular disease. It uses scikit-learn 0.24.2 to train a support vector machine (SVM) classifier for this task.


In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [2]:
# A BMI strictly above this value flags a row as overweight.
bmi_overweight_threshold = 25

# Columns that the preprocessing step re-encodes to 0/1.
cols_to_norm = [
    'gender',
    'cholesterol',
    'gluc',
]

# Columns passed to the model as inputs. 'overweight' and 'blood_pressure'
# are not in the raw file; they are derived in a later cell.
features = [
    'age',
    'gender',
    'height',
    'weight',
    'cholesterol',
    'gluc',
    'smoke',
    'active',
    'overweight',
    'alco',
    'blood_pressure',
]

# Path of the medical examination CSV that the notebook loads.
med_exam_csv_path = '/content/sample_data/medical_examination.csv'

In [3]:
# Read the medical examination CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=med_exam_csv_path)

In [4]:
# Preview the first five rows of the data as loaded.
df.head(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
# Rescale two raw columns: age is divided by 365 (presumably days -> years)
# and height by 100 (presumably centimetres -> metres).
# NOTE: this overwrites the columns, so re-running the cell rescales them again.
df['age'] = df['age'].div(365)
df['height'] = df['height'].div(100)

# Per-row body-mass index as a Series: weight / height squared.
bmis = df['weight'].div(df['height'].pow(2))

# Boolean 'overweight' column: True where the BMI exceeds the threshold.
df['overweight'] = bmis.gt(bmi_overweight_threshold)

# 'blood_pressure' column: quotient of systolic (ap_hi) and diastolic (ap_lo)
# pressures.
df['blood_pressure'] = df['ap_hi'].div(df['ap_lo'])

In [6]:
# Keep only the rows whose 'blood_pressure' ratio is a finite number.
# A zero 'ap_lo' makes the ratio +/- infinity, and 0 / 0 makes it NaN. The
# isfinite mask removes both cases; an isinf-only check would leave NaN rows
# in the frame, and scikit-learn estimators reject NaN inputs.
finite_bp_mask = np.isfinite(df['blood_pressure'])

# Take an explicit copy so later column assignments on `df` do not raise
# SettingWithCopyWarning.
df = df.loc[finite_bp_mask].copy()

In [7]:
# Binarize every column listed in cols_to_norm ('gender', 'cholesterol', 'gluc'):
# values greater than 1 become 1 and all other values become 0.
# For 'cholesterol' and 'gluc' this makes 0 always good and 1 always bad;
# for 'gender' it simply re-encodes the codes as 0/1.
# A single vectorized comparison replaces the earlier flatten -> set -> dict ->
# replace round trip and yields the same 0/1 integers.
# NOTE: not idempotent. Once the values are 0/1, re-running this cell maps
# everything to 0, so reload the data before running it again.
df[cols_to_norm] = df[cols_to_norm].gt(1).astype('int64')

In [8]:
# Preview the first five rows after preprocessing.
df.head(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,overweight,blood_pressure
0,0,50.391781,1,1.68,62.0,110,80,0,0,0,0,1,0,False,1.375
1,1,55.419178,0,1.56,85.0,140,90,1,0,0,0,1,1,True,1.555556
2,2,51.663014,0,1.65,64.0,130,70,1,0,0,0,0,1,False,1.857143
3,3,48.282192,1,1.69,82.0,150,100,0,0,0,0,1,1,True,1.5
4,4,47.873973,0,1.56,56.0,100,60,0,0,0,0,0,0,False,1.666667


In [9]:
# Split the feature columns and the 'cardio' target into training and
# validation subsets. The fixed random_state makes the shuffle reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    df[features],
    df['cardio'],
    random_state=1,
)

In [10]:
# Fit a support vector classifier with scikit-learn's default settings on the
# training split. fit() returns the estimator itself, so `model` is the fitted SVC.
# NOTE(review): the features are passed in unscaled (e.g. weight next to 0/1
# flags). The scikit-learn SVM guide recommends scaling inputs, so consider
# standardizing them and confirming the effect on validation accuracy.
model = SVC().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
# Mean accuracy of the fitted model on the held-out validation split,
# reported to three decimal places.
score_template = 'Score (mean accuracy) for validation data: {:.3f}'
valid_score = model.score(X_valid, y_valid)
print(score_template.format(valid_score))

Score (mean accuracy) for validation data: 0.633
