In [1]:
# Import libraries needed for this analysis.
import altair as alt
import numpy as np
import pandas as pd
import string
from sklearn import tree
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import (
    FunctionTransformer,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    normalize,
    scale,
    )
from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.svm import SVC, SVR

In [2]:
# Load the challenge dataset from its CSV file (path is relative to the working directory).
csv_path = "Dataset_DSChallenge.csv"
awake_df = pd.read_csv(csv_path)

In [3]:
# Preview the first few rows to confirm the file loaded and to see the raw column names.
awake_df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,o2Saturation
0,63.0,1.0,3.0,145.0,233.0,1.0,0.0,150.0,0.0,2.3,0.0,0.0,1.0,1.0,98.6
1,37.0,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2.0,1.0,98.6
2,41.0,0.0,1.0,130.0,204.0,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2.0,1.0,98.6
3,56.0,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2.0,1.0,98.6
4,57.0,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2.0,1.0,98.1


In [4]:
# Replace the dataset's terse column codes with more descriptive names.
# Columns not listed here (e.g. age, sex, oldpeak) keep their original names.
readable_names = {
    'cp': 'chest_pain',
    'trtbps': 'Rblood_pressure',
    'chol': 'cholesterol',
    'fbs': 'Fblood_sugar',
    'restecg': 'Rest_ECG',
    'thalachh': 'Max_heart_rate',
    'exng': 'exercize_angina',
    'slp': 'slope',
    'caa': 'major_vessels',
    'thall': 'TStress_result',
    'output': 'Heart_attack',
    'o2Saturation': 'Blood_oxygen',
}
awake_df = awake_df.rename(columns=readable_names)

# Show the first rows again to confirm the new column names.
awake_df.head()

Unnamed: 0,age,sex,chest_pain,Rblood_pressure,cholesterol,Fblood_sugar,Rest_ECG,Max_heart_rate,exercize_angina,oldpeak,slope,major_vessels,TStress_result,Heart_attack,Blood_oxygen
0,63.0,1.0,3.0,145.0,233.0,1.0,0.0,150.0,0.0,2.3,0.0,0.0,1.0,1.0,98.6
1,37.0,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2.0,1.0,98.6
2,41.0,0.0,1.0,130.0,204.0,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2.0,1.0,98.6
3,56.0,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2.0,1.0,98.6
4,57.0,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2.0,1.0,98.1


In [8]:
# Use .info() to list each column's non-null count and dtype, which exposes missing values.
awake_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              303 non-null    float64
 1   sex              303 non-null    float64
 2   chest_pain       303 non-null    float64
 3   Rblood_pressure  303 non-null    float64
 4   cholesterol      303 non-null    float64
 5   Fblood_sugar     303 non-null    float64
 6   Rest_ECG         303 non-null    float64
 7   Max_heart_rate   303 non-null    float64
 8   exercize_angina  303 non-null    float64
 9   oldpeak          303 non-null    float64
 10  slope            303 non-null    float64
 11  major_vessels    303 non-null    float64
 12  TStress_result   303 non-null    float64
 13  Heart_attack     303 non-null    float64
 14  Blood_oxygen     3586 non-null   float64
dtypes: float64(15)
memory usage: 420.4 KB


Our dataset has 15 columns: 14 predictor features plus the target, `Heart_attack`. Fourteen of these columns (including the target) have only 303 non-null values, whereas `Blood_oxygen` has 3586. A closer look reveals that the dataset is complete only for the first 303 entries, which explains the discrepancy between `Blood_oxygen` and all the other columns. We will therefore remove the rows that contain null (NaN) values.

In [10]:
# Drop every row whose target column, Heart_attack, is NaN.
awake_df = awake_df.dropna(subset=["Heart_attack"])
# Preview the filtered frame.
awake_df.head()

Unnamed: 0,age,sex,chest_pain,Rblood_pressure,cholesterol,Fblood_sugar,Rest_ECG,Max_heart_rate,exercize_angina,oldpeak,slope,major_vessels,TStress_result,Heart_attack,Blood_oxygen
0,63.0,1.0,3.0,145.0,233.0,1.0,0.0,150.0,0.0,2.3,0.0,0.0,1.0,1.0,98.6
1,37.0,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2.0,1.0,98.6
2,41.0,0.0,1.0,130.0,204.0,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2.0,1.0,98.6
3,56.0,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2.0,1.0,98.6
4,57.0,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2.0,1.0,98.1


In [11]:
# Verify that the NaN rows are gone: every column should now report the same non-null count.
awake_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303 entries, 0 to 302
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              303 non-null    float64
 1   sex              303 non-null    float64
 2   chest_pain       303 non-null    float64
 3   Rblood_pressure  303 non-null    float64
 4   cholesterol      303 non-null    float64
 5   Fblood_sugar     303 non-null    float64
 6   Rest_ECG         303 non-null    float64
 7   Max_heart_rate   303 non-null    float64
 8   exercize_angina  303 non-null    float64
 9   oldpeak          303 non-null    float64
 10  slope            303 non-null    float64
 11  major_vessels    303 non-null    float64
 12  TStress_result   303 non-null    float64
 13  Heart_attack     303 non-null    float64
 14  Blood_oxygen     303 non-null    float64
dtypes: float64(15)
memory usage: 37.9 KB


Above we see that our data is now free of null values. Let's look at some summary statistics for our dataset.

In [12]:
# Use .describe() to get per-column summary statistics (count, mean, std, min, quartiles, max).
awake_df.describe()

Unnamed: 0,age,sex,chest_pain,Rblood_pressure,cholesterol,Fblood_sugar,Rest_ECG,Max_heart_rate,exercize_angina,oldpeak,slope,major_vessels,TStress_result,Heart_attack,Blood_oxygen
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554,97.484488
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835,0.352649
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,96.5
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0,97.5
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0,97.5
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0,97.5
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0,98.6


Above we can readily see some quick statistics on the dataset, such as the mean, minimum, and maximum values. For instance, the mean age is around 54 and the mean blood oxygen saturation is around 97.5. The youngest person is 29 and the oldest is 77; the lowest maximum heart rate recorded is 71 and the highest is 202; and the blood oxygen saturation ranges from 96.5 to 98.6.