In [20]:
pip install ucimlrepo


Note: you may need to restart the kernel to use updated packages.


In [262]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from ucimlrepo import fetch_ucirepo 
  
# Fetch the dataset with id 45 through ucimlrepo.
# NOTE(review): `heart_disease` is not referenced again in the visible cells;
# confirm it is still needed before keeping this call.
heart_disease = fetch_ucirepo(id=45)

# The working frame comes from a local CSV file, not from the fetched object.
csv_path = 'cleveland_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [263]:
# Select the features used for training the model.
#
# Columns are picked by name rather than by position. The positional form
# (df.columns[[0, 1, ..., 13]]) raised an IndexError when this cell was run a
# second time, because `df` had already been narrowed to 9 columns, and it
# depended on the column order of the CSV.
df = df.rename(columns={'num': 'res'})  # no-op if 'num' was already renamed
df = df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'exang', 'res']]
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,exang,res
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,0.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,1.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,1.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,0.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,0.0,0


In [264]:
# Check the dtype and non-null count of every selected column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   exang     303 non-null    float64
 8   res       303 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 21.4 KB


In [265]:
# Count the null values in each column.
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
exang       0
res         0
dtype: int64

In [266]:
def convert_one(x):
    """Binarize a label value: 0 stays 0, anything >= 1 becomes 1.

    Parameters
    ----------
    x : number
        Raw label value.

    Returns
    -------
    int
        1 when ``x >= 1``, 0 when ``x == 0``.

    Raises
    ------
    ValueError
        If ``x`` is neither 0 nor >= 1 (for example a negative value, a
        value strictly between 0 and 1, or NaN). Previously such inputs
        fell through and silently produced ``None``.
    """
    if x >= 1:
        return 1

    if x == 0:
        return 0

    # Fail loudly instead of leaking None into the label column.
    raise ValueError(f"unexpected label value: {x!r}")

# Map every label through convert_one, write the result back into 'res',
# then list the distinct values that remain.
binary_labels = df['res'].apply(convert_one)
df['res'] = binary_labels
df['res'].unique()

array([0, 1], dtype=int64)

In [268]:
# Split the dataset into training and test sets.

In [318]:
from sklearn.model_selection import train_test_split

# Feature matrix and label vector as plain NumPy arrays.
feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'exang']
X = df[feature_names].values
y = df['res'].values

# Hold out 30% of the rows for testing; stratify on the label and fix the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)


In [320]:
pip install xgboost




In [321]:
#evaluate xgboost model

import xgboost as xgb
from sklearn.metrics import accuracy_score

# Configure the classifier: depth-4 trees, learning rate 0.01, fixed seed.
model = xgb.XGBClassifier(
    max_depth=4,
    learning_rate=0.01,
    random_state=1,
)
gbm = model.fit(X_train, y_train)

# Accuracy on the training split.
y_train_pred = gbm.predict(X_train)
gbm_train = accuracy_score(y_train, y_train_pred)

# Accuracy on the held-out test split.
y_test_pred = gbm.predict(X_test)
gbm_test = accuracy_score(y_test, y_test_pred)

print(f'XGboost train/test accuracies {gbm_train:.3f}/{gbm_test:.3f}')

XGboost train/test accuracies 0.887/0.747


In [340]:
# Smoke-test the fitted model on one hand-written sample.
# Values are given in the same column order used to build the training matrix.
sample = {
    'age': 10.0, 'sex': 0.0, 'cp': 1.0, 'trestbps': 80.0,
    'chol': 200.0, 'fbs': 1.0, 'restecg': 2.0, 'exang': 1.0,
}
# NOTE: this rebinds `X`, replacing the full feature matrix with a 1-row array.
X = np.array([list(sample.values())])

y_pred_xgb = gbm.predict(X)
print(y_pred_xgb)


[0]


In [342]:
import pickle

In [344]:
# Wrap the fitted classifier in a dict and pickle it to 'saved_steps.pkl'
# in the current working directory.
# NOTE(review): XGBoost's Model IO docs recommend save_model/load_model for
# long-term storage; confirm pickle is acceptable for how this file is used.
data = {"model": gbm}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [346]:
# Read the pickle back; this rebinds `data` to the unpickled dict.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

# Pull the stored classifier out of the dict.
gbm_loaded = data["model"]

In [348]:
# Predict with the unpickled model on `X` (at this point the single
# hand-written sample from the earlier test cell) to check that it still
# produces a prediction after the save/load round trip.
y_pred = gbm_loaded.predict(X)
y_pred

array([0])