Information about the Statlog (Heart) dataset can be found at https://archive.ics.uci.edu/dataset/145/statlog+heart

In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch the Statlog (Heart) dataset, UCI ML Repository id 145, through ucimlrepo.
statlog_heart = fetch_ucirepo(id=145)

In [2]:
# Dataset-level metadata: name, task, instance/feature counts, target column, etc.
statlog_heart.metadata

{'uci_id': 145,
 'name': 'Statlog (Heart)',
 'repository_url': 'https://archive.ics.uci.edu/dataset/145/statlog+heart',
 'data_url': 'https://archive.ics.uci.edu/static/public/145/data.csv',
 'abstract': 'This dataset is a heart disease database similar to a database already present in the repository (Heart Disease databases) but in a slightly different form',
 'area': 'Health and Medicine',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 270,
 'num_features': 13,
 'feature_types': ['Categorical', 'Real'],
 'demographics': ['Age', 'Sex'],
 'target_col': ['heart-disease'],
 'index_col': None,
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': None,
 'last_updated': 'Fri Feb 09 2024',
 'dataset_doi': '10.24432/C57303',
 'creators': [],
 'intro_paper': None,
 'additional_info': {'summary': 'Cost Matrix\r\n\r\n_______\t abse  pres\r\nabsence\t 0\t1\r\npresence  5\t0\r\n\r\nwhere the rows represent the true values 

In [3]:
# Per-variable table: name, role, type, demographic, description, units, missing_values.
statlog_heart.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Continuous,Age,,,no
1,sex,Feature,Binary,Sex,,,no
2,chest-pain,Feature,Categorical,,chest pain type,,no
3,rest-bp,Feature,Continuous,,resting blood pressure,,no
4,serum-chol,Feature,Continuous,,serum cholesterol,mg/dl,no
5,fasting-blood-sugar,Feature,Binary,,fasting blood sugar > 120 mg/dl,,no
6,electrocardiographic,Feature,Categorical,,resting electrocardiographic results,,no
7,max-heart-rate,Feature,Continuous,,maximum heart rate achieved,,no
8,angina,Feature,Binary,,exercise induced anigna,,no
9,oldpeak,Feature,Continuous,,oldpeak = ST depression induced by exercise re...,,no


In [4]:
  # data (as pandas dataframes) 
# Unpack the feature frame and the target frame in a single statement.
X, y = statlog_heart.data.features, statlog_heart.data.targets

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
X.describe()

Unnamed: 0,age,sex,chest-pain,rest-bp,serum-chol,fasting-blood-sugar,electrocardiographic,max-heart-rate,angina,oldpeak,slope,major-vessels,thal
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [11]:
# Pairwise correlation between features, rendered as a heatmap-coloured table.
feature_corr = X.corr()
feature_corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,sex,chest-pain,rest-bp,serum-chol,fasting-blood-sugar,electrocardiographic,max-heart-rate,angina,oldpeak,slope,major-vessels,thal
age,1.0,-0.094401,0.09692,0.273053,0.220056,0.123458,0.128171,-0.402215,0.098297,0.194234,0.159774,0.356081,0.1061
sex,-0.094401,1.0,0.034636,-0.062693,-0.201647,0.04214,0.039253,-0.076101,0.180022,0.097412,0.050545,0.08683,0.391046
chest-pain,0.09692,0.034636,1.0,-0.043196,0.090465,-0.098537,0.074325,-0.317682,0.35316,0.167244,0.1369,0.22589,0.262659
rest-bp,0.273053,-0.062693,-0.043196,1.0,0.173019,0.155681,0.116157,-0.039136,0.082793,0.2228,0.142472,0.085697,0.132045
serum-chol,0.220056,-0.201647,0.090465,0.173019,1.0,0.025186,0.167652,-0.018739,0.078243,0.027709,-0.005755,0.126541,0.028836
fasting-blood-sugar,0.123458,0.04214,-0.098537,0.155681,0.025186,1.0,0.053499,0.022494,-0.004107,-0.025538,0.044076,0.123774,0.049237
electrocardiographic,0.128171,0.039253,0.074325,0.116157,0.167652,0.053499,1.0,-0.074628,0.095098,0.120034,0.160614,0.114368,0.007337
max-heart-rate,-0.402215,-0.076101,-0.317682,-0.039136,-0.018739,0.022494,-0.074628,1.0,-0.380719,-0.349045,-0.386847,-0.265333,-0.253397
angina,0.098297,0.180022,0.35316,0.082793,0.078243,-0.004107,0.095098,-0.380719,1.0,0.274672,0.255908,0.153347,0.321449
oldpeak,0.194234,0.097412,0.167244,0.2228,0.027709,-0.025538,0.120034,-0.349045,0.274672,1.0,0.609712,0.255005,0.324333


In [9]:
# Summary statistics of the target column; its values range from 1 to 2.
y.describe()

Unnamed: 0,heart-disease
count,270.0
mean,1.444444
std,0.497827
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,2.0


In [19]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG too: scikit-learn objects created later without an
# explicit random_state draw from it, so this keeps them repeatable on a
# fresh Restart & Run All.
import numpy
numpy.random.seed(42)

# Pin the split itself with random_state instead of relying on the global RNG
# state, which any cell executed in between could advance. Seed 42 matches the
# global seed above, so the 70/30 partition should be the same one as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [20]:
# Summary statistics of the training features, for comparison with the full data.
X_train.describe()

Unnamed: 0,age,sex,chest-pain,rest-bp,serum-chol,fasting-blood-sugar,electrocardiographic,max-heart-rate,angina,oldpeak,slope,major-vessels,thal
count,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0
mean,54.751323,0.693122,3.21164,131.809524,251.529101,0.137566,1.042328,149.31746,0.37037,1.155556,1.608466,0.698413,4.783069
std,9.132393,0.462423,0.966278,17.952233,52.55489,0.345359,0.999099,22.877443,0.484186,1.195707,0.614655,0.967035,1.948957
min,29.0,0.0,1.0,94.0,141.0,0.0,0.0,88.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,215.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,3.0
50%,56.0,1.0,4.0,130.0,244.0,0.0,2.0,154.0,0.0,1.0,2.0,0.0,3.0
75%,62.0,1.0,4.0,140.0,282.0,0.0,2.0,166.0,1.0,1.8,2.0,1.0,7.0
max,74.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [21]:
# Summary statistics of the held-out test features, for comparison with the training split.
X_test.describe()

Unnamed: 0,age,sex,chest-pain,rest-bp,serum-chol,fasting-blood-sugar,electrocardiographic,max-heart-rate,angina,oldpeak,slope,major-vessels,thal
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,53.691358,0.641975,3.08642,130.259259,245.296296,0.17284,0.975309,150.518519,0.234568,0.803704,1.530864,0.604938,4.493827
std,9.067307,0.482407,0.911009,17.711421,49.644094,0.380464,0.999691,23.94844,0.426369,0.981,0.614134,0.88993,1.917834
min,35.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,46.0,0.0,3.0,120.0,208.0,0.0,0.0,138.0,0.0,0.0,1.0,0.0,3.0
50%,54.0,1.0,3.0,130.0,249.0,0.0,0.0,153.0,0.0,0.5,1.0,0.0,3.0
75%,59.0,1.0,4.0,140.0,273.0,0.0,2.0,167.0,0.0,1.4,2.0,1.0,7.0
max,77.0,1.0,4.0,192.0,407.0,1.0,2.0,195.0,1.0,4.0,3.0,3.0,7.0


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Fix the forest's seed so the fitted model is reproducible from run to run.
clf = RandomForestClassifier(random_state=42)

# y_train is a single-column DataFrame of shape (n_samples, 1). scikit-learn
# expects a 1-D target and warns about a column vector otherwise, so flatten
# it to shape (n_samples,) before fitting.
clf.fit(X_train, y_train.values.ravel())

  return fit_method(estimator, *args, **kwargs)


In [25]:
# Check the target's shape: it is a 2-D (n_samples, 1) frame, not a 1-D vector.
y_train.shape

(189, 1)