In [28]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Loading Heart Dataset

In [37]:
df = pd.read_csv('heart_disease.csv')
df.sample(50)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
133,52,1,0,125.152471,211.954241,0,1.0,167.850104,0,0.985901,2.0,2,3.082813,0
31,45,0,1,130.014076,234.236787,0,0.0,175.029338,0,0.569119,1.0,0,1.857329,1
54,56,1,1,119.934048,236.054538,0,1.0,177.953598,0,0.944532,2.0,0,2.144511,1
207,56,1,2,,256.189595,1,0.0,141.981335,1,0.606726,1.0,1,0.983927,0
75,58,1,0,145.979414,218.102085,0,1.0,104.95275,0,1.963749,1.0,1,3.136325,0
164,64,1,3,169.797158,227.099945,0,0.0,155.01616,0,0.533652,1.0,0,3.272126,1
72,59,1,3,159.912844,272.954293,0,0.0,124.980398,0,0.105165,2.0,0,2.047473,0
203,61,1,0,139.931504,206.958953,0,0.0,,1,2.093503,2.0,1,2.998184,0
199,46,0,1,104.942611,204.083958,0,1.0,172.166496,0,-0.176438,2.0,0,1.970499,1
71,42,1,0,135.818989,315.085809,0,1.0,124.923144,1,1.81986,1.0,0,0.858554,0


# Feature descriptions
### Below is the group of features presents in the dataset segregated by their type (numerical, categorical, ordinal, binary)

# Binary
### sex (0 = female; 1 = male)
### fbs: Fasting blood sugar > 120 mg/dl
### exang: Exercise induced angina (0 = no; 1 = yes)
# Categorical
### cp: Chest pain type (0 = Asymptomatic angina; 1 = Atypical angina; 2 = Non-angina; 3 = Typical angina)
### restecg: Resting ECG (0 = Left ventricular hypertrophy; 1 = Normal; 2 = ST-T wave abnormality)
### slope: Slope of the peak exercise ST segment (0 = downsloping; 1 = upsloping; 2 = flat)
### thal: Thalium stress test result (0 = NA; 1 = Fixed defect; 2 = Normal; 3 = Reversible defect)
# Ordinal
### ca: number of major vessels (0-3) colored by flourosopy
# Numeric
### age
### oldpeak: ST depression induced by exercise relative to rest
### trestbps: Resting blood pressure
### chol: Serum cholestoral in mg/dl
### thalach: Maximum heart rate achieved during thalium stress test
# Target
### target: 1 = heart disease; 0 = no heart disease

In [30]:
bins = ['sex', 'fbs', 'exang']
cats = ['cp', 'restecg', 'slope', 'thal']
ords = ['ca']
nums = ['age', 'oldpeak', 'trestbps', 'chol', 'thalach']
target = ['target']

In [31]:
# simple numeric encoding the categorical variables for easy data exploration, you may reverse it later to numeric value for model training and testing

df.cp = df.cp.replace({0:'Asympt.', 1:'Atypical', 2:'Non', 3:'Typical'})
df.restecg = df.restecg.replace({0:'LV hyper', 1:'Normal', 2:'ST-T wave'})
df.slope = df.slope.replace({0:'down', 1:'up', 2:'flat'})
df.thal = df.thal.replace({0:'NA', 1:'Fixed', 2:'Normal', 3:'Revers.'})

In [32]:
df.describe()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,thal,target
count,212.0,212.0,205.0,202.0,212.0,208.0,212.0,200.0,212.0,211.0,212.0
mean,54.311321,0.688679,131.78461,244.133256,0.132075,149.647978,0.34434,1.113106,0.731132,2.349112,0.542453
std,9.145339,0.46413,18.057222,46.444257,0.339374,22.076206,0.476277,1.255908,1.038762,0.602117,0.499374
min,29.0,0.0,93.944184,126.085811,0.0,88.032613,0.0,-0.185668,0.0,0.858554,0.0
25%,47.0,0.0,119.968114,211.969594,0.0,135.946808,0.0,0.050778,0.0,1.949795,0.0
50%,55.0,1.0,130.010256,241.467023,0.0,151.939216,0.0,0.72606,0.0,2.078759,1.0
75%,61.0,1.0,139.96547,272.484222,0.0,165.260092,1.0,1.816733,1.0,2.970842,1.0
max,77.0,1.0,192.0202,406.932689,1.0,202.138041,1.0,6.157114,4.0,3.277466,1.0


# Load Iris Datset

# Feature descriptions
### The columns in this dataset are:

### Id
### SepalLengthCm
### SepalWidthCm
### PetalLengthCm
### PetalWidthCm
### Species

In [33]:
df_iris = pd.read_csv('iris_dataset.csv')

In [34]:
df_iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,105.0,101.0,97.0,105.0
mean,5.858909,3.059083,3.81237,1.199708
std,0.861638,0.455116,1.793489,0.787193
min,4.344007,1.94601,1.033031,-0.072203
25%,5.159145,2.768688,1.545136,0.333494
50%,5.736104,3.049459,4.276817,1.331797
75%,6.435413,3.290318,5.094427,1.817211
max,7.795561,4.409565,6.768611,2.603123


In [35]:
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.04507,2.508203,3.018024,1.164924,Iris-versicolor
1,6.325517,2.115481,4.542052,1.413651,Iris-versicolor
2,5.257497,3.814303,1.47066,0.395348,Iris-setosa
3,6.675168,3.2017,5.785461,2.362764,Iris-virginica
4,5.595237,2.678166,4.07775,1.369266,Iris-versicolor


In [36]:
df_iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')