## Reading and Cleaning Data


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [8]:
df= pd.read_csv("heart.csv")

In [9]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [10]:
df.info

<bound method DataFrame.info of      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  target  
0        0   0     1    

In [11]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [12]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [13]:
df.shape

(303, 14)

In [14]:
df[df.duplicated()]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


In [15]:
print("Total Observations: " + str(df.shape))

df = df.drop_duplicates()

print("Total Observations After Removing Duplicates: " + str(df.shape))

Total Observations: (303, 14)
Total Observations After Removing Duplicates: (302, 14)


In [16]:
df.loc[:, "slope"] = df.loc[:, "slope"].map({0: "downsloping", 1: "flat", 2: "upsloping"})
df.loc[:, "thal"] = df.loc[:, "thal"].map({1: "fixed_effect", 2: "normal", 3: "reversable_defect", 0: "else"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Preprocessing

In [25]:
encode_cols = ["slope", "thal"]
dummies = pd.get_dummies(df[encode_cols], drop_first = True)
fin = pd.concat([df, dummies], axis = 1).drop(encode_cols, axis = 1)
fin

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,target,slope_flat,slope_upsloping,thal_fixed_effect,thal_normal,thal_reversable_defect
0,63,1,3,145,233,1,0,150,0,2.3,0,1,0,0,1,0,0
1,37,1,2,130,250,0,1,187,0,3.5,0,1,0,0,0,1,0
2,41,0,1,130,204,0,0,172,0,1.4,0,1,0,1,0,1,0
3,56,1,1,120,236,0,1,178,0,0.8,0,1,0,1,0,1,0
4,57,0,0,120,354,0,1,163,1,0.6,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,0,0,1,0,0,0,1
299,45,1,3,110,264,0,1,132,0,1.2,0,0,1,0,0,0,1
300,68,1,0,144,193,1,1,141,0,3.4,2,0,1,0,0,0,1
301,57,1,0,130,131,0,1,115,1,1.2,1,0,1,0,0,0,1


In [26]:
target = "target"
predictors = [col for col in fin.columns if col != target]

X_train, X_test, y_train, y_test = train_test_split(fin[predictors],
                                                    fin[target],
                                                    test_size = 0.25,
                                                    random_state = 42)


def cv_model(model, X = X_train, y = y_train):
    return cross_val_score(model, X, y, scoring = "accuracy", n_jobs = -1).mean()

In [27]:
for col in num_cols:   
    scaler = StandardScaler()

    X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1))
    X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

In [28]:
X_train2 = X_train.drop("fbs", axis = 1)
X_test2 = X_test.drop("fbs", axis = 1)

## Models

In [29]:
logreg = LogisticRegression(random_state = 42)

train_accuracy = {}
test_accuracy = {}


In [30]:
models = {
    "LogisticRegression": logreg}

In [31]:
for name, model in models.items():
    
    model.fit(X_train2, y_train)
    train_preds = model.predict(X_train2)
    test_preds = model.predict(X_test2)
    
    train_accuracy[name] = accuracy_score(train_preds, y_train).round(4)
    test_accuracy[name] = accuracy_score(test_preds, y_test).round(4)

In [32]:
scores = pd.DataFrame([train_accuracy, test_accuracy], 
                      index = ["TrainAccuracy", "TestAccuracy"]).T

In [33]:
scores

Unnamed: 0,TrainAccuracy,TestAccuracy
LogisticRegression,0.8628,0.8816
