# Classification Model

In [19]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [24]:
# Read the heart-disease CSV into a DataFrame and preview its first rows
csv_path = 'data/heart-disease.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [25]:
# Number of rows (samples) in the dataset
data.shape[0]

303

In [26]:
# Seed NumPy's global RNG (scikit-learn falls back to it when no random_state is given)
np.random.seed(42)

# Separate the feature columns from the label column
X = data.drop(columns=["target"])
y = data["target"]

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the linear support-vector classifier and train it in one step
linear_svc = LinearSVC().fit(X_train, y_train)

# Mean accuracy of the LinearSVC on the held-out rows
linear_svc.score(X_test, y_test)




0.8688524590163934

In [27]:
# Instantiate and train a RandomForestClassifier with 100 trees.
# NOTE(review): no random_state is passed, so the forest depends on the global
# NumPy RNG state at the time this cell runs — consider passing one explicitly.
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)

# `linear_svc` is rebound to the forest because the following cells refer to the
# model under that name; `rf_clf` is the accurately named handle to the same object.
linear_svc = rf_clf

# Mean accuracy of the forest on the held-out test split
rf_clf.score(X_test, y_test)

0.8524590163934426

In [28]:
# Preview the first rows of the feature matrix (every column except `target`)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [29]:
# Preview the first rows of the label column (`target`)
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [30]:
# Use the trained model to make predictions on the held-out test features.
# NOTE: `linear_svc` was rebound to the RandomForestClassifier in the earlier
# random-forest cell, so these are random-forest predictions despite the name.
linear_svc.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [31]:
# True labels for the held-out test rows, to compare against the predictions above
y_test

179    0
228    0
111    1
246    0
60     1
      ..
249    0
104    1
300    0
193    0
184    0
Name: target, Length: 61, dtype: int64

In [32]:
# Fraction of test rows where the predicted label equals the true label
y_preds = linear_svc.predict(X_test)
(y_preds == y_test).mean()

0.8524590163934426

In [33]:
# Predict on only the first 5 rows of the test features (positional row slice)
first_five_rows = X_test.iloc[:5]
linear_svc.predict(first_five_rows)

array([0, 1, 1, 0, 1])

In [35]:
# Accuracy score: fraction of test labels that the predictions in `y_preds` got right.
# `accuracy_score` is already imported in the notebook's import cell, so it is
# not re-imported here.
accuracy = accuracy_score(y_test, y_preds)
accuracy

0.8524590163934426

In [39]:
# Save and load the model with pickle (`pickle` is imported in the notebook's import cell).
# The path is defined once so the save and load steps cannot drift apart.
MODEL_PATH = 'random_forest_model.pkl'

# Save the fitted model (`linear_svc` holds the random forest at this point)
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(linear_svc, model_file)

# Load the model back.
# SECURITY: pickle.load can execute arbitrary code during deserialization —
# only unpickle files you created yourself or otherwise trust.
with open(MODEL_PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Make predictions with the loaded model
predictions = loaded_model.predict(X_test)

# Sanity check: the round-tripped model must predict exactly like the in-memory one
assert np.array_equal(predictions, linear_svc.predict(X_test)), \
    "reloaded model disagrees with the in-memory model"
predictions

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])