In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt,seaborn as sns

In [2]:
data = pd.read_csv('heart.csv')

In [3]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
data.dtypes.value_counts()

int64      13
float64     1
dtype: int64

In [5]:
feature_cols = [x for x in data.columns if x!='target']

In [6]:
def plot_decision_boundary(estimator, X, y):
    estimator.fit(X,y)
    X_color = X.sample(300, random_state=42)
    y_color = y.loc[X_color.index]
    y_color = y_color.map(lambda x: 'red' if x==1 else 'yellow')

    
    x_axis, y_axis = np.arange(0,1.005, 0.005), np.arange(0, 1.0005, 0.005)
    xx, yy = np.meshgrid(x_axis, y_axis)
    xx_ravel = xx.ravel()
    yy_ravel = yy.ravel()
    X_grid = pd.DataFrame([xx_ravel, yy_ravel]).T
    y_grid_predictions = estimator.predict(X_grid)
    y_grid_predictions = y_grid_predictions.reshape(xx.shape)
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.contourf(xx, yy, y_grid_predictions, cmap=plt.cm.autumn_r, alpha=.3)
    ax.scatter(X_color.iloc[:,0], X_color.iloc[:,1],color=y_color, alpha=1)
    ax.set(xlabel=fields[0], ylabel=fields[1], xlim=[0, 1], ylim=[0,1], title=str(estimator))

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=100)
train_idx, test_idx = next(sss.split(data[feature_cols], data['target']))

In [8]:
X_train = data.loc[train_idx, feature_cols]
X_test = data.loc[test_idx, feature_cols]
Y_train = data.loc[train_idx, 'target']
Y_test = data.loc[test_idx, 'target']

In [9]:
from sklearn.svm import LinearSVC

Lsvc = LinearSVC()
Lsvc.fit(X_train, Y_train)
y_pred = Lsvc.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix
print(classification_report(Y_test, y_pred))
print(f'Recall:\n {recall_score(Y_test, y_pred)}')
print(f'Accuracy_score:\n{accuracy_score(Y_test, y_pred)}')
print(f'f1_score:\n{f1_score(Y_test, y_pred)}')


             precision    recall  f1-score   support

          0       0.86      0.41      0.56        46
          1       0.65      0.94      0.77        54

avg / total       0.75      0.70      0.67       100

Recall:
 0.9444444444444444
Accuracy_score:
0.7
f1_score:
0.7727272727272727


In [11]:
param_grid = {'gamma':[0.001,0.01,0.1,0.5,1,2,10],
             'C':[0.01,0.1,1,10]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
GS_SVC = GridSearchCV(SVC(kernel='rbf'),
                     param_grid=param_grid,
                      n_jobs=-1,
                     scoring='accuracy')

In [12]:
GS_SVC.fit(X_train, Y_train)
GS_SVC.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
y_pred = GS_SVC.predict(X_test)

In [14]:
print(classification_report(Y_test, y_pred))
print(f'Recall:\n {recall_score(Y_test, y_pred)}')
print(f'Accuracy_score:\n{accuracy_score(Y_test, y_pred)}')
print(f'f1_score:\n{f1_score(Y_test, y_pred)}')

             precision    recall  f1-score   support

          0       0.66      0.46      0.54        46
          1       0.63      0.80      0.70        54

avg / total       0.64      0.64      0.63       100

Recall:
 0.7962962962962963
Accuracy_score:
0.64
f1_score:
0.7049180327868853


In [15]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt = dt.fit(X_train, Y_train)

In [16]:
dt.tree_.node_count, dt.tree_.max_depth

(75, 9)

In [17]:
param_grid = {'max_depth':range(1, dt.tree_.max_depth+1,2),
             'max_features':range(1, len(dt.feature_importances_)+1)}
GR_DT = GridSearchCV(DecisionTreeClassifier(),
                    param_grid = param_grid,
                    scoring='accuracy',
                    n_jobs=-1)
GR_DT = GR_DT.fit(X_train, Y_train)
GR_DT.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [18]:
GR_DT.best_estimator_.tree_.node_count

3

In [19]:
GR_DT.best_estimator_.tree_.max_depth

1

In [20]:
y_pred= GR_DT.predict(X_test)

In [21]:
print(classification_report(Y_test, y_pred))
print(f'Recall:\n {recall_score(Y_test, y_pred)}')
print(f'Accuracy_score:\n{accuracy_score(Y_test, y_pred)}')
print(f'f1_score:\n{f1_score(Y_test, y_pred)}')

             precision    recall  f1-score   support

          0       0.78      0.61      0.68        46
          1       0.72      0.85      0.78        54

avg / total       0.75      0.74      0.74       100

Recall:
 0.8518518518518519
Accuracy_score:
0.74
f1_score:
0.7796610169491525


In [23]:
corr = data.corr

In [24]:
corr

<bound method DataFrame.corr of      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  target  
0        0   0     1    

In [25]:
corr.plot()

AttributeError: 'function' object has no attribute 'plot'

In [26]:
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [36]:
df = data.drop(columns='target')
df_uniques = df.nunique()
df_uniques
# categorical_varibles = list(data.nunique())

age          41
sex           2
cp            4
trestbps     49
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
dtype: int64

In [37]:
categorical_varibles = list(df_uniques[(df_uniques<=6)].index)

In [41]:
categorical_varibles = categorical_varibles[1:-1]

In [42]:
categorical_varibles

['cp', 'fbs', 'restecg', 'exang', 'slope', 'ca']

In [43]:
data = pd.get_dummies(data, columns=categorical_varibles, drop_first=True)

In [44]:
data

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,thal,target,cp_1,cp_2,...,fbs_1,restecg_1,restecg_2,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4
0,63,1,145,233,150,2.3,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,37,1,130,250,187,3.5,2,1,0,1,...,0,1,0,0,0,0,0,0,0,0
2,41,0,130,204,172,1.4,2,1,1,0,...,0,0,0,0,0,1,0,0,0,0
3,56,1,120,236,178,0.8,2,1,1,0,...,0,1,0,0,0,1,0,0,0,0
4,57,0,120,354,163,0.6,2,1,0,0,...,0,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,123,0.2,3,0,0,0,...,0,1,0,1,1,0,0,0,0,0
299,45,1,110,264,132,1.2,3,0,0,0,...,0,1,0,0,1,0,0,0,0,0
300,68,1,144,193,141,3.4,3,0,0,0,...,1,1,0,0,1,0,0,1,0,0
301,57,1,130,131,115,1.2,3,0,0,0,...,0,1,0,1,1,0,1,0,0,0


In [49]:
feature_cols = [x for x in data.columns if x!='target']

In [50]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=100)
train_idx, test_idx = next(sss.split(data[feature_cols], data['target']))

In [51]:
X_train = data.loc[train_idx, feature_cols]
X_test = data.loc[test_idx, feature_cols]
Y_train = data.loc[train_idx, 'target']
Y_test = data.loc[test_idx, 'target']

In [52]:
from sklearn.svm import LinearSVC

Lsvc = LinearSVC()
Lsvc.fit(X_train, Y_train)
y_pred = Lsvc.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix
print(classification_report(Y_test, y_pred))
print(f'Recall:\n {recall_score(Y_test, y_pred)}')
print(f'Accuracy_score:\n{accuracy_score(Y_test, y_pred)}')
print(f'f1_score:\n{f1_score(Y_test, y_pred)}')

             precision    recall  f1-score   support

          0       0.92      0.26      0.41        46
          1       0.61      0.98      0.75        54

avg / total       0.75      0.65      0.59       100

Recall:
 0.9814814814814815
Accuracy_score:
0.65
f1_score:
0.75177304964539


In [59]:
linsvm = pd.DataFrame(data=[Y_test, y_pred]).T

In [61]:
linsvm['Unnamed 0'] = linsvm['Unnamed 0'].fillna(0)

In [64]:
linsvm

Unnamed: 0,target,Unnamed 0
27,1.0,1.0
151,1.0,0.0
183,0.0,0.0
35,1.0,1.0
162,1.0,0.0
...,...,...
171,0.0,0.0
54,1.0,0.0
161,1.0,0.0
221,0.0,0.0


In [65]:
linsvm.to_csv('linsvm.csv')

In [66]:
param_grid = {'gamma':[0.001,0.01,0.1,0.5,1,2,10],
             'C':[0.01,0.1,1,10]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
GS_SVC = GridSearchCV(SVC(kernel='rbf'),
                     param_grid=param_grid,
                      n_jobs=-1,
                     scoring='accuracy')

GS_SVC.fit(X_train, Y_train)
GS_SVC.best_estimator_

y_pred = GS_SVC.predict(X_test)

print(classification_report(Y_test, y_pred))
print(f'Recall:\n {recall_score(Y_test, y_pred)}')
print(f'Accuracy_score:\n{accuracy_score(Y_test, y_pred)}')
print(f'f1_score:\n{f1_score(Y_test, y_pred)}')

gasvm = pd.DataFrame(data=[Y_test, y_pred]).T
gasvm['Unnamed 0'] = linsvm['Unnamed 0'].fillna(0)
gasvm
gasvm.to_csv('gasvm.csv')

             precision    recall  f1-score   support

          0       0.66      0.59      0.62        46
          1       0.68      0.74      0.71        54

avg / total       0.67      0.67      0.67       100

Recall:
 0.7407407407407407
Accuracy_score:
0.67
f1_score:
0.7079646017699114


In [67]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt = dt.fit(X_train, Y_train)

param_grid = {'max_depth':range(1, dt.tree_.max_depth+1,2),
             'max_features':range(1, len(dt.feature_importances_)+1)}
GR_DT = GridSearchCV(DecisionTreeClassifier(),
                    param_grid = param_grid,
                    scoring='accuracy',
                    n_jobs=-1)
GR_DT = GR_DT.fit(X_train, Y_train)
y_pred= GR_DT.predict(X_test)
print(classification_report(Y_test, y_pred))
print(f'Recall:\n {recall_score(Y_test, y_pred)}')
print(f'Accuracy_score:\n{accuracy_score(Y_test, y_pred)}')
print(f'f1_score:\n{f1_score(Y_test, y_pred)}')

dtsvm = pd.DataFrame(data=[Y_test, y_pred]).T
dtsvm['Unnamed 0'] = linsvm['Unnamed 0'].fillna(0)
dtsvm
dtsvm.to_csv('dtsvm.csv')

             precision    recall  f1-score   support

          0       0.74      0.61      0.67        46
          1       0.71      0.81      0.76        54

avg / total       0.72      0.72      0.72       100

Recall:
 0.8148148148148148
Accuracy_score:
0.72
f1_score:
0.7586206896551724
