In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import SGDClassifier 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score, recall_score 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV 
%matplotlib inline 
sns.set(style="ticks")


In [3]:
data=pd.read_csv('heart.csv', sep=",")

In [4]:
data.dtypes

age         int64
sex         int64
cp          int64
trestbps    int64
chol        int64
restecg     int64
thalach     int64
slope       int64
ca          int64
thal        int64
target      int64
dtype: object

In [5]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
restecg     0
thalach     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,slope,ca,thal,target
0,63,1,3,145,233,0,150,0,0,1,1
1,37,1,2,130,250,1,187,0,0,2,1
2,41,0,1,130,204,0,172,2,0,2,1
3,56,1,1,120,236,1,178,2,0,2,1
4,57,0,0,120,354,1,163,2,0,2,1


Разделение выборки на обучающую и тестовую

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data, data['target'], test_size= 0.2, random_state= 1)

In [8]:
# Размер обучающей выборки
X_train.shape, y_train.shape

((242, 11), (242,))

In [9]:
# Размер тестовой выборки
X_test.shape, y_test.shape

((61, 11), (61,))

In [10]:
#обучение линейной модели
sgd = SGDClassifier().fit(X_train, y_train)



In [12]:
target_sgd = sgd.predict(X_test)

In [13]:

accuracy_score(y_test, target_sgd), \
precision_score(y_test, target_sgd),  \
recall_score(y_test, target_sgd)

(0.6557377049180327, 0.65625, 0.6774193548387096)

In [14]:
#обучение SVC
svc = SVC(gamma='auto').fit(X_train, y_train)

In [15]:
target_svc = svc.predict(X_test)

In [16]:
accuracy_score(y_test, target_svc), \
precision_score(y_test, target_svc),  \
recall_score(y_test, target_svc)

(0.5081967213114754, 0.5081967213114754, 1.0)

In [17]:
#обучение дерева решений

In [18]:
tree = DecisionTreeClassifier(random_state=1, max_depth=0.75).fit(X_train, y_train)

In [20]:
target_tree = tree.predict(X_test)

In [21]:
accuracy_score(y_test, target_tree), \
precision_score(y_test, target_tree),  \
recall_score(y_test, target_tree)

(0.5081967213114754, 0.5081967213114754, 1.0)

Подбор гиперпараметра

In [22]:
scores_sgd = cross_val_score(SGDClassifier(), 
                         X_train, y_train, cv=2)
scores_sgd



array([0.55371901, 0.66115702])

In [23]:
scores_svm_svc = cross_val_score(SVC(gamma='auto'), 
                         X_train, y_train, cv=2)
scores_svm_svc

array([0.55371901, 0.55371901])

In [24]:
scores_decision_tree = cross_val_score(DecisionTreeClassifier(), 
                         X_train, y_train, cv=2)
scores_decision_tree

array([1., 1.])

In [25]:
parameters = {'alpha':[0.5,0.4,0.3,0.2,0.1]}
clf_gs_sgd = GridSearchCV(SGDClassifier(), parameters, cv=2, scoring='accuracy')
clf_gs_sgd.fit(X_train, y_train)



GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.5, 0.4, 0.3, 0.2, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [26]:
#для линейной модели
clf_gs_sgd.best_params_

{'alpha': 0.1}

In [27]:
parameters = {'gamma':[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]}
clf_gs_svm_svc = GridSearchCV(SVC(), parameters, cv=2, scoring='accuracy')
clf_gs_svm_svc.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'gamma': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [28]:
#для SVC

clf_gs_svm_svc.best_params_

{'gamma': 0.9}

In [29]:
parameters = {'min_impurity_decrease':[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]}
clf_gs_decision_tree = GridSearchCV(DecisionTreeClassifier(), parameters, cv=2, scoring='accuracy')
clf_gs_decision_tree.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_impurity_decrease': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [30]:
#для дерева решений
clf_gs_decision_tree.best_params_

{'min_impurity_decrease': 0.4}

Для найденных оптимальных значений:

In [31]:
sgd2 = SGDClassifier(alpha=0.1).fit(X_train, y_train)



In [32]:
target_sgd2= sgd2.predict(X_test)

In [33]:
accuracy_score(y_test, target_sgd2), \
precision_score(y_test, target_sgd2),  \
recall_score(y_test, target_sgd2)

(0.5573770491803278, 0.75, 0.1935483870967742)

In [34]:
svc2 = SVC(gamma=0.9).fit(X_train, y_train)

In [35]:
target_svc2 = svc2.predict(X_test)

In [36]:
accuracy_score(y_test, target_svc2), \
precision_score(y_test, target_svc2),  \
recall_score(y_test, target_svc2)

(0.5081967213114754, 0.5081967213114754, 1.0)

In [37]:
tree2 = DecisionTreeClassifier(random_state=1, min_impurity_decrease=0.4, max_depth=0.75).fit(X_train, y_train)

In [38]:
target_tree2 = tree2.predict(X_test)

In [39]:
accuracy_score(y_test, target_tree2), \
precision_score(y_test, target_tree2),  \
recall_score(y_test, target_tree2)

(0.5081967213114754, 0.5081967213114754, 1.0)