### Загружаем библиотеки

In [52]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Загружаем, редактируем датасет

In [2]:
dataset = pd.read_csv('Data.csv')

In [3]:
dataset.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
dataset = dataset.drop('Sample code number', axis=1)

In [6]:
dataset.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [7]:
X = dataset.drop('Class', axis = 1)

In [8]:
y = dataset.Class

### Разделяем датасет на тестовый и тренировочный

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

### Стандартизируем значения

In [12]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

### Тренируем модели

#### 1. Логистическая регрессия 

In [24]:
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)

LogisticRegression()

#### 2. KNN - классификатор (указываем число ближайших соседий, способ измерения расстояния)

In [28]:
knn_clf = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=9)

#### 3. Линейный SVM

In [34]:
lin_svm_clf = SVC(kernel = 'linear')
lin_svm_clf.fit(X_train, y_train)

SVC(kernel='linear')

#### 3. Ядерный SVM (можно выбирать разные ядра)

In [41]:
kernel_svm_clf = SVC(kernel = 'rbf')
kernel_svm_clf.fit(X_train, y_train)

SVC()

#### 4. Naive Bayes классификатор

In [46]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

GaussianNB()

#### 5. Решающие деревья

In [50]:
dt_clf = DecisionTreeClassifier(criterion = 'entropy')
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

#### 5. Случайные леса

In [54]:
rf_clf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
rf_clf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10)

### Оцениваем эффективность модели

#### 1. Логистическая регрессия

In [39]:
y_pred_log = log_clf.predict(X_test)
log_cm = confusion_matrix(y_test, y_pred_log)
print(log_cm)
accuracy_score(y_test, y_pred_log).round(3)

[[114   2]
 [  2  53]]


0.977

#### 2. KNN - классификатор

In [38]:
y_pred_knn = knn_clf.predict(X_test)
cm_knn = confusion_matrix(y_test, y_pred_knn)
print(cm_knn)
accuracy_score(y_test, y_pred_knn).round(3)

[[114   2]
 [  2  53]]


0.977

#### 3. Линейный SVM

In [37]:
y_pred_lin_svm = lin_svm_clf.predict(X_test)
lin_svm_cm = confusion_matrix(y_test, y_pred_lin_svm)
print(lin_svm_cm)
accuracy_score(y_test, y_pred_lin_svm).round(3)

[[114   2]
 [  1  54]]


0.982

#### 3. Ядерный SVM

In [47]:
y_pred_kernel_svm = kernel_svm_clf.predict(X_test)
kernel_svm_cm = confusion_matrix(y_test, y_pred_kernel_svm)
print(kernel_svm_cm)
accuracy_score(y_test, y_pred_kernel_svm).round(3)

[[113   3]
 [  1  54]]


0.977

#### 4. Naive Bayes классификатор

In [48]:
y_pred_nb = nb_clf.predict(X_test)
nb_cm = confusion_matrix(y_test, y_pred_nb)
print(nb_cm)
accuracy_score(y_test, y_pred_nb).round(3)

[[111   5]
 [  1  54]]


0.965

#### 5. Решающие деревья

In [51]:
y_pred_dt = dt_clf.predict(X_test)
dt_cm = confusion_matrix(y_test, y_pred_dt)
print(dt_cm)
accuracy_score(y_test, y_pred_dt).round(3)

[[115   1]
 [  2  53]]


0.982

#### 5. Случайные леса

In [55]:
y_pred_rf = rf_clf.predict(X_test)
rf_cm = confusion_matrix(y_test, y_pred_rf)
print(rf_cm)
accuracy_score(y_test, y_pred_rf).round(3)

[[116   0]
 [  1  54]]


0.994