# Chapter 11: Spot-Check Classification Algorithms

## 11.3. Linear Machine Learning Algorithms

### 11.3.1. Logistic Regression

In [None]:
# Spot-check: Logistic Regression classifier on the Pima Indians diabetes data
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# Name of the CSV data file
filename = 'pima-indians-diabetes-dataset.csv'

# Column labels to assign to the loaded frame
names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Load the data; header=0 treats the file's first row as a header and
# `names` replaces whatever labels that row contained.
dataframe = pd.read_csv(filename, header=0, names=names)

# Columns 0..7 are used as model inputs, column 8 as the class label.
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Cross-validation splitter: 10 shuffled folds with a fixed seed.
kfold = KFold(random_state=7, shuffle=True, n_splits=10)

# max_iter is raised to 200 to avoid a warning.
model = LogisticRegression(max_iter=200)

# Evaluate the model: one accuracy score per fold.
results = cross_val_score(model, X, Y, scoring='accuracy', cv=kfold)

# Report the mean accuracy across the folds.
mean_accuracy = results.mean()
print("Accuracy: {:.3f}".format(mean_accuracy))

Accuracy: 0.772


### 11.3.2. Linear Discriminant Analysis

In [4]:
# Spot-check: Linear Discriminant Analysis classifier on the Pima Indians diabetes data
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Name of the CSV data file
filename = 'pima-indians-diabetes-dataset.csv'
# Column labels to assign to the loaded frame
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the data; header=0 treats the file's first row as a header and
# `names` replaces whatever labels that row contained.
dataframe = pd.read_csv(filename, names=names, header=0)

# Columns 0..7 are used as model inputs, column 8 as the class label.
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Number of cross-validation folds. This value now actually drives KFold;
# previously it was assigned but the fold count was hard-coded separately.
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)

model = LinearDiscriminantAnalysis()

# The metric is stated explicitly to match the logistic-regression cell;
# accuracy is what a classifier's default scorer reports, so the value is unchanged.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')

# Mean accuracy across the folds.
print(results.mean())

0.7669685577580315


## 11.4. Nonlinear Machine Learning Algorithms

### 11.4.1. k-Nearest Neighbors

In [5]:
# Spot-check: k-Nearest Neighbors classifier on the Pima Indians diabetes data
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Name of the CSV data file
filename = 'pima-indians-diabetes-dataset.csv'
# Column labels to assign to the loaded frame
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the data; header=0 treats the file's first row as a header and
# `names` replaces whatever labels that row contained.
dataframe = pd.read_csv(filename, names=names, header=0)

# Columns 0..7 are used as model inputs, column 8 as the class label.
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Number of cross-validation folds. This value now actually drives KFold;
# previously it was assigned but the fold count was hard-coded separately.
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)

# Default KNeighborsClassifier settings (no hyperparameters overridden).
model = KNeighborsClassifier()

# The metric is stated explicitly to match the logistic-regression cell;
# accuracy is what a classifier's default scorer reports, so the value is unchanged.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')

# Mean accuracy across the folds.
print(results.mean())

0.7109876965140123


### 11.4.2. Naive Bayes

In [6]:
# Spot-check: Gaussian Naive Bayes classifier on the Pima Indians diabetes data
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB

# Name of the CSV data file
filename = 'pima-indians-diabetes-dataset.csv'

# Column labels to assign to the loaded frame
names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Load the data; header=0 treats the file's first row as a header and
# `names` replaces whatever labels that row contained.
dataframe = pd.read_csv(filename, header=0, names=names)

# Columns 0..7 are used as model inputs, column 8 as the class label.
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Cross-validation splitter: 10 shuffled folds with a fixed seed.
kfold = KFold(random_state=7, shuffle=True, n_splits=10)

# Default GaussianNB settings (no hyperparameters overridden).
model = GaussianNB()

# One score per fold, using the estimator's default scorer.
results = cross_val_score(model, X, Y, cv=kfold)

# Mean score across the folds.
mean_score = results.mean()
print(mean_score)

0.7591421736158578


### 11.4.3. Classification and Regression Trees

In [7]:
# Spot-check: CART (decision tree) classifier on the Pima Indians diabetes data
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Name of the CSV data file
filename = 'pima-indians-diabetes-dataset.csv'
# Column labels to assign to the loaded frame
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Load the data; header=0 treats the file's first row as a header and
# `names` replaces whatever labels that row contained.
dataframe = pd.read_csv(filename, names=names, header=0)

# Columns 0..7 are used as model inputs, column 8 as the class label.
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Cross-validation splitter: 10 shuffled folds with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# Seed the tree as well as the splitter. Without random_state the tree
# permutes features randomly when searching for splits, so the mean score
# could change from one run to the next even though the folds are fixed.
# NOTE: seeding may yield a score different from one recorded by an earlier
# unseeded run.
model = DecisionTreeClassifier(random_state=7)

# One score per fold, using the estimator's default scorer.
results = cross_val_score(model, X, Y, cv=kfold)

# Mean score across the folds.
print(results.mean())

0.6928571428571428


### 11.4.4. Support Vector Machines

In [8]:
# Spot-check: Support Vector Machine classifier on the Pima Indians diabetes data
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC

# Name of the CSV data file
filename = 'pima-indians-diabetes-dataset.csv'

# Column labels to assign to the loaded frame
names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Load the data; header=0 treats the file's first row as a header and
# `names` replaces whatever labels that row contained.
dataframe = pd.read_csv(filename, header=0, names=names)

# Columns 0..7 are used as model inputs, column 8 as the class label.
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Cross-validation splitter: 10 shuffled folds with a fixed seed.
kfold = KFold(random_state=7, shuffle=True, n_splits=10)

# Default SVC settings (no hyperparameters overridden).
model = SVC()

# One score per fold, using the estimator's default scorer.
results = cross_val_score(model, X, Y, cv=kfold)

# Mean score across the folds.
mean_score = results.mean()
print(mean_score)

0.760457963089542
