# Evaluate the Performance of Machine Learning Algorithms with Resampling

## Split into Train and Test Sets

In [1]:
# Evaluate using a train and a test set
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Tên file dữ liệu
filename = 'pima-indians-diabetes-dataset.csv'
# Đặt tên cột phù hợp
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Đọc dữ liệu
dataframe = pd.read_csv(filename, names=names, header=0)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# Chia tập train/test
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# Huấn luyện model
model = LogisticRegression(max_iter=1000)  # Thêm max_iter nếu cảnh báo convergence
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
# In kết quả
print("Accuracy: %.3f%%" % (result * 100.0))

Accuracy: 78.740%


## K-fold Cross Validation

In [2]:
# Evaluate using Cross Validation
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Tên file dữ liệu
filename = 'pima-indians-diabetes-dataset.csv'
# Đặt tên cột phù hợp
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Đọc dữ liệu
dataframe = pd.read_csv(filename, names=names, header=0)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# Thiết lập Cross-Validation
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# Huấn luyện model
model = LogisticRegression(max_iter=1000)
results = cross_val_score(model, X, Y, cv=kfold)
# In kết quả
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))

Accuracy: 77.216% (4.968%)


## Leave One Out Cross Validation

In [None]:
# Evaluate using Leave One Out Cross Validation
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Tên file dữ liệu
filename = 'pima-indians-diabetes-dataset.csv'
# Đặt tên cột phù hợp
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Đọc dữ liệu
dataframe = pd.read_csv(filename, names=names, header=0)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# Khởi tạo Leave-One-Out CV
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=200)  # Thêm max_iter để tránh warning
# Đánh giá mô hình
results = cross_val_score(model, X, Y, cv=loocv)
# In kết quả
print("Accuracy: {:.3f}% ({:.3f}%)".format(results.mean() * 100.0, results.std() * 100.0))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 77.604% (41.689%)


## Repeated Random Test-Train Splits

In [10]:
# Evaluate using Shuffle Split Cross Validation
from pandas import read_csv
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Tên file dữ liệu
filename = 'pima-indians-diabetes-dataset.csv'
# Đặt tên cột phù hợp
names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Đọc dữ liệu
dataframe = read_csv(filename, names=names, header=0)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# Cấu hình đánh giá
n_splits = 10
test_size = 0.33
seed = 7
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
# Khởi tạo mô hình
model = LogisticRegression(max_iter=200)  # thêm max_iter để tránh warning hội tụ
# Đánh giá mô hình
results = cross_val_score(model, X, Y, cv=kfold)
# In kết quả theo cú pháp Python 3
print("Accuracy: {:.3f}% ({:.3f}%)".format(results.mean() * 100.0, results.std() * 100.0))

Accuracy: 76.535% (2.235%)
