In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import os
import numpy as np
import cv2
from skimage import feature, io, color
from sklearn.metrics import classification_report
import joblib

from utils import *

In [3]:
# Relative path to the folder holding the image dataset.
data_dir = "../data/images"

# load_data_from_folder is not defined in this notebook; presumably it is
# provided by `from utils import *` -- confirm. Its result is unpacked into
# the inputs `X` and the targets `y` used by every cell below.
loaded = load_data_from_folder(data_dir)
X, y = loaded

In [33]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split identical on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# *SVM*

In [None]:
# Tune an SVC with an exhaustive grid search (5-fold cross-validation on the
# training split, scored by accuracy), report the best configuration, evaluate
# it on the held-out test split and persist the refitted best estimator.
svm = SVC()

C_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.1, 0.5, 1, 5, 10]

# `gamma` only affects the 'rbf', 'poly' and 'sigmoid' kernels. Sweeping it for
# the linear kernel just refits the same model once per gamma value, so the
# linear kernel gets its own sub-grid without `gamma`. Consequently
# `best_params_` has no 'gamma' key if the linear kernel wins.
param_grid = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': C_values, 'gamma': gamma_values},
]

# n_jobs=-1 evaluates candidates on all available cores; the scores are the
# same as with a serial search.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model. Create the target folder first so a missing directory does
# not raise FileNotFoundError after the whole search has already run.
os.makedirs('./checkpoint', exist_ok=True)
joblib.dump(best_model, './checkpoint/svm_best_model.pkl')


Best Parameters: {'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}
Best Score: 0.7510738671632526
              precision    recall  f1-score   support

           0       0.86      0.65      0.74       127
           1       0.65      0.86      0.74        97

    accuracy                           0.74       224
   macro avg       0.75      0.75      0.74       224
weighted avg       0.77      0.74      0.74       224



# *KNN*

In [None]:
# Tune a k-nearest-neighbours classifier with an exhaustive grid search
# (5-fold cross-validation on the training split, scored by accuracy), report
# the best configuration, evaluate it on the held-out test split and persist
# the refitted best estimator.
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 20],
    'weights': ['uniform', 'distance'],
    # 'minkowski' is intentionally not listed: with KNeighborsClassifier's
    # default p=2 it is the same distance as 'euclidean', so it would only
    # repeat those candidates.
    'metric': [
        'euclidean', 'manhattan', 'chebyshev', 'cosine', 'hamming'
    ]
}

# n_jobs=-1 evaluates candidates on all available cores; the scores are the
# same as with a serial search.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Train the model
grid_search.fit(X_train, y_train)

# Best result
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model. Create the target folder first so a missing directory does
# not raise FileNotFoundError after the whole search has already run.
os.makedirs('./checkpoint', exist_ok=True)
joblib.dump(best_model, './checkpoint/knn_best_model.pkl')

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best Score: 0.7377219118559901
              precision    recall  f1-score   support

           0       0.81      0.65      0.72       127
           1       0.64      0.79      0.71        97

    accuracy                           0.71       224
   macro avg       0.72      0.72      0.71       224
weighted avg       0.73      0.71      0.72       224



# *Random Forest*

In [None]:
# Tune a random forest with an exhaustive grid search (5-fold cross-validation
# on the training split, scored by accuracy), report the best configuration,
# evaluate it on the held-out test split and persist the refitted best
# estimator.
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Set up GridSearchCV with 5-fold cross-validation. The grid has 216
# candidates (1080 fits), so n_jobs=-1 spreads them over all available cores;
# with the fixed random_state the scores match a serial search.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Train with GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and the best cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Bind the model selected by THIS search. Without this assignment
# `best_model` is whatever an earlier cell left behind, and the wrong
# estimator would be written to rf_best_model.pkl.
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
test_score = grid_search.score(X_test, y_test)
print("Test Score:", test_score)

# Per-class report, matching the SVM and KNN cells
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model. Create the target folder first so a missing directory does
# not raise FileNotFoundError after the whole search has already run.
os.makedirs('./checkpoint', exist_ok=True)
joblib.dump(best_model, './checkpoint/rf_best_model.pkl')

KeyboardInterrupt: 