In [1]:
import os

import numpy as np
import cv2 as cv

# Optional acceleration: enable Intel's scikit-learn extension when it is
# installed. This runs before the sklearn imports below so the patch is in
# place when they are resolved. Only a missing package is tolerated; a bare
# `except:` would also hide KeyboardInterrupt and genuine patching errors.
try:
    from sklearnex import patch_sklearn
    patch_sklearn()
except ImportError:
    pass

from sklearn.feature_selection import r_regression, chi2, f_classif, SelectKBest
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss

from skimage.feature import hog

try:
    %load_ext autotime
except:
    pass


time: 267 µs (started: 2023-06-03 19:55:17 +08:00)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
N_TRAIN = 12500
# Length of one flattened HOG descriptor for a 256x256 image with the settings
# used below: (256/8 - 3 + 1)^2 block positions * 3*3 cells * 9 orientations.
HOG_DIM = 72900


def hog_features(path):
    """Return the flattened HOG descriptor of the image stored at ``path``.

    The image is read with OpenCV, resized to 256x256 and passed to
    ``skimage.feature.hog`` with the settings shared by both classes.

    Args:
        path: location of the image file.

    Returns:
        The 1-D feature vector produced by ``hog`` (``feature_vector=True``).

    Raises:
        FileNotFoundError: if OpenCV could not read the file. ``cv.imread``
            signals that by returning ``None`` rather than raising, which
            would otherwise surface as an opaque error inside ``cv.resize``.
    """
    image = cv.imread(path)
    if image is None:
        raise FileNotFoundError(f"cannot read image: {path}")
    return hog(cv.resize(image, (256, 256)),
               orientations=9,  # bin gradient angles into 9 directions
               pixels_per_cell=(8, 8),  # pixels in each cell
               cells_per_block=(3, 3),  # cells in each block
               block_norm='L2-Hys',  # normalization method within a block
               visualize=False,  # do not return the visualization image
               transform_sqrt=False,  # no sqrt preprocessing of the image
               feature_vector=True,  # flatten the features to 1-D
               channel_axis=-1  # colour channels are the last axis (256*256*3)
               )


# Reuse the cached descriptors when both files exist; otherwise compute them
# image by image and write the caches.
if os.path.isfile("cat_hog.npy") and os.path.isfile("dog_hog.npy"):
    cat = np.load("cat_hog.npy")
    dog = np.load("dog_hog.npy")
else:
    # Widget imports are only needed on a cache miss.
    from ipywidgets import IntProgress
    from IPython.display import display

    progress = IntProgress(max=N_TRAIN)
    display(progress)

    features = {}
    for label in ("cat", "dog"):
        progress.description = label.capitalize()
        progress.value = 0
        # Stored as float32; assigning a row casts the descriptor.
        rows = np.empty((N_TRAIN, HOG_DIM), dtype=np.float32)
        for i in range(N_TRAIN):
            rows[i] = hog_features(f"train/{label}.{i}.jpg")
            progress.value = i + 1  # count finished images so the bar fills
        features[label] = rows

    cat, dog = features["cat"], features["dog"]
    progress.description = "Done"
    np.save("cat_hog.npy", cat)
    np.save("dog_hog.npy", dog)


IntProgress(value=0, max=12500)

time: 11min 38s (started: 2023-06-03 17:26:57 +08:00)


In [3]:
N_TRAIN = 12500

# Parallel variant of the HOG extraction: same cache files, but the images are
# processed by joblib workers instead of a serial loop.
if os.path.isfile("cat_hog.npy") and os.path.isfile("dog_hog.npy"):
    cat = np.load("cat_hog.npy")
    dog = np.load("dog_hog.npy")
else:
    from joblib import Parallel, delayed

    def f(x: bool, i: int):
        """Return the flattened HOG descriptor of one training image.

        Args:
            x: class selector; ``False`` reads ``train/cat.{i}.jpg`` and
                ``True`` reads ``train/dog.{i}.jpg``.
            i: index of the image within its class.

        Raises:
            FileNotFoundError: if OpenCV could not read the file
                (``cv.imread`` returns ``None`` instead of raising).
        """
        path = f"train/{'dog' if x else 'cat'}.{i}.jpg"
        image = cv.imread(path)
        if image is None:
            raise FileNotFoundError(f"cannot read image: {path}")
        return hog(cv.resize(image, (256, 256)),
                   orientations=9,  # bin gradient angles into 9 directions
                   pixels_per_cell=(8, 8),  # pixels in each cell
                   cells_per_block=(3, 3),  # cells in each block
                   block_norm='L2-Hys',  # normalization method within a block
                   visualize=False,  # do not return the visualization image
                   transform_sqrt=False,  # no sqrt preprocessing of the image
                   feature_vector=True,  # flatten the features to 1-D
                   channel_axis=-1  # colour channels are the last axis (256*256*3)
                   )

    # The serial extraction cell stores descriptors in float32 arrays; cast
    # here as well so both code paths write identical cache files.
    cat = np.array(Parallel(n_jobs=-1)(delayed(f)(False, i) for i in range(N_TRAIN)),
                   dtype=np.float32)
    dog = np.array(Parallel(n_jobs=-1)(delayed(f)(True, i) for i in range(N_TRAIN)),
                   dtype=np.float32)
    np.save("cat_hog.npy", cat)
    np.save("dog_hog.npy", dog)


time: 1min 21s (started: 2023-06-03 17:38:36 +08:00)


In [2]:
N_TRAIN = 12500

# Read the cached HOG descriptors of each class from disk.
cat = np.load("cat_hog.npy")
dog = np.load("dog_hog.npy")

# Training matrix: all cat rows followed by all dog rows.
x_train = np.concatenate((cat, dog), axis=0)
# Labels in the same order: N_TRAIN zeros (cat) then N_TRAIN ones (dog).
y_train = np.repeat(np.array([0, 1], dtype=int), N_TRAIN)

time: 11.3 s (started: 2023-06-03 19:55:23 +08:00)


In [3]:
from joblib import parallel_backend

# Number of feature columns each selector keeps (729*69 = 50301).
N_SELECTED = 729*69


def _select_or_load(score_func, cache_file):
    """Return the k-best columns of ``x_train`` for ``score_func``.

    If ``cache_file`` exists, its contents are loaded and returned. Otherwise
    ``SelectKBest(score_func, k=N_SELECTED)`` is fitted on
    ``(x_train, y_train)``, and the transformed matrix is saved to
    ``cache_file`` and returned. Only the transformed matrix is persisted,
    not the fitted selector.
    """
    if os.path.exists(cache_file):
        return np.load(cache_file)
    selected = SelectKBest(score_func, k=N_SELECTED).fit_transform(x_train, y_train)
    np.save(cache_file, selected)
    return selected


with parallel_backend("loky", n_jobs=-1):
    # NOTE(review): r_regression scores are signed, so top-k ranks strongly
    # negative correlations last -- confirm this is intended.
    x_train_pearson = _select_or_load(r_regression, "pearson.npy")
    x_train_chi_square = _select_or_load(chi2, "chi_square.npy")
    x_train_anova = _select_or_load(f_classif, "anova.npy")

time: 12.3 s (started: 2023-06-03 19:55:38 +08:00)


In [4]:
from joblib import load, dump


def _load_or_fit(cache_file, features):
    """Return the classifier cached in ``cache_file``, fitting it if absent.

    Args:
        cache_file: joblib file holding a previously fitted model.
        features: training matrix whose rows align with ``y_train``.

    Returns:
        The loaded model, or a newly fitted unpenalized ``LogisticRegression``
        (default solver) that has also been dumped to ``cache_file``.
    """
    if os.path.exists(cache_file):
        # joblib.load unpickles the file; only load caches this notebook wrote.
        return load(cache_file)
    clf = LogisticRegression(penalty=None)
    clf.fit(features, y_train)
    dump(clf, cache_file)
    return clf


# Each result is bound to its own name on BOTH the cache-hit and the fit path.
# Previously a cache hit assigned the feature-selected models to `model`,
# clobbering the baseline and leaving model_pearson / model_chi_square /
# model_anova undefined.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit; hyperparameters are left unchanged here so existing caches stay
# comparable -- consider max_iter or feature scaling separately.
with parallel_backend("loky", n_jobs=-1):
    model = _load_or_fit("model.joblib", x_train)
    print("Pearson")
    model_pearson = _load_or_fit("model_pearson.joblib", x_train_pearson)
    print("Chi-square")
    model_chi_square = _load_or_fit("model_chi_square.joblib", x_train_chi_square)
    print("ANOVA")
    model_anova = _load_or_fit("model_anova.joblib", x_train_anova)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pearson


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Chi-square


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ANOVA
time: 7min 44s (started: 2023-06-03 19:56:00 +08:00)
