**<font color=skyblue>使用 LogisticRegression 與 LogisticRegressionCV 進行分類</font>**

Data: Wine.xlsx

<font color=yellow>Prepare Data</font>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read data
df = pd.read_excel('data/Wine.xlsx')
X = np.array(df.iloc[:, :-1]) # 排除最後一欄標籤 N x p
y = np.array(df.iloc[:, -1])

# Split data into training and testing sets
# 70% training, 30% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30)

# Standardize data
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X_train) # 標準化訓練資料
X_test_ = scaler.fit_transform(X_test) # 標準化測試資料

# print the numbers of training and testing samples
print(f"Training samples: {X_train_.shape[0]}")
print(f"Testing samples: {X_test_.shape[0]}")
# print the shape of the data
print(f"Training data shape: {X_train_.shape}")
print(f"Testing data shape: {X_test_.shape}")

Training samples: 124
Testing samples: 54
Training data shape: (124, 13)
Testing data shape: (54, 13)


<font color=yellow>Use original data to train the machine by</font>

- LogisticRegression
- LogisticRegressionCV: LogisticRegression with Cross Validation

In [2]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report

# clf_original = LogisticRegression(solver = 'lbfgs', tol = 1e-6, max_iter = int(1e6), verbose=True)
# setup parameters for LogisticRegression
opts = dict(tol = 1e-6, max_iter = int(1e6), verbose=0) # verbose=1 for more information
# --- Logistic Regression ---
solver = 'lbfgs'  # 'lbfgs' is the default
# solver = 'liblinear'
# solver = 'newton-cg'
clf_LR_original = LogisticRegression(solver = solver, **opts)
clf_LR_original.fit(X_train_, y_train) # input data must be (n_samples x n_features)
# --- Results Report ---
print(f"Logistic Regression with solver = {solver}")
# print training score
print(f"Training score = {accuracy_score(y_train, clf_LR_original.predict(X_train_)):.2%}\n")
# print testing score
y_pred = clf_LR_original.predict(X_test_)
# print(f"Testing score = {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Testint score = {clf_LR_original.score(X_test_, y_test):.2%}\n")
# print classification report
print("Classification report:")
print(classification_report(y_test, y_pred))

Logistic Regression with solver = lbfgs
Training score = 100.00%

Testint score = 87.04%

Classification report:
              precision    recall  f1-score   support

           1       0.62      1.00      0.76         8
           2       1.00      0.73      0.84        26
           3       0.91      1.00      0.95        20

    accuracy                           0.87        54
   macro avg       0.84      0.91      0.85        54
weighted avg       0.91      0.87      0.87        54



<font color=yellow>Use LogisticRegressionCV</font>

Cross Validation with respect to Cs parameters.

In [3]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score

opts = dict(tol = 1e-6, max_iter = int(1e6), verbose=0)
solver = 'lbfgs'  # 'lbfgs' is the default
# solver = 'liblinear'
# solver = 'newton-cg'
Cs = np.logspace(-5, 5, 20) # 20 values of C from 1e-5 to 1e5
cv = 5 # 5-fold cross-validation
# --- Logistic Regression with Cross Validation ---
clf_originalCV = LogisticRegressionCV(solver = solver, Cs = Cs, cv = cv, **opts) 
clf_originalCV.fit(X_train_, y_train) # input data must be (n_samples x n_features)
y_pred = clf_originalCV.predict(X_test_)

# --- Results Report ---
print(f"Logistic Regression with Cross Validation and solver = {solver}")
# print best C value
print(f"Best C = {clf_originalCV.C_}")
# print training score
print(f"Training score = {accuracy_score(y_train, clf_originalCV.predict(X_train_)):.2%}\n")
# print testing score
# print(f"Testing score = {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Testing score = {clf_originalCV.score(X_test_, y_test):.2%}\n")
print(classification_report(y_test, y_pred))

Logistic Regression with Cross Validation and solver = lbfgs
Best C = [0.16237767 0.16237767 0.16237767]
Training score = 100.00%

Testing score = 88.89%

              precision    recall  f1-score   support

           1       0.62      1.00      0.76         8
           2       1.00      0.77      0.87        26
           3       0.95      1.00      0.98        20

    accuracy                           0.89        54
   macro avg       0.86      0.92      0.87        54
weighted avg       0.93      0.89      0.89        54



<hr>
<font color=yellow>Use principal components to train the machine</font>

<font color=yellow>最佳組合選擇 by GridSearchCV</font>

CV: Cross Validation 用在 hyperparameter tuning

<font color=red>警告：資料量大時，很費時</font>

In [4]:
from sklearn.model_selection import  GridSearchCV, \
                        StratifiedShuffleSplit
from datetime import datetime

# Get the current date and time
now = datetime.now()
# Format the date and time as a string
now_str = now.strftime("%Y_%m_%d_%H_%M_%S")
results_file = 'data/results_' + now_str + '.csv'

opts = dict(tol = 1e-6, max_iter = int(1e6)) # parameters for LogisticRegression
parameters = {'solver':['lbfgs', 'liblinear', 'newton-cg', 'sag','saga'], \
              'C':[0.1, 1, 10]} # parameters for GridSearchCV
# parameters = {'solver':['lbfgs', 'liblinear', 'newton-cg',\
#                         'sag','saga']}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, \
                            random_state=0) # 5-fold CV
grid = GridSearchCV(estimator=LogisticRegression(**opts), \
                param_grid=parameters, cv=cv, 
                scoring=['accuracy','f1_macro'], refit="accuracy") 
grid.fit(X_train_, y_train)
# grid.fit(X, y)
cv_logistic = pd.DataFrame(data = grid.cv_results_)
# print the results
# print(cv_logistic.head())
# cv_logistic.to_csv(results_file) # 打開來觀察結果
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'C': 0.1, 'solver': 'liblinear'}
1.0
LogisticRegression(C=0.1, max_iter=1000000, solver='liblinear', tol=1e-06)
