In [1]:
# Import necessary libraries
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Breast Cancer dataset: `X` is the feature matrix, `y` the binary labels (0 or 1).
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline model: logistic regression with default settings, trained on the scaled data.
log_reg = LogisticRegression().fit(X_train, y_train)

# Predict the held-out labels and score the predictions against the truth.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report: accuracy to four decimals, then each heading followed by its table.
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


Accuracy: 0.9737
Confusion Matrix:
[[41  2]
 [ 1 70]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [2]:
# Second attempt: the same model family with hand-picked hyperparameters.
log_reg = LogisticRegression(
    penalty='l2',       # L2 regularization; 'lbfgs' does not accept 'l1' or 'elasticnet'
    C=0.1,              # Inverse regularization strength: smaller C = stronger regularization
    solver='lbfgs',     # Optimizer; 'liblinear', 'newton-cg' and 'saga' are other options
    max_iter=200,       # Iteration cap (raise it if the solver does not converge)
    random_state=42,
)
log_reg.fit(X_train, y_train)

# Score the re-trained model on the same held-out test split.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report: accuracy to four decimals, then each heading followed by its table.
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


Accuracy: 0.9825
Confusion Matrix:
[[41  2]
 [ 0 71]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [3]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every combination of the values below is evaluated
# (5 x 2 x 3 x 3 = 90 candidates, i.e. 450 fits with 5-fold cross-validation).
# The grid must be defined in this cell: GridSearchCV below reads `param_grid`,
# and without this definition the cell raises NameError on a fresh kernel.
#
# NOTE: because this is a full cross product it also contains 'l1' + 'lbfgs',
# which LogisticRegression rejects ('l1' works only with 'liblinear' or 'saga').
# Those candidates fail to fit; with the default error_score they are scored as
# NaN and reported in a warning instead of aborting the search.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],              # Different values of regularization strength
    'penalty': ['l1', 'l2'],                   # L1 and L2 regularization
    'solver': ['liblinear', 'lbfgs', 'saga'],  # Different solvers
    'max_iter': [100, 200, 500]                # Maximum number of iterations
}

# Base estimator; the search overrides the parameters listed in the grid.
# random_state pins the data shuffling used by the 'liblinear' and 'saga' solvers
# so that repeated runs give the same fitted models.
log_reg = LogisticRegression(random_state=42)

# Exhaustive search with 5-fold cross-validation, scored by accuracy, on all cores.
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Run the search on the (scaled) training split only.
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Predict the held-out test labels through the fitted search object.
y_pred = grid_search.predict(X_test)

# Evaluate the selected model on the test split.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score:.4f}")
print(f"Test Set Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Score: 0.9780
Test Set Accuracy: 0.9912
Confusion Matrix:
[[42  1]
 [ 0 71]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



75 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 67, in _check_solver
   

In [4]:
from sklearn.model_selection import GridSearchCV

# Search space as a list of grids, so that each penalty is paired only with
# solvers that accept it:
#   - 'l2' with 'lbfgs', 'liblinear' or 'saga'
#   - 'l1' with 'liblinear' or 'saga'
param_grid = [
    {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear', 'saga'], 'max_iter': [100, 200, 500]},
    {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'max_iter': [100, 200, 500]}
]

# Base estimator; the search overrides the parameters listed in the grids.
# random_state pins the data shuffling used by the 'liblinear' and 'saga' solvers
# so that repeated runs of the search give the same fitted models.
log_reg = LogisticRegression(random_state=42)

# Exhaustive search with 5-fold cross-validation, scored by accuracy, on all cores.
# error_score='raise' makes a failing candidate abort the search with its error
# instead of being scored as NaN.
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1, error_score='raise')

# Run the search on the (scaled) training split only.
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Predict the held-out test labels through the fitted search object.
y_pred = grid_search.predict(X_test)

# Evaluate the selected model on the test split.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score:.4f}")
print(f"Test Set Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Score: 0.9780
Test Set Accuracy: 0.9912
Confusion Matrix:
[[42  1]
 [ 0 71]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [7]:
# NOTE: the text below is a pasted copy of printed results (two identical copies
# separated by a bare "#"), not Python code. It is kept as comments so that this
# cell stays valid Python and does not raise a SyntaxError when the notebook is run.
#
# Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
# Best Cross-Validation Score: 0.9780
# Test Set Accuracy: 0.9912
# Confusion Matrix:
# [[42  1]
#  [ 0 71]]
# Classification Report:
#               precision    recall  f1-score   support
#
#            0       1.00      0.98      0.99        43
#            1       0.99      1.00      0.99        71
#
#     accuracy                           0.99       114
#    macro avg       0.99      0.99      0.99       114
# weighted avg       0.99      0.99      0.99       114
#
# Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
# Best Cross-Validation Score: 0.9780
# Test Set Accuracy: 0.9912
# Confusion Matrix:
# [[42  1]
#  [ 0 71]]
# Classification Report:
#               precision    recall  f1-score   support
#
#            0       1.00      0.98      0.99        43
#            1       0.99      1.00      0.99        71
#
#     accuracy                           0.99       114
#    macro avg       0.99      0.99      0.99       114
# weighted avg       0.99      0.99      0.99       114

In [10]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Distributions / candidate lists that RandomizedSearchCV samples from.
param_distributions = {
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. C in [0.01, 100.01]
    'C': stats.uniform(0.01, 100),
    'penalty': ['l1', 'l2'],             # Randomly pick between L1 and L2 regularization
    'solver': ['liblinear', 'saga'],     # Solvers that support both penalties
    'max_iter': [100, 200, 500, 1000]    # Maximum number of iterations
}

# Base estimator; the search overrides the sampled parameters.
# Both candidate solvers ('liblinear', 'saga') shuffle the data, so random_state
# is pinned here to make each individual fit repeatable. The random_state passed
# to RandomizedSearchCV below only controls which parameter settings are drawn.
log_reg = LogisticRegression(random_state=42)

# Randomized search over the distributions above.
random_search = RandomizedSearchCV(
    log_reg,
    param_distributions,
    n_iter=100,               # Number of parameter settings to sample
    cv=5,                     # 5-fold cross-validation
    scoring='accuracy',       # Evaluate by accuracy
    n_jobs=-1,                # Use all available cores
    random_state=42           # Reproducible sampling of parameter settings
)

# Run the search on the (scaled) training split only.
random_search.fit(X_train, y_train)

# Best sampled setting and its mean cross-validated accuracy.
best_params = random_search.best_params_
best_score = random_search.best_score_

# Predict the held-out test labels through the fitted search object.
y_pred = random_search.predict(X_test)

# Evaluate the selected model on the test split.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Best Hyperparameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score:.4f}")
print(f"Test Set Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Best Hyperparameters: {'C': 15.609452033620265, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
Best Cross-Validation Score: 0.9780
Test Set Accuracy: 0.9825
Confusion Matrix:
[[42  1]
 [ 1 70]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.99      0.99      0.99        71

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114





In [9]:
  # random search
# NOTE: the text below is a pasted copy of printed results (labelled "random search"
# and "old" by the original notes), not Python code. It is kept as comments so that
# this cell stays valid Python and does not raise a SyntaxError when the notebook is run.
#
# Best Hyperparameters: {'C': 15.609452033620265, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
# Best Cross-Validation Score: 0.9780
# Test Set Accuracy: 0.9825
# Confusion Matrix:
# [[42  1]
#  [ 1 70]]
# Classification Report:
#               precision    recall  f1-score   support
#
#            0       0.98      0.98      0.98        43
#            1       0.99      0.99      0.99        71
#
#     accuracy                           0.98       114
#    macro avg       0.98      0.98      0.98       114
# weighted avg       0.98      0.98      0.98       114
#
# /usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
#   warnings.warn(


                # old



# Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
# Best Cross-Validation Score: 0.9780
# Test Set Accuracy: 0.9912
# Confusion Matrix:
# [[42  1]
#  [ 0 71]]
# Classification Report:
#               precision    recall  f1-score   support
#
#            0       1.00      0.98      0.99        43
#            1       0.99      1.00      0.99        71
#
#     accuracy                           0.99       114
#    macro avg       0.99      0.99      0.99       114
# weighted avg       0.99      0.99      0.99       114


In [12]:

# Search space for Elastic Net regularization (a mix of L1 and L2 penalties).
param_distributions = {
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. C in [0.01, 100.01]
    'C': stats.uniform(0.01, 100),    # Inverse of regularization strength
    'penalty': ['elasticnet'],        # Use Elastic Net
    'solver': ['saga'],               # 'saga' supports Elastic Net
    'l1_ratio': stats.uniform(0, 1),  # Ratio between L1 and L2 regularization, in [0, 1]
    'max_iter': [1000]                # Set max_iter to allow convergence
}

# Base estimator; the search overrides the sampled parameters.
# 'saga' shuffles the data, so random_state is pinned here to make each individual
# fit repeatable. The random_state passed to RandomizedSearchCV below only controls
# which parameter settings are drawn.
log_reg = LogisticRegression(random_state=42)

# Set up the Randomized Search
random_search = RandomizedSearchCV(
    log_reg,
    param_distributions,
    n_iter=100,  # Number of random combinations to try
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    verbose=1,  # Print a one-line summary of the number of fits
    random_state=42,  # Reproducible sampling of parameter settings
    n_jobs=-1  # Use all available cores
)

# Run the search on the (scaled) training split only.
random_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Best cross-validation score
print("Best Cross-Validation Score:", random_search.best_score_)

# Predict on the test set with the best estimator found by the search
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", test_accuracy)

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'C': 5.818361216819946, 'l1_ratio': 0.8661761457749352, 'max_iter': 1000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best Cross-Validation Score: 0.9780219780219781
Test Set Accuracy: 0.9736842105263158
Confusion Matrix:
[[42  1]
 [ 2 69]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.99      0.97      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



