<a href="https://colab.research.google.com/github/marco10507/ml-portfolio/blob/main/logistic_regression_1_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt

# Generate synthetic data
X, y = make_classification(n_samples=10000, n_features=10, n_classes=2, random_state=42)

Showing all classes in data

In [67]:
unique_values, counts = np.unique(y, return_counts=True)

for value, count in zip(unique_values, counts):
  print(f"{value}: {count} times")

0: 4988 times
1: 5012 times


In [68]:
x_scaled = StandardScaler().fit_transform(X)

pca = PCA(n_components=3)
x_pca = pca.fit_transform(x_scaled)

df = pd.DataFrame(data=x_pca, columns=["PC1", "PC2", "PC3"])
df["Target"] = y

fig_2d = px.scatter(df, x="PC1", y="PC2", color="Target", title="2 PCA")

fig_2d.show()

fig_3d = px.scatter_3d(df, x="PC1", y="PC2", z="PC3", color="Target", title="3 PCA")

fig_3d.show()

Conduct Logistic Regression with L2 regularization while transforming features into polynomials.



In [69]:
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

model = make_pipeline(PolynomialFeatures(), LogisticRegression())
hyperparameter_candicates = {
    "polynomialfeatures__degree": [1,2,3],
    "logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "logisticregression__solver": ["liblinear"],
    "logisticregression__penalty": ["l1", "l2"]
}

grid_search = GridSearchCV(model, hyperparameter_candicates, cv=5, scoring="accuracy")

grid_search.fit(X_train, y_train)

In [70]:
print("Best hyperparameters:", grid_search.best_params_)
print("Model accurary on training data:", grid_search.best_score_)

accurary = grid_search.score(X_test, y_test)

print("Model accuracy on test data", accurary)

results = grid_search.cv_results_

mean_test_scores = results['mean_test_score']

hyperparameter_strings = [
    ("_".join(str(value) for value in params.values()))
    for params in results["params"]
]


df = pd.DataFrame({"x": hyperparameter_strings, "y": mean_test_scores})

fig = px.line(df, x="x", y="y", title="Mean Test Scores", labels={"y": "Accuraccy", "x": "hyperparameters"}, markers=True)

fig.show()

Best hyperparameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear', 'polynomialfeatures__degree': 3}
Model accurary on training data: 0.9274999999999999
Model accuracy on test data 0.937
