<a href="https://colab.research.google.com/github/marco10507/ml-portfolio/blob/main/logistic_regression_1_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt


# Build a reproducible synthetic binary-classification dataset
# (10,000 samples, 10 features, fixed seed).
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

# Tally how many samples carry each class label to check the class balance.
unique_values, counts = np.unique(y, return_counts=True)

print("classes")
for position in range(len(unique_values)):
  print(f"{unique_values[position]}: {counts[position]} times")

classes
0: 4988 times
1: 5012 times


In [2]:
# Standardize every feature (zero mean, unit variance) before running PCA.
x_scaled = StandardScaler().fit_transform(X)

# Project onto the first three principal components, purely for visualization.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(x_scaled)

df = pd.DataFrame(data=x_pca, columns=["PC1", "PC2", "PC3"])
df["Target"] = y

# Plotly Express maps a numeric color column to a continuous color scale.
# The class label is categorical, so plot from a copy whose Target is a string
# to get one discrete color per class; `df` itself keeps the integer labels.
plot_df = df.assign(Target=df["Target"].astype(str))
class_order = {"Target": sorted(plot_df["Target"].unique())}

fig_2d = px.scatter(
    plot_df,
    x="PC1",
    y="PC2",
    color="Target",
    category_orders=class_order,
    title="PCA projection: first two principal components",
)

fig_2d.show()

fig_3d = px.scatter_3d(
    plot_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Target",
    category_orders=class_order,
    title="PCA projection: first three principal components",
)

fig_3d.show()

In [3]:
# logistic regression
# Split the raw features first, then fit the scaler on the training rows only,
# so no statistics from the held-out test rows leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
degrees = 4

# Mean accuracy across folds, one entry per polynomial degree (1..degrees).
avg_training_accuracies = []
avg_validation_accuracies = []

# NOTE(review): the scaler above is fitted on all of X_train, so each validation
# fold still shares scaling statistics with its training folds. For strictly
# leak-free CV estimates, wrap scaler + polynomial expansion + model in a
# sklearn Pipeline and cross-validate that instead.
for degree in range(1, degrees + 1):

  training_accuracies = []
  validation_accuracies = []

  # Expand the scaled training features once per degree; folds slice the result.
  poly_features = PolynomialFeatures(degree=degree)
  x_poly = poly_features.fit_transform(X_train)

  for train_index, val_index in kf.split(x_poly, y_train):
    x_cv_train, x_cv_val = x_poly[train_index], x_poly[val_index]
    y_cv_train, y_cv_val = y_train[train_index], y_train[val_index]

    # A fresh model per fold so nothing carries over between folds.
    model = LogisticRegression(max_iter=1000, solver="liblinear", C=0.1, penalty="l2")

    model.fit(x_cv_train, y_cv_train)

    y_train_pred = model.predict(x_cv_train)
    y_val_pred = model.predict(x_cv_val)

    training_accuracy = accuracy_score(y_cv_train, y_train_pred)
    validation_accuracy = accuracy_score(y_cv_val, y_val_pred)

    training_accuracies.append(training_accuracy)
    validation_accuracies.append(validation_accuracy)

  # Average over the folds actually evaluated for this degree.
  avg_training_accuracy = sum(training_accuracies) / len(training_accuracies)
  avg_validation_accuracy = sum(validation_accuracies) / len(validation_accuracies)

  avg_training_accuracies.append(avg_training_accuracy)
  avg_validation_accuracies.append(avg_validation_accuracy)

In [4]:
# Plot mean training vs. validation accuracy for each polynomial degree.
x_values = list(range(1, degrees + 1))

metrics_data = pd.DataFrame({"x": x_values, "training_accuracy": avg_training_accuracies, "validation_accuracy": avg_validation_accuracies})

# Passing a list as `y` makes Plotly Express treat the frame as wide-form: the
# plotted values are labelled "value" and the series name "variable", so those
# are the keys that rename the y-axis title and the legend title.
fig = px.line(
    metrics_data,
    x="x",
    y=["training_accuracy", "validation_accuracy"],
    labels={"x": "degrees", "value": "accuracy", "variable": "metric"},
)

# Pad the y-axis slightly beyond the observed extremes so the lines do not sit
# on the plot border.
lower_range = min(min(avg_training_accuracies), min(avg_validation_accuracies)) - 0.0005
upper_range = max(max(avg_training_accuracies), max(avg_validation_accuracies)) + 0.0005


fig.update_layout(yaxis=dict(range=[lower_range, upper_range], dtick=0.005), xaxis=dict(dtick=1))

fig.show()