In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the semicolon-separated dataset
df = pd.read_csv('data.csv', sep=';')

# Target: the 'Class' column
y = df['Class']

# Features: every remaining column except 'Output'
# NOTE(review): 'Output' is presumably an alternative (e.g. regression) target — confirm
X = df.drop(columns=['Class', 'Output'])

In [15]:
# Logistic regression classifier; fit() returns the estimator itself, so
# construction and fitting are chained. Note that it is fitted on ALL rows of X / y.
logistic_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
).fit(X, y)

# In-sample predictions: hard labels, plus column 1 of the predict_proba output
y_train_pred = logistic_model.predict(X)
y_train_pred_proba = logistic_model.predict_proba(X)[:, 1]

# In-sample metrics — evaluated on the same rows the model was fitted on
train_accuracy = accuracy_score(y, y_train_pred)
cm_train = confusion_matrix(y, y_train_pred)

# Report
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print("\nConfusion Matrix (Training Data):")
print(cm_train)

Training Accuracy: 0.7100 (71.00%)

Confusion Matrix (Training Data):
[[691 296]
 [284 729]]


In [18]:
# 5-fold cross-validation; shuffling with a fixed seed keeps the fold assignment reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy on each held-out fold (one score per fold), and their mean
cv_scores = cross_val_score(logistic_model, X, y, cv=kf, scoring='accuracy')
cv_scores_mean = cv_scores.mean()

# Compare in-sample vs cross-validated performance.
# NOTE: `train_accuracy` comes from the model-fitting cell, which must have been run first.
print("\nPerformance Comparison:")
print(f"Training Accuracy:     {train_accuracy:.4f}")
print(f"CV Mean Accuracy:      {cv_scores_mean:.4f}")
print(f"Difference:            {train_accuracy - cv_scores_mean:.4f}")


Performance Comparison:
Training Accuracy:     0.7100
CV Mean Accuracy:      0.5170
Difference:            0.1930


### Comment

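Just like in linear regression, a large gap between training accuracy and cross-validated (held-out) accuracy suggests overfitting, i.e. the model generalizes poorly beyond the training data. Here the gap is about 19 percentage points (0.7100 vs. 0.5170), and the cross-validated accuracy is barely above the ≈0.51 that always predicting the majority class would achieve on this nearly balanced dataset (987 vs. 1013 samples).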