In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read the preprocessed dataset produced in Assignment 1.
df = pd.read_csv('preprocessed_data.csv')

# The 'target_variable' column is the label; every other column is a feature.
y = df['target_variable']
X = df.drop(columns='target_variable')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fit on the training rows only and
# the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline linear regression: fit on the training split, predict the test split.
linear_reg = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)

# Baseline logistic regression, trained on the same features and target.
logistic_reg = LogisticRegression().fit(X_train, y_train)
y_pred_logistic = logistic_reg.predict(X_test)

# Check for overfitting/underfitting by comparing each model's score on the
# data it was trained on with its score on the held-out data.
print("Linear Regression")
linear_train_mse = mean_squared_error(y_train, linear_reg.predict(X_train))
linear_test_mse = mean_squared_error(y_test, y_pred_linear)
print("Train MSE:", linear_train_mse)
print("Test MSE:", linear_test_mse)

print("\nLogistic Regression")
logistic_train_accuracy = accuracy_score(y_train, logistic_reg.predict(X_train))
logistic_test_accuracy = accuracy_score(y_test, y_pred_logistic)
print("Train Accuracy:", logistic_train_accuracy)
print("Test Accuracy:", logistic_test_accuracy)

# Regularization: Ridge (L2) and Lasso (L1)
# Both models use alpha=1.0 and are fit on the same training split.
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)

lasso_reg = Lasso(alpha=1.0)
lasso_reg.fit(X_train, y_train)
y_pred_lasso = lasso_reg.predict(X_test)

# Report train and test MSE for the regularized models. Without this the
# predictions above are never used and the effect of the penalty cannot be
# compared with the unregularized linear model.
print("\nRidge Regression")
print("Train MSE:", mean_squared_error(y_train, ridge_reg.predict(X_train)))
print("Test MSE:", mean_squared_error(y_test, y_pred_ridge))

print("\nLasso Regression")
print("Train MSE:", mean_squared_error(y_train, lasso_reg.predict(X_train)))
print("Test MSE:", mean_squared_error(y_test, y_pred_lasso))

# Hyperparameter optimization: 5-fold cross-validated grid search over the
# logistic regression parameter C, scored by accuracy.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
log_reg_cv = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)
print("Best C for Logistic Regression:", log_reg_cv.best_params_)

# Plot learning curves
def plot_learning_curves(model, X_train, y_train, X_test, y_test, metric,
                         apply_sqrt=True):
    """Plot train and test error against the number of training rows used.

    For every prefix size m = 1 .. len(X_train), ``model`` is refit on the
    first m training rows and ``metric`` is evaluated on those m rows and on
    the full test set. Both curves are drawn with ``plt.plot`` on the current
    axes; creating the figure and calling ``plt.show()`` is left to the
    caller.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit
            in place once per prefix size.
        X_train: Training features; sliced with ``[:m]``.
        y_train: Training targets; sliced with ``[:m]``.
        X_test: Test features, always used in full.
        y_test: Test targets, always used in full.
        metric: Callable ``metric(y_true, y_pred)`` returning the error value
            to plot.
        apply_sqrt: When true (the default, matching the previous behaviour)
            the square root of each metric value is plotted, which turns an
            MSE into an RMSE. Pass ``False`` for metrics such as an error
            rate, where a square root is not meaningful.

    Raises:
        ValueError: Re-raised from ``model.fit`` when the model could not be
            fit on any prefix of the training data.
    """
    sizes, train_errors, test_errors = [], [], []
    last_fit_error = None

    # The upper bound is inclusive so the final point uses the whole
    # training set.
    for m in range(1, len(X_train) + 1):
        X_subset, y_subset = X_train[:m], y_train[:m]
        try:
            model.fit(X_subset, y_subset)
        except ValueError as exc:
            # Very small prefixes can be unusable for some models (for
            # example a classifier given targets from a single class). Skip
            # those sizes instead of aborting the whole curve.
            last_fit_error = exc
            continue
        sizes.append(m)
        train_errors.append(metric(y_subset, model.predict(X_subset)))
        test_errors.append(metric(y_test, model.predict(X_test)))

    # If every size failed, the problem is not a small-sample artefact:
    # surface the underlying error rather than drawing an empty plot.
    if not sizes and last_fit_error is not None:
        raise last_fit_error

    if apply_sqrt:
        train_curve, test_curve = np.sqrt(train_errors), np.sqrt(test_errors)
    else:
        train_curve, test_curve = train_errors, test_errors

    # Plot against the real prefix sizes so the x axis matches its label,
    # even when some sizes were skipped.
    plt.plot(sizes, train_curve, "r-+", linewidth=2, label="train")
    plt.plot(sizes, test_curve, "b-", linewidth=3, label="test")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("Error", fontsize=14)

# Draw one learning-curve panel per model, side by side in a single figure.
# Each entry is (estimator, error metric, panel title); the logistic panel
# uses the misclassification rate (1 - accuracy) as its error.
learning_curve_panels = [
    (
        LinearRegression(),
        mean_squared_error,
        "Linear Regression Learning Curves",
    ),
    (
        LogisticRegression(),
        lambda y_true, y_pred: 1 - accuracy_score(y_true, y_pred),
        "Logistic Regression Learning Curves",
    ),
]

plt.figure(figsize=(10, 5))

for panel_index, (estimator, error_metric, panel_title) in enumerate(
    learning_curve_panels, start=1
):
    plt.subplot(1, 2, panel_index)
    plot_learning_curves(estimator, X_train, y_train, X_test, y_test, error_metric)
    plt.title(panel_title)

plt.show()