In [None]:
# =========================================
# Machine Learning: Conventional vs Advanced Models
# =========================================

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ------------------------------
# Dataset: Titanic for classification
# ------------------------------
# Drop passengers missing any modelling column, then encode on a copy so the
# filtered `titanic` frame keeps its original string/category values.
titanic = sns.load_dataset("titanic").dropna(subset=["age", "fare", "sex", "class", "survived"])
df = titanic.copy()
df["sex"] = df["sex"].map({"male": 0, "female": 1})
# NOTE(review): seaborn is understood to ship `class` as a pandas categorical,
# and mapping a categorical keeps a categorical dtype. Cast to plain integers
# so the feature matrix is purely numeric instead of relying on the estimator
# to coerce a mixed-dtype frame. The encoded values (1/2/3) are unchanged.
df["class"] = df["class"].map({"First": 1, "Second": 2, "Third": 3}).astype(int)

# Four numeric predictors; binary target.
X = df[["age", "fare", "sex", "class"]]
y = df["survived"]

# 70/30 hold-out split, seeded so every model below sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ------------------------------
# Part 1: Conventional Models
# ------------------------------
# Reference model: logistic regression on the four features. The later
# sections report accuracy on the same held-out split for comparison.
print("\n=== Logistic Regression (Baseline) ===")
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Predictions are stored because the Part 4 comparison reads `y_pred_log`.
y_pred_log = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_log))

# ------------------------------
# Part 2: Regularization
# ------------------------------
# Two penalized logistic regressions: an L2 ("ridge") and an L1 ("LASSO") fit,
# each scored on the held-out split.
# NOTE(review): penalty="l2", C=1.0, solver="lbfgs" look like scikit-learn's
# LogisticRegression defaults; if so, this model is the same as the Part 1
# baseline and will report the same accuracy. Confirm against the installed
# version before drawing conclusions from the comparison.
print("\n=== Ridge Logistic Regression ===")
ridge = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="lbfgs",
    max_iter=1000,
).fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=ridge.predict(X_test)))

print("\n=== LASSO Logistic Regression ===")
# The L1 fit uses the liblinear solver rather than lbfgs.
lasso = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
).fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=lasso.predict(X_test)))

# ------------------------------
# Part 3: Advanced Tree Models
# ------------------------------
# Two tree ensembles, both seeded for repeatable fits and scored on the same
# held-out split as the linear models.
print("\n=== Random Forest ===")
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=rf.predict(X_test)))

print("\n=== Gradient Boosting ===")
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=gb.predict(X_test)))

# ------------------------------
# Part 4: Compare Models
# ------------------------------
# Collect every model's test accuracy in one mapping; insertion order is the
# left-to-right order of the bars.
results = {"Logistic": accuracy_score(y_test, y_pred_log)}
results.update(
    (label, accuracy_score(y_test, fitted.predict(X_test)))
    for label, fitted in [
        ("Ridge", ridge),
        ("LASSO", lasso),
        ("Random Forest", rf),
        ("Gradient Boosting", gb),
    ]
)

# One bar per model.
pd.Series(results).plot(
    kind="bar",
    figsize=(8, 4),
    title="Model Accuracy Comparison",
)
plt.show()

# ------------------------------
# Part 5: Unsupervised Learning
# ------------------------------
# K-Means (Euclidean distance) and PCA (variance) are both scale-sensitive.
# The features mix raw age/fare values with a 0/1 sex code and a 1-3 class
# code, so without standardization the largest-scale column would drive both
# results. Standardize once (zero mean, unit variance per column) and reuse it.
# The float cast guards against non-numeric pandas dtypes in X.
X_scaled = StandardScaler().fit_transform(X.astype(float))

print("\n=== K-Means Clustering (unsupervised) ===")
# n_init is pinned explicitly: the library default is understood to differ
# between scikit-learn releases, which would make the clustering (and any
# deprecation warning) depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Clusters are fitted on all four standardized features but displayed on the
# original age/fare axes for readability.
df["cluster"] = clusters
sns.scatterplot(x="age", y="fare", hue="cluster", data=df, palette="Set1")
plt.title("K-Means Clustering of Titanic Passengers")
plt.show()

print("\n=== PCA (dimensionality reduction) ===")
# Project the standardized features onto the first two principal components
# and colour each passenger by the survival label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="coolwarm", alpha=0.7)
plt.title("PCA Projection Colored by Survival")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# ------------------------------
# Mission Task
# ------------------------------
# 1. Add Probit Regression (use statsmodels).
# 2. Try Ridge & LASSO regression on mpg dataset (predict 'mpg').
# 3. Compare Random Forest vs Gradient Boosting performance on classification.
# 4. Experiment with K-Means (different k values).
# 5. Visualize PCA components with different datasets.
