In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN, OPTICS, AgglomerativeClustering
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.datasets import make_blobs, make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, silhouette_score, calinski_harabasz_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# Generate sample regression data
X_reg, y_reg = make_blobs(n_samples=300, centers=1, cluster_std=1.0, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Train regression model
regressor = LinearRegression().fit(X_reg_train, y_reg_train)
y_pred_reg = regressor.predict(X_reg_test)

# 5.1. Metrics
# Regression Metrics
mae = mean_absolute_error(y_reg_test, y_pred_reg)  # Mean Absolute Error
mse = mean_squared_error(y_reg_test, y_pred_reg)  # Mean Squared Error
r2 = r2_score(y_reg_test, y_pred_reg)  # R-squared
print(f"Regression Metrics:\nMAE: {mae:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}")

# Generate sample classification data
X_clf, y_clf = make_classification(n_samples=300, n_features=5, random_state=42)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

# Train classification model
classifier = RandomForestClassifier().fit(X_clf_train, y_clf_train)
y_pred_clf = classifier.predict(X_clf_test)
y_pred_proba = classifier.predict_proba(X_clf_test)[:, 1]

# Classification Metrics
accuracy = accuracy_score(y_clf_test, y_pred_clf)  # Accuracy
precision = precision_score(y_clf_test, y_pred_clf)  # Precision
recall = recall_score(y_clf_test, y_pred_clf)  # Recall
f1 = f1_score(y_clf_test, y_pred_clf)  # F1 Score
roc_auc = roc_auc_score(y_clf_test, y_pred_proba)  # ROC-AUC
conf_matrix = confusion_matrix(y_clf_test, y_pred_clf)  # Confusion Matrix
print(f"Classification Metrics:\nAccuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")

# Generate clustering data
X_clust, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.0, random_state=42)
clustering_model = KMeans(n_clusters=3, random_state=42).fit(X_clust)
labels = clustering_model.labels_

# Clustering Metrics
silhouette = silhouette_score(X_clust, labels)  # Silhouette Score
calinski = calinski_harabasz_score(X_clust, labels)  # Calinski-Harabasz Score

print(f"Clustering Metrics:\nSilhouette Score: {silhouette:.4f}, Calinski-Harabasz Score: {calinski:.4f}")

# 5.2. Hyperparameter Optimization
# Grid Search
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5).fit(X_clf_train, y_clf_train)
print(f"Best Parameters (GridSearchCV): {grid_search.best_params_}")

# Randomized Search
param_dist = {'n_estimators': np.arange(10, 100, 10), 'max_depth': [None, 10, 20]}
random_search = RandomizedSearchCV(RandomForestClassifier(), param_dist, n_iter=5, cv=5, random_state=42).fit(X_clf_train, y_clf_train)
print(f"Best Parameters (RandomizedSearchCV): {random_search.best_params_}")


Regression Metrics:
MAE: 0.0000, MSE: 0.0000, R²: 1.0000
Classification Metrics:
Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1: 1.0000, ROC-AUC: 1.0000
Confusion Matrix:
[[27  0]
 [ 0 33]]
Clustering Metrics:
Silhouette Score: 0.8480, Calinski-Harabasz Score: 5196.2951
Best Parameters (GridSearchCV): {'max_depth': 20, 'n_estimators': 10}
Best Parameters (RandomizedSearchCV): {'n_estimators': 40, 'max_depth': 20}
