In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# ============================================
# PART 1: Create Synthetic Dataset
# ============================================

np.random.seed(42)
n_samples = 1000

# Build base feature with controlled correlation
base_feature = np.random.randn(n_samples, 1)
features = np.zeros((n_samples, 7))

# Generate 7 interdependent features with minimal noise
for col_idx in range(7):
    small_noise = np.random.randn(n_samples, 1) * 0.3
    features[:, col_idx] = (base_feature + small_noise).flatten()

# Create regression target from features
coefficients = np.random.randn(7)
target = features @ coefficients + np.random.randn(n_samples) * 0.5

# Organize into DataFrame
col_names = [f'X{j+1}' for j in range(7)]
data = pd.DataFrame(features, columns=col_names)
data['target'] = target

print("Data Dimensions:", data.shape)
print("\nFeature Correlation Matrix:")
print(data[col_names].corr().round(3))

# ============================================
# PART 2: Custom Ridge Regression Class
# ============================================

class RidgeRegressionOptimizer:
    """Custom Ridge Regression using Stochastic Gradient Descent"""

    def __init__(self, learning_rate=0.01, lambda_reg=1.0,
                 n_iterations=1000, tolerance=1e-6):
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.n_iterations = n_iterations
        self.tolerance = tolerance
        self.weights = None
        self.bias = None
        self.cost_history = []

    def _ridge_cost(self, X, y, weights, bias):
        """Calculate cost: MSE + regularization penalty"""
        n = len(y)
        predictions = X @ weights + bias
        error = predictions - y

        mse_loss = (1 / (2 * n)) * np.sum(error ** 2)
        reg_penalty = (self.lambda_reg / (2 * n)) * np.sum(weights ** 2)

        return mse_loss + reg_penalty

    def fit(self, X, y):
        """Train model using gradient descent"""
        n, d = X.shape

        self.weights = np.zeros(d)
        self.bias = 0

        for step in range(self.n_iterations):
            # Compute predictions and errors
            predictions = X @ self.weights + self.bias
            residuals = predictions - y

            # Calculate gradients
            weight_gradient = (1 / n) * (X.T @ residuals) + \
                            (self.lambda_reg / n) * self.weights
            bias_gradient = (1 / n) * np.sum(residuals)

            # Update parameters
            self.weights -= self.learning_rate * weight_gradient
            self.bias -= self.learning_rate * bias_gradient

            # Track cost
            current_cost = self._ridge_cost(X, y, self.weights, self.bias)
            self.cost_history.append(current_cost)

            # Check for convergence
            if step > 0 and abs(self.cost_history[-2] - current_cost) < self.tolerance:
                break

        return self

    def predict(self, X):
        """Generate predictions"""
        return X @ self.weights + self.bias

# ============================================
# PART 3: Prepare and Split Data
# ============================================

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Scale features for stable gradient descent
normalizer = StandardScaler()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# ============================================
# PART 4: Systematic Hyperparameter Search
# ============================================

learning_rates = [0.0001, 0.001, 0.01, 0.1, 1, 10]
lambda_values = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

experiment_results = []

print("\n" + "="*80)
print("CONDUCTING GRID SEARCH OVER HYPERPARAMETERS")
print("="*80)

for lr in learning_rates:
    for lam in lambda_values:
        try:
            # Initialize and train
            regressor = RidgeRegressionOptimizer(
                learning_rate=lr,
                lambda_reg=lam,
                n_iterations=1000,
                tolerance=1e-8
            )
            regressor.fit(X_train_norm, y_train)

            # Generate predictions
            train_pred = regressor.predict(X_train_norm)
            test_pred = regressor.predict(X_test_norm)

            # Calculate metrics
            final_cost = regressor.cost_history[-1]
            test_error = mean_squared_error(y_test, test_pred)
            train_score = r2_score(y_train, train_pred)
            test_score = r2_score(y_test, test_pred)

            experiment_results.append({
                'learning_rate': lr,
                'lambda': lam,
                'train_cost': final_cost,
                'test_mse': test_error,
                'train_r2': train_score,
                'test_r2': test_score,
                'iterations': len(regressor.cost_history)
            })

        except:
            # Handle failed runs
            experiment_results.append({
                'learning_rate': lr,
                'lambda': lam,
                'train_cost': np.inf,
                'test_mse': np.inf,
                'train_r2': -np.inf,
                'test_r2': -np.inf,
                'iterations': 0
            })

results_table = pd.DataFrame(experiment_results)

# ============================================
# PART 5: Identify Optimal Parameters
# ============================================

valid_results = results_table[results_table['train_cost'] != np.inf]

# Best by minimum cost
min_cost_idx = valid_results['train_cost'].idxmin()
best_by_cost = valid_results.loc[min_cost_idx]

# Best by maximum R¬≤ on test data
max_r2_idx = valid_results['test_r2'].idxmax()
best_by_r2 = valid_results.loc[max_r2_idx]

print("\n" + "="*80)
print("OPTIMAL PARAMETERS (Minimum Cost)")
print("="*80)
print(f"Learning Rate: {best_by_cost['learning_rate']}")
print(f"Regularization Strength: {best_by_cost['lambda']}")
print(f"Training Cost: {best_by_cost['train_cost']:.6f}")
print(f"Test MSE: {best_by_cost['test_mse']:.6f}")
print(f"Train R¬≤: {best_by_cost['train_r2']:.6f}")
print(f"Test R¬≤: {best_by_cost['test_r2']:.6f}")
print(f"Steps: {int(best_by_cost['iterations'])}")

print("\n" + "="*80)
print("OPTIMAL PARAMETERS (Maximum Test R¬≤)")
print("="*80)
print(f"Learning Rate: {best_by_r2['learning_rate']}")
print(f"Regularization Strength: {best_by_r2['lambda']}")
print(f"Training Cost: {best_by_r2['train_cost']:.6f}")
print(f"Test MSE: {best_by_r2['test_mse']:.6f}")
print(f"Train R¬≤: {best_by_r2['train_r2']:.6f}")
print(f"Test R¬≤: {best_by_r2['test_r2']:.6f}")
print(f"Steps: {int(best_by_r2['iterations'])}")

# ============================================
# PART 6: Train Final Model
# ============================================

optimal_lr = best_by_r2['learning_rate']
optimal_lambda = best_by_r2['lambda']

final_regressor = RidgeRegressionOptimizer(
    learning_rate=optimal_lr,
    lambda_reg=optimal_lambda,
    n_iterations=1000,
    tolerance=1e-8
)
final_regressor.fit(X_train_norm, y_train)

train_final_pred = final_regressor.predict(X_train_norm)
test_final_pred = final_regressor.predict(X_test_norm)

print("\n" + "="*80)
print("FINAL MODEL EVALUATION")
print("="*80)
print(f"Train R¬≤ Score: {r2_score(y_train, train_final_pred):.6f}")
print(f"Test R¬≤ Score: {r2_score(y_test, test_final_pred):.6f}")
print(f"Train MSE: {mean_squared_error(y_train, train_final_pred):.6f}")
print(f"Test MSE: {mean_squared_error(y_test, test_final_pred):.6f}")

# ============================================
# PART 7: Summary of Top Configurations
# ============================================

print("\n" + "="*80)
print("TOP 10 HYPERPARAMETER COMBINATIONS (by Test R¬≤)")
print("="*80)
top_configs = valid_results.nlargest(
    10, 'test_r2'
)[['learning_rate', 'lambda', 'train_cost', 'test_mse', 'train_r2', 'test_r2']]
print(top_configs.to_string(index=False))

# Save results
results_table.to_csv('ridge_regression_results.csv', index=False)
print("\n‚úì Results exported to 'ridge_regression_results.csv'")


Data Dimensions: (1000, 8)

Feature Correlation Matrix:
       X1     X2     X3     X4     X5     X6     X7
X1  1.000  0.914  0.906  0.912  0.915  0.910  0.917
X2  0.914  1.000  0.915  0.919  0.916  0.911  0.910
X3  0.906  0.915  1.000  0.913  0.914  0.910  0.909
X4  0.912  0.919  0.913  1.000  0.910  0.912  0.915
X5  0.915  0.916  0.914  0.910  1.000  0.912  0.916
X6  0.910  0.911  0.910  0.912  0.912  1.000  0.909
X7  0.917  0.910  0.909  0.915  0.916  0.909  1.000

CONDUCTING GRID SEARCH OVER HYPERPARAMETERS


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  mse_loss = (1 / (2 * n)) * np.sum(error ** 2)
  if step > 0 and abs(self.cost_history[-2] - current_cost) < self.tolerance:
  reg_penalty = (self.lambda_reg / (2 * n)) * np.sum(weights ** 2)
  weight_gradient = (1 / n) * (X.T @ residuals) + \
  predictions = X @ weights + bias
  predictions = X @ self.weights + self.bias
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  mse_loss = (1 / (2 * n)) * np.sum(error ** 2)
  if step > 0 and abs(self.cost_history[-2] - current_cost) < self.tolerance:
  reg_penalty = (self.lambda_reg / (2 * n)) * np.sum(weights ** 2)
  weight_gradient = (1 / n) * (X.T @ residuals) + \
  predictions = X @ weights + bias
  predictions = X @ self.weights + self.bias
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  mse_loss = (1 / (2 * n)) * np.sum(error ** 2)
  if st


OPTIMAL PARAMETERS (Minimum Cost)
Learning Rate: 0.1
Regularization Strength: 1e-15
Training Cost: 0.126831
Test MSE: 0.258424
Train R¬≤: 0.938961
Test R¬≤: 0.934680
Steps: 737

OPTIMAL PARAMETERS (Maximum Test R¬≤)
Learning Rate: 0.1
Regularization Strength: 1e-15
Training Cost: 0.126831
Test MSE: 0.258424
Train R¬≤: 0.938961
Test R¬≤: 0.934680
Steps: 737

FINAL MODEL EVALUATION
Train R¬≤ Score: 0.938961
Test R¬≤ Score: 0.934680
Train MSE: 0.253661
Test MSE: 0.258424

TOP 10 HYPERPARAMETER COMBINATIONS (by Test R¬≤)
 learning_rate       lambda  train_cost  test_mse  train_r2  test_r2
          0.10 1.000000e-15    0.126831  0.258424  0.938961 0.934680
          0.10 0.000000e+00    0.126831  0.258424  0.938961 0.934680
          0.10 1.000000e-10    0.126831  0.258424  0.938961 0.934680
          0.10 1.000000e-05    0.126831  0.258424  0.938961 0.934680
          0.10 1.000000e-03    0.126832  0.258424  0.938961 0.934680
          0.10 1.000000e+00    0.128427  0.258508  0.938949 0.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


# ============================================
# STEP 1: Load & Clean Dataset
# ============================================

dataset = pd.read_csv('/content/Hitters.csv')
print("="*70)
print("BASEBALL SALARY PREDICTION: LINEAR vs RIDGE vs LASSO")
print("="*70)

print("\n[STEP 1] DATASET ACQUISITION & CLEANING")
print(f"  Initial Records: {len(dataset):,}")
print(f"  Total Null Values: {dataset.isnull().sum().sum()}")

# Remove missing records
dataset = dataset.dropna()
print(f"  After Null Removal: {len(dataset):,}")

# Eliminate duplicates
dataset = dataset.drop_duplicates()
print(f"  After Deduplication: {len(dataset):,}")

# Transform categorical variables to numeric
string_features = [col for col in dataset.columns if dataset[col].dtype == 'object']
for feature in string_features:
    encoder = LabelEncoder()
    dataset[feature] = encoder.fit_transform(dataset[feature])

print(f"  Categorical Features Encoded: {len(string_features)}")
print(f"  ‚úì Data cleaning complete\n")


# ============================================
# STEP 2: Split & Normalize
# ============================================

print("[STEP 2] FEATURE ENGINEERING & NORMALIZATION")

output = 'Salary'
inputs = dataset.drop(columns=[output])
target = dataset[output]

print(f"  Input Features (X): {inputs.shape[0]:,} samples √ó {inputs.shape[1]} variables")
print(f"  Output Variable (y): {target.shape[0]:,} samples")

# Partition dataset
X_tr, X_te, y_tr, y_te = train_test_split(inputs, target, test_size=0.2, random_state=42)

# Standardize features
norm = StandardScaler()
X_tr_norm = norm.fit_transform(X_tr)
X_te_norm = norm.transform(X_te)

print(f"  Train Split: {X_tr_norm.shape[0]:,} samples")
print(f"  Test Split: {X_te_norm.shape[0]:,} samples")
print(f"  Normalization: StandardScaler applied\n")


# ============================================
# STEP 3: Build Regression Models
# ============================================

print("[STEP 3] MODEL CONSTRUCTION & TRAINING")
print(f"  Regularization Parameter (Œ±): 0.5748\n")

# Define models
regressors = {
    'Vanilla Linear': LinearRegression(),
    'Ridge (L2)': Ridge(alpha=0.5748, random_state=42),
    'Lasso (L1)': Lasso(alpha=0.5748, random_state=42, max_iter=10000)
}

metrics_collection = {}

for model_label, regressor in regressors.items():
    print(f"  {model_label}")
    print(f"    " + "-"*60)

    # Train
    regressor.fit(X_tr_norm, y_tr)

    # Forecast
    y_tr_hat = regressor.predict(X_tr_norm)
    y_te_hat = regressor.predict(X_te_norm)

    # Compute metrics
    r2_train = r2_score(y_tr, y_tr_hat)
    r2_test = r2_score(y_te, y_te_hat)
    rmse_train = np.sqrt(mean_squared_error(y_tr, y_tr_hat))
    rmse_test = np.sqrt(mean_squared_error(y_te, y_te_hat))
    overfit = r2_train - r2_test

    metrics_collection[model_label] = {
        'R¬≤ (Train)': r2_train,
        'R¬≤ (Test)': r2_test,
        'RMSE (Train)': rmse_train,
        'RMSE (Test)': rmse_test,
        'Overfitting': overfit
    }

    print(f"    Train: R¬≤={r2_train:.5f} | RMSE=${rmse_train:.2f}K")
    print(f"    Test:  R¬≤={r2_test:.5f} | RMSE=${rmse_test:.2f}K")
    print(f"    Generalization Gap: {overfit:.5f}\n")


# ============================================
# STEP 4: Comparative Analysis
# ============================================

print("="*70)
print("[STEP 4] PERFORMANCE COMPARISON")
print("="*70 + "\n")

summary_table = pd.DataFrame(metrics_collection).T
print(summary_table.round(5).to_string())

# Identify champion
champion = summary_table['R¬≤ (Test)'].idxmax()
champion_r2 = summary_table.loc[champion, 'R¬≤ (Test)']

print(f"\n{'ü•á CHAMPION MODEL':^70}")
print(f"{'-'*70}")
print(f"Model: {champion}")
print(f"Test R¬≤: {champion_r2:.5f} ({champion_r2*100:.2f}% variance explained)")
print(f"Test RMSE: ${summary_table.loc[champion, 'RMSE (Test)']:.2f}K\n")

# Ranking
rank_df = summary_table.sort_values('R¬≤ (Test)', ascending=False)
print(f"{'MODEL RANKING':^70}")
for idx, (model_name, metrics) in enumerate(rank_df.iterrows(), 1):
    print(f"  #{idx} | {model_name:<25} | Test R¬≤: {metrics['R¬≤ (Test)']:.5f}")


# ============================================
# STEP 5: Export & Summary
# ============================================

print(f"\n{'='*70}")
print("[STEP 5] RESULTS EXPORT")
print(f"{'='*70}")

summary_table.to_csv('hitters_regression_comparison.csv')
print("\n‚úì Comparison metrics exported ‚Üí 'hitters_regression_comparison.csv'")

print(f"\nKEY METRICS:")
print(f"  ‚Ä¢ Best Model: {champion}")
print(f"  ‚Ä¢ Variance Explained: {champion_r2*100:.2f}%")
print(f"  ‚Ä¢ Test RMSE: ${summary_table.loc[champion, 'RMSE (Test)']:.2f}K")
print(f"  ‚Ä¢ Generalization Consistency: {(1-summary_table.loc[champion, 'Overfitting'])*100:.1f}%")


BASEBALL SALARY PREDICTION: LINEAR vs RIDGE vs LASSO

[STEP 1] DATASET ACQUISITION & CLEANING
  Initial Records: 322
  Total Null Values: 59
  After Null Removal: 263
  After Deduplication: 263
  Categorical Features Encoded: 3
  ‚úì Data cleaning complete

[STEP 2] FEATURE ENGINEERING & NORMALIZATION
  Input Features (X): 263 samples √ó 19 variables
  Output Variable (y): 263 samples
  Train Split: 210 samples
  Test Split: 53 samples
  Normalization: StandardScaler applied

[STEP 3] MODEL CONSTRUCTION & TRAINING
  Regularization Parameter (Œ±): 0.5748

  Vanilla Linear
    ------------------------------------------------------------
    Train: R¬≤=0.59047 | RMSE=$291.83K
    Test:  R¬≤=0.29075 | RMSE=$358.17K
    Generalization Gap: 0.29972

  Ridge (L2)
    ------------------------------------------------------------
    Train: R¬≤=0.58816 | RMSE=$292.65K
    Test:  R¬≤=0.30004 | RMSE=$355.81K
    Generalization Gap: 0.28812

  Lasso (L1)
    ----------------------------------------

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt


print("CROSS VALIDATION FOR RIDGE AND LASSO REGRESSION")
print("Boston House Prediction Dataset")


# STEP 1: Load Boston Housing Dataset
print("\nSTEP 1: LOADING BOSTON HOUSING DATASET")

# Load from online CSV (sklearn removed load_boston in v1.2+)
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')
X = df.drop('medv', axis=1).values
y = df['medv'].values
feature_names = df.drop('medv', axis=1).columns.tolist()

print(f"Dataset shape: {X.shape}")
print(f"Features: {list(feature_names)}")
print(f"Target: PRICE (Median house value in $1000s)")


# STEP 2: Data Preprocessing
print("\nSTEP 2: DATA PREPROCESSING")


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


print(f"Train: {len(X_train)} | Test: {len(X_test)}")
print("Features scaled")


# STEP 3: Ridge Cross Validation (RidgeCV)
print("\nSTEP 3: RIDGE CROSS VALIDATION (RidgeCV)")


alphas = np.logspace(-4, 4, 100)  # 100 alpha values
ridge_cv = RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_scaled, y_train)


print(f"5-fold cross-validation completed")
print(f"Best alpha: {ridge_cv.alpha_:.6f}")


# Train with best alpha
ridge_best = Ridge(alpha=ridge_cv.alpha_)
ridge_best.fit(X_train_scaled, y_train)


y_test_pred_ridge = ridge_best.predict(X_test_scaled)
ridge_test_r2 = r2_score(y_test, y_test_pred_ridge)
ridge_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_ridge))


print(f"Test R¬≤: {ridge_test_r2:.6f}")
print(f"Test RMSE: {ridge_test_rmse:.4f}")


# STEP 4: Lasso Cross Validation (LassoCV)
print("\nSTEP 4: LASSO CROSS VALIDATION (LassoCV)")


lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=42, max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)


print(f"5-fold cross-validation completed")
print(f"Best alpha: {lasso_cv.alpha_:.6f}")


# Train with best alpha
lasso_best = Lasso(alpha=lasso_cv.alpha_, max_iter=10000)
lasso_best.fit(X_train_scaled, y_train)


y_test_pred_lasso = lasso_best.predict(X_test_scaled)
lasso_test_r2 = r2_score(y_test, y_test_pred_lasso)
lasso_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_lasso))


print(f"Test R¬≤: {lasso_test_r2:.6f}")
print(f"Test RMSE: {lasso_test_rmse:.4f}")


# Feature selection
non_zero = np.sum(np.abs(lasso_best.coef_) > 1e-10)
print(f"Features selected: {non_zero}/{len(lasso_best.coef_)}")


# STEP 5: Model Comparison
print("\nMODEL COMPARISON")


results = pd.DataFrame({
    'Ridge (CV)': {
        'Optimal Alpha': ridge_cv.alpha_,
        'Test R¬≤': ridge_test_r2,
        'Test RMSE': ridge_test_rmse
    },
    'Lasso (CV)': {
        'Optimal Alpha': lasso_cv.alpha_,
        'Test R¬≤': lasso_test_r2,
        'Test RMSE': lasso_test_rmse
    }
}).T


print(results.round(6))


best_model = results['Test R¬≤'].idxmax()
print(f"\nBEST MODEL: {best_model}")
print(f"Test R¬≤: {results.loc[best_model, 'Test R¬≤']:.6f}")


# Feature importance
print("\nFEATURE IMPORTANCE")


print("\nTop 5 Ridge coefficients:")
ridge_imp = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': ridge_best.coef_
}).sort_values('Coefficient', key=abs, ascending=False).head(5)
print(ridge_imp.to_string(index=False))


print("\nTop 5 Lasso coefficients:")
lasso_imp = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': lasso_best.coef_
}).sort_values('Coefficient', key=abs, ascending=False).head(5)
print(lasso_imp.to_string(index=False))


print("\nANALYSIS COMPLETE")


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


# ================================================
# REAL ESTATE VALUATION: REGULARIZED REGRESSION
# ================================================

print("="*75)
print("PREDICTIVE MODELING WITH REGULARIZED LINEAR REGRESSION")
print("Dataset: Boston Housing Market Analysis")
print("="*75)


# ================================================
# PHASE 1: DATA ACQUISITION
# ================================================

print("\n[PHASE 1] ACQUIRING & PREPARING DATA")
print("-"*75)

# Retrieve housing dataset
housing_data = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')
predictors = housing_data.drop('medv', axis=1).values
outcomes = housing_data['medv'].values
attribute_list = housing_data.drop('medv', axis=1).columns.tolist()

print(f"Dataset Dimensions: {predictors.shape[0]} properties √ó {predictors.shape[1]} attributes")
print(f"Target Variable: Median Home Value (thousands USD)")
print(f"Predictive Features: {', '.join(attribute_list[:3])}...+{len(attribute_list)-3} more")


# ================================================
# PHASE 2: DATA SPLITTING & NORMALIZATION
# ================================================

print("\n[PHASE 2] TRAIN-TEST PARTITIONING & SCALING")
print("-"*75)

X_train, X_test, y_train, y_test = train_test_split(
    predictors, outcomes, test_size=0.2, random_state=42
)

normalizer = StandardScaler()
X_train_normalized = normalizer.fit_transform(X_train)
X_test_normalized = normalizer.transform(X_test)

print(f"Training Subset: {X_train_normalized.shape[0]} samples")
print(f"Testing Subset: {X_test_normalized.shape[0]} samples")
print(f"Scaling Applied: StandardScaler (mean=0, std=1)")


# ================================================
# PHASE 3: RIDGE REGRESSION WITH CV
# ================================================

print("\n[PHASE 3] RIDGE REGRESSION - HYPERPARAMETER OPTIMIZATION")
print("-"*75)

alpha_candidates = np.logspace(-4, 4, 100)
ridge_optimizer = RidgeCV(alphas=alpha_candidates, cv=5, scoring='neg_mean_squared_error')
ridge_optimizer.fit(X_train_normalized, y_train)

optimal_ridge_alpha = ridge_optimizer.alpha_

ridge_model = Ridge(alpha=optimal_ridge_alpha)
ridge_model.fit(X_train_normalized, y_train)

ridge_predictions = ridge_model.predict(X_test_normalized)
ridge_r2 = r2_score(y_test, ridge_predictions)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_predictions))

print(f"Validation Strategy: 5-Fold Cross-Validation")
print(f"Alpha Search Space: [{alpha_candidates[0]:.2e}, {alpha_candidates[-1]:.2e}]")
print(f"Optimal Alpha: {optimal_ridge_alpha:.6f}")
print(f"‚îú‚îÄ Test R¬≤ Score: {ridge_r2:.6f}")
print(f"‚îî‚îÄ Test RMSE: ${ridge_rmse:.4f}K")


# ================================================
# PHASE 4: LASSO REGRESSION WITH CV
# ================================================

print("\n[PHASE 4] LASSO REGRESSION - HYPERPARAMETER OPTIMIZATION")
print("-"*75)

lasso_optimizer = LassoCV(alphas=alpha_candidates, cv=5, random_state=42, max_iter=10000)
lasso_optimizer.fit(X_train_normalized, y_train)

optimal_lasso_alpha = lasso_optimizer.alpha_

lasso_model = Lasso(alpha=optimal_lasso_alpha, max_iter=10000)
lasso_model.fit(X_train_normalized, y_train)

lasso_predictions = lasso_model.predict(X_test_normalized)
lasso_r2 = r2_score(y_test, lasso_predictions)
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_predictions))

active_features = np.sum(np.abs(lasso_model.coef_) > 1e-10)

print(f"Validation Strategy: 5-Fold Cross-Validation")
print(f"Alpha Search Space: [{alpha_candidates[0]:.2e}, {alpha_candidates[-1]:.2e}]")
print(f"Optimal Alpha: {optimal_lasso_alpha:.6f}")
print(f"‚îú‚îÄ Test R¬≤ Score: {lasso_r2:.6f}")
print(f"‚îú‚îÄ Test RMSE: ${lasso_rmse:.4f}K")
print(f"‚îî‚îÄ Active Features: {active_features} out of {len(lasso_model.coef_)}")


# ================================================
# PHASE 5: COMPARATIVE EVALUATION
# ================================================

print("\n[PHASE 5] PERFORMANCE METRICS & COMPARISON")
print("-"*75)

comparison = pd.DataFrame({
    'Ridge (L2)': {
        'Optimal Œ±': optimal_ridge_alpha,
        'Test R¬≤': ridge_r2,
        'Test RMSE': ridge_rmse,
        'Features': len(ridge_model.coef_)
    },
    'Lasso (L1)': {
        'Optimal Œ±': optimal_lasso_alpha,
        'Test R¬≤': lasso_r2,
        'Test RMSE': lasso_rmse,
        'Features': active_features
    }
}).T

print(comparison.round(6).to_string())

winner = comparison['Test R¬≤'].idxmax()
best_score = comparison.loc[winner, 'Test R¬≤']

print(f"\n{'‚≠ê SUPERIOR MODEL':^75}")
print(f"{winner} | R¬≤ = {best_score:.6f}")


# ================================================
# PHASE 6: COEFFICIENT ANALYSIS
# ================================================

print("\n[PHASE 6] FEATURE CONTRIBUTION ANALYSIS")
print("-"*75)

print("\nRIDGE - Top 5 Influential Features:")
ridge_features = pd.DataFrame({
    'Feature': attribute_list,
    'Weight': ridge_model.coef_
}).reindex(pd.DataFrame({
    'Feature': attribute_list,
    'Weight': ridge_model.coef_
}).sort_values('Weight', key=abs, ascending=False).index).head(5)

for i, (_, row) in enumerate(ridge_features.iterrows(), 1):
    print(f"  {i}. {row['Feature']:<15} ‚Üí {row['Weight']:>10.6f}")

print("\nLASSO - Top 5 Influential Features:")
lasso_features = pd.DataFrame({
    'Feature': attribute_list,
    'Weight': lasso_model.coef_
}).reindex(pd.DataFrame({
    'Feature': attribute_list,
    'Weight': lasso_model.coef_
}).sort_values('Weight', key=abs, ascending=False).index).head(5)

for i, (_, row) in enumerate(lasso_features.iterrows(), 1):
    print(f"  {i}. {row['Feature']:<15} ‚Üí {row['Weight']:>10.6f}")


# ================================================
# SUMMARY
# ================================================

print("\n" + "="*75)
print("ANALYSIS SUMMARY")
print("="*75)
print(f"Ridge Œ±: {optimal_ridge_alpha:.6f} ‚Üí R¬≤: {ridge_r2:.4f} | RMSE: ${ridge_rmse:.2f}K")
print(f"Lasso Œ±: {optimal_lasso_alpha:.6f} ‚Üí R¬≤: {lasso_r2:.4f} | RMSE: ${lasso_rmse:.2f}K")
print("\n‚úì Cross-validation complete | Models ready for deployment")


PREDICTIVE MODELING WITH REGULARIZED LINEAR REGRESSION
Dataset: Boston Housing Market Analysis

[PHASE 1] ACQUIRING & PREPARING DATA
---------------------------------------------------------------------------
Dataset Dimensions: 506 properties √ó 13 attributes
Target Variable: Median Home Value (thousands USD)
Predictive Features: crim, zn, indus...+10 more

[PHASE 2] TRAIN-TEST PARTITIONING & SCALING
---------------------------------------------------------------------------
Training Subset: 404 samples
Testing Subset: 102 samples
Scaling Applied: StandardScaler (mean=0, std=1)

[PHASE 3] RIDGE REGRESSION - HYPERPARAMETER OPTIMIZATION
---------------------------------------------------------------------------
Validation Strategy: 5-Fold Cross-Validation
Alpha Search Space: [1.00e-04, 1.00e+04]
Optimal Alpha: 2.310130
‚îú‚îÄ Test R¬≤ Score: 0.668074
‚îî‚îÄ Test RMSE: $4.9337K

[PHASE 4] LASSO REGRESSION - HYPERPARAMETER OPTIMIZATION
-----------------------------------------------------

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ================================================================
# FLORAL CLASSIFICATION SYSTEM - IRIS SPECIES RECOGNITION
# ================================================================

print("="*80)
print("MULTICLASS PATTERN RECOGNITION: IRIS FLOWER CLASSIFICATION")
print("Methodology: Logistic Regression with One-vs-Rest Strategy")
print("="*80)


# ================================================================
# SEGMENT A: DATASET INITIALIZATION
# ================================================================

print("\n[SEGMENT A] BOTANICAL DATASET ACQUISITION")
print("-"*80)

# Import iris flower measurements
iris_collection = load_iris()
measurements = iris_collection.data
classifications = iris_collection.target
measurement_names = iris_collection.feature_names
species_names = iris_collection.target_names

print(f"Total Specimens: {measurements.shape[0]}")
print(f"Attributes Per Specimen: {measurements.shape[1]}")
print(f"Species Categories: {len(species_names)}")
print(f"\nFloral Attributes:")
for idx, attr in enumerate(measurement_names, 1):
    print(f"  {idx}. {attr}")
print(f"\nTarget Species:")
for idx, species in enumerate(species_names):
    print(f"  {idx} ‚Üí {species}")


# ================================================================
# SEGMENT B: DATASET PARTITIONING & NORMALIZATION
# ================================================================

print("\n[SEGMENT B] TRAIN-TEST STRATIFICATION & FEATURE SCALING")
print("-"*80)

# Stratified split to preserve class distribution
train_measurements, test_measurements, train_labels, test_labels = train_test_split(
    measurements, classifications, test_size=0.2, random_state=42, stratify=classifications
)

print(f"Training Cohort: {train_measurements.shape[0]} specimens")
print(f"Testing Cohort: {test_measurements.shape[0]} specimens")

# Standardization pipeline
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_measurements)
test_scaled = scaler.transform(test_measurements)

print(f"Normalization Method: StandardScaler (Œº=0, œÉ=1)")
print(f"‚úì Data preparation complete")


# ================================================================
# SEGMENT C: MULTICLASS LOGISTIC REGRESSION (ONE-VS-REST)
# ================================================================

print("\n[SEGMENT C] CLASSIFIER TRAINING - ONE-VS-REST APPROACH")
print("-"*80)

# Initialize OvR classifier
classifier = LogisticRegression(
    multi_class='ovr',
    solver='lbfgs',
    max_iter=200,
    random_state=42
)

print("Training configuration:")
print(f"  ‚Ä¢ Strategy: One-vs-Rest (OvR)")
print(f"  ‚Ä¢ Binary Classifiers: {len(species_names)}")
print(f"  ‚Ä¢ Optimization: LBFGS")
print(f"  ‚Ä¢ Max Iterations: 200")
print(f"\nFitting model on training data...")

classifier.fit(train_scaled, train_labels)

print("‚úì Model convergence achieved")


# ================================================================
# SEGMENT D: PREDICTIVE PERFORMANCE ANALYSIS
# ================================================================

print("\n[SEGMENT D] CLASSIFICATION ACCURACY & DIAGNOSTICS")
print("-"*80)

# Generate predictions
train_predictions = classifier.predict(train_scaled)
test_predictions = classifier.predict(test_scaled)
confidence_scores = classifier.predict_proba(test_scaled)

# Compute metrics
train_acc = accuracy_score(train_labels, train_predictions)
test_acc = accuracy_score(test_labels, test_predictions)

print(f"\nAccuracy Metrics:")
print(f"  Training Set:  {train_acc*100:>6.2f}%")
print(f"  Testing Set:   {test_acc*100:>6.2f}%")
print(f"  Generalization Gap: {(train_acc - test_acc)*100:>5.2f}%")

print(f"\nDetailed Classification Breakdown:")
print(classification_report(test_labels, test_predictions, target_names=species_names))

print("\nPrediction Error Matrix:")
error_matrix = confusion_matrix(test_labels, test_predictions)
error_df = pd.DataFrame(error_matrix, index=species_names, columns=species_names)
print(error_df.to_string())


# ================================================================
# SEGMENT E: BINARY CLASSIFIER INSPECTION
# ================================================================

print("\n[SEGMENT E] ONE-VS-REST BINARY DECISION BOUNDARIES")
print("-"*80)

model_weights = classifier.coef_
model_biases = classifier.intercept_

for class_idx, species in enumerate(species_names):
    print(f"\nüìä Classifier #{class_idx + 1}: {species.upper()} vs Others")
    print(f"  Bias Term: {model_biases[class_idx]:>8.4f}")
    print(f"  Feature Weights:")

    for feat_idx, feat_name in enumerate(measurement_names):
        weight_val = model_weights[class_idx][feat_idx]
        direction = "‚Üë" if weight_val > 0 else "‚Üì"
        print(f"    {direction} {feat_name:<25} {weight_val:>10.4f}")


# ================================================================
# SEGMENT F: FEATURE CONTRIBUTION RANKING
# ================================================================

print("\n[SEGMENT F] GLOBAL FEATURE IMPORTANCE SCORES")
print("-"*80)

# Calculate mean absolute weights across all classifiers
importance_scores = np.abs(model_weights).mean(axis=0)
importance_ranking = pd.DataFrame({
    'Measurement': measurement_names,
    'Impact_Score': importance_scores
}).sort_values('Impact_Score', ascending=False).reset_index(drop=True)

print("\nRanking (Most to Least Discriminative):")
for rank, (_, row) in enumerate(importance_ranking.iterrows(), 1):
    bar_length = int(row['Impact_Score'] * 50)
    bar = "‚ñà" * bar_length
    print(f"  {rank}. {row['Measurement']:<25} {bar} {row['Impact_Score']:.4f}")


# ================================================================
# SEGMENT G: INDIVIDUAL SPECIMEN ANALYSIS
# ================================================================

print("\n[SEGMENT G] SAMPLE CLASSIFICATION RESULTS")
print("-"*80)

display_count = min(5, len(test_measurements))
for sample_num in range(display_count):
    actual_species = species_names[test_labels[sample_num]]
    predicted_species = species_names[test_predictions[sample_num]]
    probabilities = confidence_scores[sample_num]
    match_status = "‚úì MATCH" if test_labels[sample_num] == test_predictions[sample_num] else "‚úó MISMATCH"

    print(f"\nSpecimen #{sample_num + 1}:")
    print(f"  Ground Truth: {actual_species}")
    print(f"  Model Output: {predicted_species}")
    print(f"  Status: {match_status}")
    print(f"  Confidence Distribution:")

    for class_idx, species in enumerate(species_names):
        conf_pct = probabilities[class_idx] * 100
        bar_len = int(conf_pct / 5)
        bar = "‚ñ¨" * bar_len
        print(f"    ‚Ä¢ {species:<15} {bar} {conf_pct:>6.2f}%")


# ================================================================
# RESULTS SUMMARY
# ================================================================

print("\n" + "="*80)
print("CLASSIFICATION PIPELINE COMPLETE")
print("="*80)

print(f"\nüìà Model Performance Summary:")
print(f"   Dataset: Iris Flower Database (150 specimens)")
print(f"   Attributes: 4 morphological measurements")
print(f"   Target Classes: 3 species categories")
print(f"   Classification Method: Logistic Regression (OvR)")
print(f"   Test Accuracy: {test_acc*100:.2f}%")

print(f"\n‚≠ê Top Predictive Features:")
for rank, (_, row) in enumerate(importance_ranking.head(2).iterrows(), 1):
    print(f"   {rank}. {row['Measurement']}")

print(f"\n‚úì All classifiers trained and evaluated successfully")


MULTICLASS PATTERN RECOGNITION: IRIS FLOWER CLASSIFICATION
Methodology: Logistic Regression with One-vs-Rest Strategy

[SEGMENT A] BOTANICAL DATASET ACQUISITION
--------------------------------------------------------------------------------
Total Specimens: 150
Attributes Per Specimen: 4
Species Categories: 3

Floral Attributes:
  1. sepal length (cm)
  2. sepal width (cm)
  3. petal length (cm)
  4. petal width (cm)

Target Species:
  0 ‚Üí setosa
  1 ‚Üí versicolor
  2 ‚Üí virginica

[SEGMENT B] TRAIN-TEST STRATIFICATION & FEATURE SCALING
--------------------------------------------------------------------------------
Training Cohort: 120 specimens
Testing Cohort: 30 specimens
Normalization Method: StandardScaler (Œº=0, œÉ=1)
‚úì Data preparation complete

[SEGMENT C] CLASSIFIER TRAINING - ONE-VS-REST APPROACH
--------------------------------------------------------------------------------
Training configuration:
  ‚Ä¢ Strategy: One-vs-Rest (OvR)
  ‚Ä¢ Binary Classifiers: 3
  ‚Ä¢ Op

