<a href="https://colab.research.google.com/github/manansharma2711/UML-501-ML/blob/main/MLAssignment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Q 1 (Based on Step-by-Step Implementation of Ridge Regression using Gradient Descent Optimization) Generate a dataset with at least seven highly correlated columns and a target variable. Implement Ridge Regression using Gradient Descent Optimization. Take different values of the learning rate (such as 0.0001, 0.001, 0.01, 0.1, 1, 10) and of the regularization parameter ($10^{-15}$, $10^{-10}$, $10^{-5}$, $10^{-3}$, 0, 1, 10, 20). Choose the best parameters, i.e. those for which the ridge regression cost function is minimum and the R2 score is maximum.

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Build a synthetic regression problem with strongly collinear features.
# NOTE: the statement order below fixes the sequence of seeded random draws,
# so reordering any of the np.random calls changes the generated data.
np.random.seed(0)
N = 500
# Shared latent signal from which every feature column is derived.
z_val = np.random.randn(N)
# Seven columns: the latent signal plus a small (0.01-scaled) noise term each,
# which makes them almost identical to one another.
features = np.column_stack([z_val + 0.01 * np.random.randn(N) for _ in range(7)])
# Eighth column: half the latent signal plus larger (0.2-scaled) noise.
features = np.column_stack([features, 0.5 * z_val + 0.2 * np.random.randn(N)])
# Ground-truth coefficients, one per feature column (8 in total).
true_weights = np.array([3, -2, 1, 0, 0.5, -1, 2, 4], dtype=float)
# Target is the linear combination of the features plus 0.5-scaled Gaussian noise.
target = features.dot(true_weights) + 0.5 * np.random.randn(N)

# Hold out 25% of the rows for evaluation.
X_tr, X_te, y_tr, y_te = train_test_split(features, target, test_size=0.25, random_state=1)

# Standardise both splits using statistics computed on the training split only.
mean_vals = X_tr.mean(axis=0)
std_vals = X_tr.std(axis=0)
# Replace zero or non-finite standard deviations with 1.0 so the division
# below never divides by zero.
std_vals = np.where(np.isfinite(std_vals) & (std_vals > 0), std_vals, 1.0)
X_tr = (X_tr - mean_vals) / std_vals
X_te = (X_te - mean_vals) / std_vals

def ridge_reg(X, y, alpha, lam, iters=2000):
    """Fit ridge regression with full-batch gradient descent.

    Minimises ``(1/m) * sum((X @ w + b - y) ** 2) + lam * sum(w ** 2)``.
    The bias ``b`` is not penalised.

    Args:
        X: Array-like of shape (m, n) holding the feature matrix.
        y: Array-like of shape (m,) holding the targets.
        alpha: Learning rate (gradient-descent step size).
        lam: L2 regularisation strength applied to the weights.
        iters: Number of gradient-descent steps to run.

    Returns:
        A ``(wts, bias)`` tuple with the learned weight vector of shape (n,)
        and the scalar bias, or ``None`` if the parameters became non-finite
        (the iteration diverged for this ``alpha``/``lam``).

    Raises:
        ValueError: If ``X`` is not a non-empty 2-D array or ``y`` does not
            have shape (m,).
    """
    # np.asarray accepts lists/tuples as well as ndarrays (``.astype`` did not).
    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError("X must be a non-empty 2-D array of shape (m, n)")
    m, n = X.shape
    if y.shape != (m,):
        raise ValueError(f"y must have shape ({m},), got {y.shape}")

    wts = np.zeros(n, dtype=np.float64)
    bias = 0.0
    # Divergence is detected explicitly below and reported by returning None,
    # so the overflow/invalid floating-point warnings would only be noise.
    with np.errstate(over="ignore", invalid="ignore"):
        for _ in range(iters):
            preds = X.dot(wts) + bias
            diff = preds - y
            # Gradient of the MSE term plus gradient of the L2 penalty.
            grad_wts = (2 / m) * (X.T.dot(diff)) + 2 * lam * wts
            grad_bias = (2 / m) * diff.sum()
            wts -= alpha * grad_wts
            bias -= alpha * grad_bias
            if not (np.isfinite(wts).all() and np.isfinite(bias)):
                return None
    return wts, bias

# Grid search over learning rate and L2 strength. Each fitted model is scored
# with R2 on the held-out split (X_te, y_te), and the best-scoring pair is kept.
alphas = [0.0001, 0.001, 0.01, 0.1]
lambdas = [0, 0.001, 0.01, 0.1, 1, 10]
# R2 has no lower bound, so start from -inf rather than -1; otherwise a grid
# whose candidates all score below -1 would leave opt_params as None.
top_r2 = -np.inf
opt_params = None

for a in alphas:
    for lam in lambdas:
        outcome = ridge_reg(X_tr, y_tr, a, lam)
        # ridge_reg signals a diverged run by returning None; skip it.
        if outcome is None:
            continue
        wts, bias = outcome
        preds = X_te.dot(wts) + bias
        if not np.isfinite(preds).all():
            continue
        r2_val = r2_score(y_te, preds)
        # Strict '>' keeps the first combination encountered on ties.
        if r2_val > top_r2:
            top_r2 = r2_val
            opt_params = (a, lam, r2_val)

print("Optimal LR, Lambda, R2 =", opt_params)

Optimal LR, Lambda, R2 = (0.1, 0, 0.9920703853817209)


Q 2 Load the Hitters dataset from the following link: https://drive.google.com/file/d/1qzCKF6JKKMB0p7ul_lLy8tdmRk3vE_bG/view?usp=sharing (a) Pre-process the data (null values, noise, categorical-to-numerical encoding). (b) Separate the input and output features and perform scaling. (c) Fit a Linear, a Ridge (use 0.5748 as the regularization parameter), and a LASSO (use 0.5748 as the regularization parameter) regression function on the dataset. (d) Evaluate the performance of each trained model on the test set. Which model performs best, and why?

In [2]:
import pandas as pd
import requests
import kagglehub

import os

# Fetch the Hitters dataset through kagglehub and locate the CSV inside the
# returned directory. os.path.join is used instead of manual "/" concatenation.
path = kagglehub.dataset_download("floser/hitters")
data = pd.read_csv(os.path.join(path, "Hitters.csv"))

# (a) Pre-processing: rows without a target are dropped, remaining numeric gaps
# are filled with the column median, and the three categorical columns are
# replaced by their integer category codes.
data = data.dropna(subset=["Salary"])
data = data.fillna(data.median(numeric_only=True))
for col in ["League", "Division", "NewLeague"]:
    data[col] = data[col].astype("category").cat.codes

# (b) Separate inputs/target, hold out 25% for testing, and scale the inputs
# with statistics fitted on the training split only.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = data.drop("Salary", axis=1)
y = data["Salary"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# (c) Fit the three linear models; Ridge and Lasso share alpha = 0.5748.
from sklearn.linear_model import LinearRegression, Ridge, Lasso
model_lr = LinearRegression().fit(X_train, y_train)
model_ridge = Ridge(alpha=0.5748).fit(X_train, y_train)
model_lasso = Lasso(alpha=0.5748, max_iter=5000).fit(X_train, y_train)

# (d) Report test-set MSE and R2 for each model.
from sklearn.metrics import mean_squared_error, r2_score
for name, mdl in [("Linear", model_lr), ("Ridge", model_ridge), ("Lasso", model_lasso)]:
    preds = mdl.predict(X_test)
    print(name, mean_squared_error(y_test, preds), r2_score(y_test, preds))

Downloading from https://www.kaggle.com/api/v1/datasets/download/floser/hitters?dataset_version_number=1...


100%|██████████| 8.96k/8.96k [00:00<00:00, 10.5MB/s]

Extracting files...





Linear 131898.53472017136 0.5531784057871476
Ridge 128967.77296468576 0.5631067013714948
Lasso 128572.34347396714 0.5644462646639643


In [3]:
import pandas as pd
import kagglehub
import os

# Resolve the dataset directory via kagglehub, then the CSV file inside it.
path = kagglehub.dataset_download("floser/hitters")
output = os.path.join(path, 'Hitters.csv')

# Load the data and discard rows that have no Salary value.
data = pd.read_csv(output).dropna(subset=["Salary"])
# Fill any remaining numeric gaps with the per-column median.
numeric_medians = data.median(numeric_only=True)
data = data.fillna(numeric_medians)
# Encode the three categorical columns as integer category codes.
for col in ("League", "Division", "NewLeague"):
    data[col] = data[col].astype("category").cat.codes

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Target column first, then everything else as the inputs.
y = data["Salary"]
X = data.drop(columns="Salary")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Scaling statistics come from the training split and are reused for the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import LinearRegression, Ridge, Lasso
# Construct and fit each estimator as separate steps.
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
model_ridge = Ridge(alpha=0.5748)
model_ridge.fit(X_train, y_train)
model_lasso = Lasso(alpha=0.5748, max_iter=5000)
model_lasso.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error, r2_score
# Print test MSE and R2 per model, in the order Linear, Ridge, Lasso.
fitted_models = {"Linear": model_lr, "Ridge": model_ridge, "Lasso": model_lasso}
for name, mdl in fitted_models.items():
    preds = mdl.predict(X_test)
    print(name, mean_squared_error(y_test, preds), r2_score(y_test, preds))

Linear 131898.53472017136 0.5531784057871476
Ridge 128967.77296468576 0.5631067013714948
Lasso 128572.34347396714 0.5644462646639643


Q 3 Cross Validation for Ridge and Lasso Regression: Explore the Ridge Cross Validation (RidgeCV) and Lasso Cross Validation (LassoCV) functions of Python. Implement both on the Boston House Prediction Dataset (load_boston dataset from sklearn.datasets).

In [4]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import r2_score
import numpy as np

# Load the Boston housing data from OpenML as plain arrays.
data = fetch_openml("boston", version=1, as_frame=False)
X = data.data
y = data.target.astype(float)

# 75/25 train/test split with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=1)

# Fit the scaler on the training split, then apply it to both splits.
scaler = StandardScaler().fit(X_tr)
X_tr = scaler.transform(X_tr)
X_te = scaler.transform(X_te)

# Candidate regularisation strengths: 13 log-spaced values from 1e-6 to 1e6.
alpha_values = np.logspace(-6, 6, 13)

# Both estimators choose their alpha with 5-fold cross-validation.
ridge_model = RidgeCV(alphas=alpha_values, cv=5)
ridge_model.fit(X_tr, y_tr)
lasso_model = LassoCV(alphas=alpha_values, cv=5, max_iter=5000)
lasso_model.fit(X_tr, y_tr)

# Score both models on the held-out split before reporting.
ridge_r2 = r2_score(y_te, ridge_model.predict(X_te))
lasso_r2 = r2_score(y_te, lasso_model.predict(X_te))

print("Ridge best alpha:", ridge_model.alpha_)
print("Ridge R2:", ridge_r2)
print("Lasso best alpha:", lasso_model.alpha_)
print("Lasso R2:", lasso_r2)

Ridge best alpha: 10.0
Ridge R2: 0.7775108393295395
Lasso best alpha: 0.01
Lasso R2: 0.7787621490259895


Multiclass Logistic Regression: Implement Multiclass Logistic Regression (step-by-step) on the Iris dataset using the one-vs-rest strategy.

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Iris and unpack features and class labels.
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the samples for evaluation.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardise using training-split statistics only.
sc = StandardScaler().fit(X_tr)
X_tr = sc.transform(X_tr)
X_te = sc.transform(X_te)

# One-vs-rest wrapper around a logistic-regression base estimator.
base_estimator = LogisticRegression(max_iter=2000)
clf = OneVsRestClassifier(base_estimator)
clf.fit(X_tr, y_tr)

y_pred = clf.predict(X_te)

# Compute the metrics first, then print them.
test_accuracy = accuracy_score(y_te, y_pred)
report_text = classification_report(y_te, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

Accuracy: 0.9111111111111111

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.94      0.83      0.88        18
           2       0.77      0.91      0.83        11

    accuracy                           0.91        45
   macro avg       0.90      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45

