<a href="https://colab.research.google.com/github/niteshydv01/ML-LAB-102217260/blob/main/ML_Assignment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Q 1 (Based on Step-by-Step Implementation of Ridge Regression using Gradient Descent Optimization)
Generate a dataset with at least seven highly correlated columns and a target variable.
Implement Ridge Regression using Gradient Descent Optimization. Take different
values of learning rate (such as 0.0001, 0.001, 0.01, 0.1, 1, 10) and regularization
parameter ($10^{-15}$, $10^{-10}$, $10^{-5}$, $10^{-3}$, 0, 1, 10, 20). Choose the best
parameters for which the ridge regression cost function is minimum and R2_score is maximum.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [None]:
# Build a synthetic regression problem whose seven features are near-copies
# of one another (each is an earlier column plus small Gaussian jitter).
np.random.seed(42)
n_samples = 1000


def _with_jitter(base, scale=0.01):
    """Return `base` plus zero-mean Gaussian noise with standard deviation `scale`."""
    return base + np.random.normal(0, scale, n_samples)


# NOTE: the order of these draws determines the generated values; keep it.
X1 = np.random.rand(n_samples)
X2 = _with_jitter(X1)
X3 = _with_jitter(X1)
X4 = _with_jitter(X2)
X5 = _with_jitter(X3)
X6 = _with_jitter(X4)
X7 = _with_jitter(X5)
X = np.column_stack([X1, X2, X3, X4, X5, X6, X7])

# The target depends only on the first two columns, plus larger noise.
y = X1 + X2 * 2 + np.random.normal(0, 0.1, n_samples)

# 70/30 split, then standardise using statistics of the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
class RidgeRegressionGD:
    """Ridge (L2-regularised) linear regression trained by batch gradient descent.

    The objective minimised is::

        J(theta, b) = 1/(2m) * sum((X @ theta + b - y)**2)
                      + reg_param/(2m) * sum(theta**2)

    where ``b`` is an unpenalised intercept. Without an intercept the model is
    forced through the origin, which gives a badly biased fit whenever the
    features are centred (e.g. standardised) but the target is not.

    Parameters
    ----------
    learning_rate : float
        Step size of each gradient-descent update.
    n_iters : int
        Number of full-batch updates performed by ``fit``.
    reg_param : float
        L2 penalty strength applied to ``theta`` (not to the intercept).
    fit_intercept : bool
        If True (default) learn a bias term; if False the intercept stays at
        0.0 and the model is a pure ``X @ theta`` fit.

    Attributes set by ``fit``: ``theta`` (one weight per feature) and
    ``intercept`` (scalar bias).
    """

    def __init__(self, learning_rate=0.01, n_iters=1000, reg_param=0, fit_intercept=True):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.reg_param = reg_param
        self.fit_intercept = fit_intercept
        self.theta = None
        self.intercept = 0.0

    def fit(self, X, y):
        """Run ``n_iters`` gradient-descent steps on ``X`` (m x n) and ``y`` (m values)."""
        X = np.asarray(X, dtype=float)
        # ravel() accepts lists, column vectors and pandas Series alike.
        y = np.asarray(y, dtype=float).ravel()

        # Initialize weights
        m, n = X.shape
        self.theta = np.zeros(n)
        self.intercept = 0.0

        # Gradient Descent
        for _ in range(self.n_iters):
            residuals = X.dot(self.theta) + self.intercept - y
            gradient = (X.T.dot(residuals) + self.reg_param * self.theta) / m
            self.theta -= self.learning_rate * gradient
            if self.fit_intercept:
                # dJ/db is the mean residual; the bias is deliberately not regularised.
                self.intercept -= self.learning_rate * residuals.mean()

    def predict(self, X):
        """Return ``X @ theta + intercept`` for the fitted parameters."""
        return X.dot(self.theta) + self.intercept

    def cost_function(self, X, y):
        """Return the ridge cost (halved MSE plus L2 penalty on ``theta``) on ``X``, ``y``."""
        m = len(y)
        y_pred = self.predict(X)
        cost = (1/(2*m)) * np.sum((y_pred - y)**2) + (self.reg_param/(2*m)) * np.sum(self.theta**2)
        return cost

In [None]:
# Exhaustive search over (learning rate, regularisation strength); the pair
# with the highest R2 on the held-out split is kept.
best_r2_score = float('-inf')
best_params = {'learning_rate': None, 'reg_param': None}
best_model = None

# Use smaller learning rates and regularization parameters
learning_rates = [0.0001, 0.001, 0.01, 0.1]  # Remove larger values like 1 and 10
regularization_params = [1e-3, 0.01, 0.1, 1, 10]  # Focus on reasonable regularization values

for lr in learning_rates:
    for reg in regularization_params:
        model = RidgeRegressionGD(learning_rate=lr, n_iters=10000, reg_param=reg)
        model.fit(X_train_scaled, y_train)

        y_pred_test = model.predict(X_test_scaled)

        # Skip diverged runs. A diverging descent can overflow to +/-inf as
        # well as NaN, and r2_score is expected to reject either, so test
        # for any non-finite value rather than NaN alone.
        if not np.all(np.isfinite(y_pred_test)):
            print(f"Skipping due to NaN values for learning_rate: {lr}, reg_param: {reg}")
            continue

        # Compute R2 score
        test_r2 = r2_score(y_test, y_pred_test)

        # Check for the best model
        if test_r2 > best_r2_score:
            best_r2_score = test_r2
            best_params['learning_rate'] = lr
            best_params['reg_param'] = reg
            best_model = model

print(f"Best learning rate: {best_params['learning_rate']}")
print(f"Best regularization parameter: {best_params['reg_param']}")
print(f"Best R2 score: {best_r2_score}")


Best learning rate: 0.0001
Best regularization parameter: 10
Best R2 score: -1.8961014083430814


***
***
####Q 2 Load the Hitters dataset from the following link
https://drive.google.com/file/d/1qzCKF6JKKMB0p7ul_lLy8tdmRk3vE_bG/view?usp=sharing

(a) Pre-process the data (null values, noise, categorical to numerical encoding)

(b) Separate input and output features and perform scaling

(c) Fit a Linear, Ridge (use regularization parameter as 0.5748), and LASSO (use
regularization parameter as 0.5748) regression function on the dataset.

(d) Evaluate the performance of each trained model on test set. Which model performs
the best and Why?

**Note:** the provided dataset link is not working.

***
***
####Q 3 Cross Validation for Ridge and Lasso Regression
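Explore the Ridge Cross Validation (RidgeCV) and Lasso Cross Validation (LassoCV)
functions of Python. Implement both on the Boston House Prediction Dataset (load_boston
dataset from sklearn.datasets).

Note: `load_boston` has been removed from recent scikit-learn releases, so the code
below loads the same dataset with `fetch_openml` instead.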

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Download the Boston housing data from OpenML, convert features and target
# to NumPy arrays, and hold out 20% of the rows for testing.
boston = fetch_openml(name='boston', version=1, as_frame=True)
X, y = np.array(boston.data), np.array(boston.target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Candidate regularisation strengths: 200 log-spaced values from 1e-6 to 1e6.
alphas = np.logspace(start=-6, stop=6, num=200)

In [None]:
# Fit ridge regression, letting RidgeCV choose alpha from the candidate grid,
# then predict on the held-out rows.
ridge_cv = RidgeCV(alphas=alphas).fit(X_train, y_train)
ridge_pred = ridge_cv.predict(X_test)

In [None]:
# Fit lasso with 5-fold cross-validation over the same alpha grid, then
# predict on the held-out rows.
lasso_cv = LassoCV(alphas=alphas, max_iter=10000, cv=5).fit(X_train, y_train)
lasso_pred = lasso_cv.predict(X_test)

In [None]:
# Test-set mean squared error for both cross-validated models.
ridge_mse = mean_squared_error(y_test, ridge_pred)
lasso_mse = mean_squared_error(y_test, lasso_pred)

# Report the selected alpha and the test error of each model.
for label, value in (
    ("Optimal alpha for RidgeCV: ", ridge_cv.alpha_),
    ("Mean Squared Error for RidgeCV: ", ridge_mse),
    ("Optimal alpha for LassoCV: ", lasso_cv.alpha_),
    ("Mean Squared Error for LassoCV: ", lasso_mse),
):
    print(label, value)

Optimal alpha for RidgeCV:  0.02523539170434766
Mean Squared Error for RidgeCV:  24.29287565098284
Optimal alpha for LassoCV:  1e-06
Mean Squared Error for LassoCV:  24.29111675373595
