<a href="https://colab.research.google.com/github/payal15604/ML_Assignments/blob/main/Assignment5_MLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import statsmodels.api as sm

url = 'https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX'
data = pd.read_csv(url)

X = data.drop(columns=['Price'])
y = data['Price']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_r2 = -np.inf
best_beta = None

# Cross-validation loop
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model
    X_train_sm = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_sm).fit()

    # Get predictions and R2 score
    X_test_sm = sm.add_constant(X_test)
    y_pred = model.predict(X_test_sm)
    r2 = r2_score(y_test, y_pred)

    print(f'R² Score for this fold: {r2:.4f}')

    # Update best model if current is better
    if r2 > best_r2:
        best_r2 = r2
        best_beta = model.params

print(f'\nBest R² Score: {best_r2:.4f}')
print(f'Best Beta Coefficients: {best_beta}')

# Train on 70% of the data and test on 30%
train_size = int(0.7 * len(data))
X_train_final = X_scaled[:train_size]
y_train_final = y.iloc[:train_size]
X_test_final = X_scaled[train_size:]
y_test_final = y.iloc[train_size:]

# Final model fitting
X_train_final_sm = sm.add_constant(X_train_final)
final_model = sm.OLS(y_train_final, X_train_final_sm).fit()

# Predictions on the test set
X_test_final_sm = sm.add_constant(X_test_final)
y_pred_final = final_model.predict(X_test_final_sm)

# Final R2 score
final_r2 = r2_score(y_test_final, y_pred_final)
print(f'Final R² Score on test set: {final_r2:.4f}')

R² Score for this fold: 0.9180
R² Score for this fold: 0.9146
R² Score for this fold: 0.9116
R² Score for this fold: 0.9193
R² Score for this fold: 0.9244

Best R² Score: 0.9244
Best Beta Coefficients: const    1.231617e+06
x1       2.302251e+05
x2       1.639568e+05
x3       1.211151e+05
x4       7.834672e+02
x5       1.506624e+05
dtype: float64
Final R² Score on test set: 0.9176


In [3]:
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05
...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

url = 'https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX'
data = pd.read_csv(url)

X = data.drop(columns=['Price'])
y = data['Price']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

train_size = int(0.56 * len(data))
val_size = int(0.14 * len(data))

X_train = X_scaled[:train_size]
y_train = y.iloc[:train_size]
X_val = X_scaled[train_size:train_size + val_size]
y_val = y.iloc[train_size:train_size + val_size]
X_test = X_scaled[train_size + val_size:]
y_test = y.iloc[train_size + val_size:]

def gradient_descent(X, y, learning_rate, iterations):
    m, n = X.shape
    beta = np.zeros(n)
    for _ in range(iterations):
        predictions = X.dot(beta)
        errors = predictions - y
        gradient = (1/m) * X.T.dot(errors)
        beta -= learning_rate * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
iterations = 1000

best_r2_val = -np.inf
best_r2_test = -np.inf
best_beta = None

for lr in learning_rates:
    X_train_sm = np.c_[np.ones(X_train.shape[0]), X_train]
    X_val_sm = np.c_[np.ones(X_val.shape[0]), X_val]
    X_test_sm = np.c_[np.ones(X_test.shape[0]), X_test]

    beta = gradient_descent(X_train_sm, y_train, lr, iterations)

    y_val_pred = X_val_sm.dot(beta)
    r2_val = r2_score(y_val, y_val_pred)

    y_test_pred = X_test_sm.dot(beta)
    r2_test = r2_score(y_test, y_test_pred)

    print(f'Learning Rate: {lr}, Validation R²: {r2_val:.4f}, Test R²: {r2_test:.4f}')

    if r2_val > best_r2_val:
        best_r2_val = r2_val
        best_r2_test = r2_test
        best_beta = beta

print(f'\nBest Validation R²: {best_r2_val:.4f}')
print(f'Best Test R²: {best_r2_test:.4f}')
print(f'Best Regression Coefficients: {best_beta}')

Learning Rate: 0.001, Validation R²: -0.9353, Test R²: -0.8082
Learning Rate: 0.01, Validation R²: 0.9151, Test R²: 0.9175
Learning Rate: 0.1, Validation R²: 0.9151, Test R²: 0.9175
Learning Rate: 1, Validation R²: 0.9151, Test R²: 0.9175

Best Validation R²: 0.9151
Best Test R²: 0.9175
Best Regression Coefficients: [ 1.23244775e+06  2.31682635e+05  1.63635272e+05  1.19025219e+05
 -2.74956842e+02  1.50705906e+05]
