In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

#load data set

df = pd.read_csv('USA_Housing.csv')


# a) Divide the dataset into input features (all columns except price) and output variable (price)
X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1) 

# b) Scale the values of input features.  
scaler = StandardScaler()     
X_scaled = scaler.fit_transform(X)

# c) Divide input and output features into five folds. 
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf  
r2_scores = []


# d) Run five iterations, in each iteration consider one-fold as test set and remaining 

for fold, (train_idx, test_idx) in enumerate(kf.split(X_scaled)):     
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Add bias column of ones for intercept
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Compute beta using Least Squares: Œ≤ = (X·µÄX)^(-1) X·µÄy
    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ (X_train_bias.T @ y_train)
     # Predictions
    y_pred = X_test_bias @ beta

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"Fold {fold+1}: R2 Score = {r2:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nAverage R2 Score across 5 folds:", np.mean(r2_scores))
print("Best R2 Score:", best_r2)

# e) Use the best value of (ùõΩ) matrix (for which R2_score is maximum), to train the 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

y_train_pred = X_train_bias @ best_beta
y_test_pred = X_test_bias @ best_beta

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Model Performance using Best Beta:")
print(f"Train R2 Score: {train_r2:.4f}")
print(f"Test R2 Score: {test_r2:.4f}")




Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244

Average R2 Score across 5 folds: 0.9175745431092714
Best R2 Score: 0.9243869413350316

Final Model Performance using Best Beta:
Train R2 Score: 0.9193
Test R2 Score: 0.9147


In [2]:
# ques 2:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# --- Step 1: Load Dataset ---
df = pd.read_csv("USA_Housing.csv")

# --- Step 2: Divide into input features and output variable ---
X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)

# --- Step 3: Scale input features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Step 4: Split into Train (56%), Validation (14%), Test (30%) ---
X_train_full, X_test, y_train_full, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)  # 0.2 of 70% = 14%

print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

# --- Step 5: Add bias term (column of 1s) ---
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# --- Step 6: Define Gradient Descent Function ---
def gradient_descent(X, y, alpha, iterations):
    m, n = X.shape
    beta = np.zeros((n, 1))
    for i in range(iterations):
        gradient = (1/m) * X.T.dot(X.dot(beta) - y)
        beta -= alpha * gradient
    return beta

# --- Step 7: Try different learning rates ---
learning_rates = [0.001, 0.01, 0.1, 1]
iterations = 1000

results = []

for lr in learning_rates:
    beta = gradient_descent(X_train_b, y_train, lr, iterations)
    
    # Predictions
    y_val_pred = X_val_b.dot(beta)
    y_test_pred = X_test_b.dot(beta)
    
    # R¬≤ scores
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
    
    results.append((lr, beta, r2_val, r2_test))

# --- Step 8: Display results ---
for lr, beta, r2_val, r2_test in results:
    print(f"\nLearning Rate: {lr}")
    print(f"Validation R¬≤: {r2_val:.4f}")
    print(f"Test R¬≤: {r2_test:.4f}")

# --- Step 9: Find best learning rate (based on validation R¬≤) ---
best = max(results, key=lambda x: x[2])
best_lr, best_beta, best_val_r2, best_test_r2 = best

print("\n--- Best Model ---")
print(f"Best Learning Rate: {best_lr}")
print(f"Best Validation R¬≤: {best_val_r2:.4f}")
print(f"Test R¬≤ with Best LR: {best_test_r2:.4f}")



Train shape: (2800, 5), Validation shape: (700, 5), Test shape: (1500, 5)

Learning Rate: 0.001
Validation R¬≤: -0.8125
Test R¬≤: -0.9914

Learning Rate: 0.01
Validation R¬≤: 0.9098
Test R¬≤: 0.9147

Learning Rate: 0.1
Validation R¬≤: 0.9098
Test R¬≤: 0.9148

Learning Rate: 1
Validation R¬≤: 0.9098
Test R¬≤: 0.9148

--- Best Model ---
Best Learning Rate: 0.01
Best Validation R¬≤: 0.9098
Test R¬≤ with Best LR: 0.9147


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score , mean_squared_error
from sklearn.decomposition import PCA

columns=["symboling", "normalized_losses","make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",  
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",  
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",  
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df= pd.read_csv(url,names=columns)

# replace ? by nan
df=df.mask(df=="?", np.nan)
df.head()



Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
