<a href="https://colab.research.google.com/github/palak0626/uml501-ml-/blob/main/assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split

# q1

In [2]:
from google.colab import files
uploaded = files.upload()

Saving USA_Housing.csv to USA_Housing (1).csv


In [3]:
data = pd.read_csv("USA_Housing.csv")

In [4]:
X = data.drop("Price", axis=1).values
y = data["Price"].values.reshape(-1, 1)

In [44]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
betas = []
r2_scores = []

In [9]:
fold = 1
for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    X_train_bias = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]


    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train


    y_pred = X_test_bias @ beta


    r2 = r2_score(y_test, y_pred)

    print(f"Fold {fold}: R² = {r2:.4f}")
    fold += 1

    betas.append(beta)
    r2_scores.append(r2)

Fold 1: R² = 0.9180
Fold 2: R² = 0.9146
Fold 3: R² = 0.9116
Fold 4: R² = 0.9193
Fold 5: R² = 0.9244


In [40]:
best_beta = betas[np.argmax(r2_scores)]
print("\nBest β (coefficients):\n", best_beta.flatten())


Best β (coefficients):
 [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [42]:
X_train_bias = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]

In [43]:
y_pred_final = X_test_bias @ best_beta
r2_final = r2_score(y_test, y_pred_final)

print("\nFinal R² Score on 30% Test Data: ", round(r2_final, 4))


Final R² Score on 30% Test Data:  0.9147


# Q2

In [16]:
from google.colab import files
uploaded = files.upload()

Saving USA_Housing.csv to USA_Housing (2).csv


In [17]:
data = pd.read_csv("USA_Housing.csv")

In [18]:
X = data.drop("Price", axis=1).values
y = data["Price"].values.reshape(-1, 1)

In [19]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [20]:
def add_bias(Xm):
    return np.c_[np.ones((Xm.shape[0], 1)), Xm]

In [21]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X_scaled, y, test_size=0.30, random_state=42
)

In [22]:
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.20, random_state=42
)

In [23]:
X_train_b = add_bias(X_train)
X_val_b   = add_bias(X_val)
X_test_b  = add_bias(X_test)

In [24]:
n_features_plus_bias = X_train_b.shape[1]

In [25]:
def gradient_descent(Xb, y, lr=0.01, n_iter=1000, beta_init=None):
    """
    Batch Gradient Descent for Linear Regression.
    Xb: [m x (n+1)] with bias column
    y:  [m x 1]
    """
    m = Xb.shape[0]
    if beta_init is None:
        beta = np.zeros((Xb.shape[1], 1))
    else:
        beta = beta_init.copy()

    for _ in range(n_iter):
        y_hat = Xb @ beta
        grad = (1.0/m) * (Xb.T @ (y_hat - y))   # d/dβ of MSE (without the 1/2 factor)
        beta -= lr * grad

    return beta

In [32]:
def metrics(Xb, y, beta):
    y_hat = Xb @ beta
    r2 = r2_score(y, y_hat)
    mse = mean_squared_error(y, y_hat)
    return r2, mse, y_hat

In [27]:
learning_rates = [0.001, 0.01, 0.1, 1.0]
results = []

In [28]:
betas_by_lr = {}

In [29]:
for lr in learning_rates:
    beta = gradient_descent(X_train_b, y_train, lr=lr, n_iter=1000, beta_init=None)
    betas_by_lr[lr] = beta

In [33]:
r2_train, mse_train, _ = metrics(X_train_b, y_train, beta)
r2_val,   mse_val,   _ = metrics(X_val_b,   y_val,   beta)

In [34]:
results.append({
        "learning_rate": lr,
        "r2_train": r2_train,
        "mse_train": mse_train,
        "r2_val": r2_val,
        "mse_val": mse_val
})

In [35]:
res_df = pd.DataFrame(results).sort_values(by="r2_val", ascending=False)
print("Validation results after 1000 iterations for each learning rate:")
print(res_df.to_string(index=False))

Validation results after 1000 iterations for each learning rate:
 learning_rate  r2_train    mse_train  r2_val      mse_val
           1.0  0.921499 1.007178e+10  0.9098 1.116613e+10


In [36]:
best_lr = res_df.iloc[0]["learning_rate"]
best_beta = betas_by_lr[best_lr]
print(f"\nBest learning rate based on validation R²: {best_lr}")


Best learning rate based on validation R²: 1.0


In [37]:
r2_test, mse_test, yhat_test = metrics(X_test_b, y_test, best_beta)

In [38]:
print("\nBest β (coefficients) including intercept:")
print(best_beta.flatten())


Best β (coefficients) including intercept:
[1232618.32011841  230067.9889464   163710.33259401  121681.42752284
    2832.15066521  150657.52262836]


In [39]:
print(f"\nTest R²: {r2_test:.6f}")
print(f"Test MSE: {mse_test:.6f}")


Test R²: 0.914757
Test MSE: 10059552844.352884


# Q3

In [45]:
cols = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors",
        "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
        "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system",
        "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
        "highway_mpg", "price"]


url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=cols)


df = df.replace("?", np.nan)


In [46]:
numeric_cols = ["normalized_losses", "wheel_base", "length", "width", "height",
                "curb_weight", "engine_size", "bore", "stroke", "compression_ratio",
                "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

df[numeric_cols] = df[numeric_cols].astype(float)


for col in numeric_cols:
    if col != "price":
        df[col].fillna(df[col].mean(), inplace=True)


for col in df.columns:
    if df[col].dtype == "object":
        df[col].fillna(df[col].mode()[0], inplace=True)


df = df.dropna(subset=["price"])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [47]:
from sklearn.preprocessing import LabelEncoder


word_to_num = {
    "two": 2, "four": 4, "six": 6, "eight": 8, "twelve": 12, "three": 3, "five": 5
}
df["num_doors"] = df["num_doors"].replace(word_to_num).astype(int)
df["num_cylinders"] = df["num_cylinders"].replace(word_to_num).astype(int)


df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)


for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in x else 0)


df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in x else 0)


  df["num_doors"] = df["num_doors"].replace(word_to_num).astype(int)
  df["num_cylinders"] = df["num_cylinders"].replace(word_to_num).astype(int)


In [48]:
from sklearn.preprocessing import StandardScaler


X = df.drop("price", axis=1)
y = df["price"]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)


print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R2 Score: 0.873309037488395
MSE: 12009246.419712253


In [50]:
from sklearn.decomposition import PCA


pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)


Xp_train, Xp_test, yp_train, yp_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)


lr_pca = LinearRegression()
lr_pca.fit(Xp_train, yp_train)

yp_pred = lr_pca.predict(Xp_test)


print("R2 Score with PCA:", r2_score(yp_test, yp_pred))
print("MSE with PCA:", mean_squared_error(yp_test, yp_pred))


R2 Score with PCA: 0.861171292726016
MSE with PCA: 13159803.37295688
