<a href="https://colab.research.google.com/github/nitrogoose/MACHINE_LEARNING/blob/main/ASS3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/content/USA_Housing.csv'  # Replace with actual file path
data = pd.read_csv(file_path)

# a) Divide dataset into input features and output variable.
# Only numeric columns are kept as features: StandardScaler raises on
# columns it cannot convert to float.
# NOTE(review): presumably this CSV carries a free-text column (e.g. an
# address) -- verify against the actual file. If every column is already
# numeric, this selection is a no-op.
X = data.drop(columns=['Price']).select_dtypes(include=[np.number])
y = data['Price']

# b) Scale input features to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# c) Divide input and output features into 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The features are standardized (zero mean) while the target is used as-is,
# so a model without a bias term would be forced through the origin and
# could never reach the level of the target. Prepend a column of ones so
# that beta[0] is the intercept and beta[1:] are the feature weights.
X_design = np.column_stack([np.ones(len(X_scaled)), X_scaled])
y_values = y.to_numpy()

best_beta = None
max_r2_score = -np.inf

# d) 5 iterations for cross-validation
for train_index, test_index in kf.split(X_design):
    X_train, X_test = X_design[train_index], X_design[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]

    # Least-squares solution. pinv(X) equals pinv(X.T @ X) @ X.T but avoids
    # explicitly forming X.T @ X, which squares the condition number.
    beta = np.linalg.pinv(X_train) @ y_train
    y_pred = X_test @ beta
    r2 = r2_score(y_test, y_pred)

    # Keep the coefficients of the fold with the highest held-out R2
    if r2 > max_r2_score:
        max_r2_score = r2
        best_beta = beta

print("Best Beta Coefficients:", best_beta)
print("Maximum R2 Score:", max_r2_score)

# e) Train regressor on 70% of data and test on remaining 30%
# The split is positional (first 70% of rows train, last 30% test).
split = int(0.7 * len(X_scaled))
X_train, X_test = X_scaled[:split], X_scaled[split:]
y_train, y_test = y[:split], y[split:]

# Add a bias column: the features are standardized but the target is not
# centered, so the model needs an intercept (stored in beta_final[0]).
A_train = np.column_stack([np.ones(len(X_train)), X_train])
A_test = np.column_stack([np.ones(len(X_test)), X_test])

# Least-squares fit via the pseudo-inverse of the design matrix
beta_final = np.linalg.pinv(A_train) @ y_train.to_numpy()
y_pred_final = A_test @ beta_final
r2_final = r2_score(y_test, y_pred_final)

print("Final R2 Score on Test Set:", r2_final)


In [1]:
# Gradient-descent linear regression with a learning-rate search.
# Rows are split positionally into 56% train / 14% validation / 30% test.
# NOTE: this cell needs X_scaled and y from the preprocessing cell; running
# it first raises NameError.
learning_rates = [0.001, 0.01, 0.1, 1]

n_samples = len(X_scaled)
train_end = int(0.56 * n_samples)
val_size = int(0.14 * n_samples)

X_train, X_temp = X_scaled[:train_end], X_scaled[train_end:]
y_train, y_temp = y[:train_end], y[train_end:]

X_val, X_test = X_temp[:val_size], X_temp[val_size:]
y_val, y_test = y_temp[:val_size], y_temp[val_size:]

# Design matrices with a leading column of ones: the features are
# standardized but the target is not centered, so an intercept is required.
# best_beta[0] is the intercept, best_beta[1:] are the feature weights.
A_train = np.column_stack([np.ones(len(X_train)), X_train])
A_val = np.column_stack([np.ones(len(X_val)), X_val])
A_test = np.column_stack([np.ones(len(X_test)), X_test])
y_train_values = np.asarray(y_train, dtype=float)

best_beta = None
best_r2 = -np.inf

for lr in learning_rates:
    beta = np.zeros(A_train.shape[1])
    # Too large a step makes the iterates blow up; silence the overflow
    # warnings here and reject the run below instead of crashing.
    with np.errstate(over='ignore', invalid='ignore'):
        for _ in range(1000):
            residual = y_train_values - A_train @ beta
            # Gradient of the mean squared error with respect to beta
            gradient = -2 * A_train.T @ residual / len(y_train_values)
            beta -= lr * gradient
        y_val_pred = A_val @ beta

    # A diverged run yields inf/NaN, which r2_score refuses to evaluate
    if not (np.all(np.isfinite(beta)) and np.all(np.isfinite(y_val_pred))):
        continue

    r2_val = r2_score(y_val, y_val_pred)

    if r2_val > best_r2:
        best_r2 = r2_val
        best_beta = beta

if best_beta is None:
    raise RuntimeError("Gradient descent diverged for every learning rate")

y_test_pred = A_test @ best_beta
r2_test = r2_score(y_test, y_test_pred)

print("Best R2 on Validation Set:", best_r2)
print("Best R2 on Test Set:", r2_test)
print("Best Beta Coefficients:", best_beta)


NameError: name 'X_scaled' is not defined

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
           "highway_mpg", "price"]

# Treat '?' as missing while parsing. Replacing it after the read leaves the
# affected numeric columns as strings (object dtype), so the numeric-only
# mean imputation below would skip them and NaNs would reach the regressor.
data = pd.read_csv(url, names=columns, na_values='?')

# Drop rows without a target BEFORE imputing; otherwise missing prices are
# filled with the column mean and never dropped.
data.dropna(subset=['price'], inplace=True)
data.fillna(data.mean(numeric_only=True), inplace=True)

# Spelled-out counts -> integers. Values that are missing or not in the
# mapping become NaN and are imputed with the most frequent value.
door_words = {'two': 2, 'four': 4}
cylinder_words = {
    'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12
}
for count_col, words in (('num_doors', door_words), ('num_cylinders', cylinder_words)):
    numeric_counts = data[count_col].map(words)
    data[count_col] = numeric_counts.fillna(numeric_counts.mode().iloc[0])

# Binary flags: 1 when the category name contains the given substring
data['fuel_system'] = data['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x) else 0)
data['engine_type'] = data['engine_type'].apply(lambda x: 1 if 'ohc' in str(x) else 0)

# Integer-code these categorical columns (the encoder is refit per column)
encoder = LabelEncoder()
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    data[col] = encoder.fit_transform(data[col])

# One-hot encode the remaining categoricals, dropping the first level of each
data = pd.get_dummies(data, columns=['body_style', 'drive_wheels'], drop_first=True)

X = data.drop(columns=['price'])
y = data['price'].astype(float)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Compare a linear regressor on the full scaled feature set with one trained
# on the first 10 principal components. Both experiments hold out 30% of the
# rows with the same seed, so they are evaluated on the same rows.
# NOTE: the scaler and the PCA are fit on all rows, including the held-out ones.
split_options = {"test_size": 0.3, "random_state": 42}

# Baseline: all scaled features
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)
regressor = LinearRegression().fit(X_train, y_train)
r2_before_pca = regressor.score(X_test, y_test)

# Reduced model: project onto 10 principal components, then refit
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y, **split_options
)
regressor_pca = LinearRegression().fit(X_train_pca, y_train_pca)
r2_after_pca = regressor_pca.score(X_test_pca, y_test_pca)

print("R2 Score Before PCA:", r2_before_pca)
print("R2 Score After PCA:", r2_after_pca)
