In [8]:
# 🧠 Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Q1. Multiple linear regression on every feature except "Address".
# Prints the fitted intercept/coefficients and scores the model on a held-out
# split with two error metrics (MAE, RMSE) plus R².

# Read the housing data from a CSV given by relative path.
data = pd.read_csv("USA_Housing.csv")

# "Price" is the target; "Address" is non-numeric, so neither is a predictor.
y = data['Price']
X = data.drop(columns=['Address', 'Price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the linear model on the training rows (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Learned parameters.
print("Intercept (Œ≤0):", model.intercept_)
print("Coefficients (Œ≤1, Œ≤2, ... Œ≤k):", model.coef_)

# Score the held-out rows.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# One print call, newline-separated, so the report layout is unchanged.
print(
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R¬≤ Score: {r2:.4f}",
    sep="\n",
)

Intercept (Œ≤0): -2635072.900915729
Coefficients (Œ≤1, Œ≤2, ... Œ≤k): [2.16522058e+01 1.64666481e+05 1.19624012e+05 2.44037761e+03
 1.52703134e+01]
MAE: 80879.10
RMSE: 100444.06
R¬≤ Score: 0.9180


In [3]:
# Q2. Multiple linear regression on any three features ("Address" excluded).
# Chosen here: Avg. Area Income, Avg. Area House Age, Avg. Area Number of Rooms.

# Predictor subset (three columns) and the price target.
X3 = data.loc[:, ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms']]
y = data['Price']

# Same 80/20 split settings and seed as the all-features fit.
X_train, X_test, y_train, y_test = train_test_split(
    X3, y, random_state=42, test_size=0.2
)

# Fit the linear model on the training rows (fit() returns the estimator itself).
model3 = LinearRegression().fit(X_train, y_train)

# Learned parameters.
print("Intercept (Œ≤0):", model3.intercept_)
print("Coefficients (Œ≤1, Œ≤2, Œ≤3):", model3.coef_)

# Score the held-out rows.
y_pred3 = model3.predict(X_test)
r2_3 = r2_score(y_test, y_pred3)
rmse3 = np.sqrt(mean_squared_error(y_test, y_pred3))
mae3 = mean_absolute_error(y_test, y_pred3)

# One print call, newline-separated, so the report layout is unchanged.
print(
    f"MAE: {mae3:.2f}",
    f"RMSE: {rmse3:.2f}",
    f"R¬≤ Score: {r2_3:.4f}",
    sep="\n",
)

Intercept (Œ≤0): -2041424.2923870739
Coefficients (Œ≤1, Œ≤2, Œ≤3): [2.13519835e+01 1.62314773e+05 1.19909479e+05]
MAE: 139505.40
RMSE: 176549.38
R¬≤ Score: 0.7467


In [4]:
# Q3. Try 200 different train/test splits and report the random_state that
# gives the highest test R² (done separately for both feature sets).
# (a) All features

# R² has no lower bound, so a sentinel of -1 could beat every real score and
# leave "state 0, R² = -1" reported even though neither was measured.
# Negative infinity guarantees the first evaluated split is always recorded.
best_r2 = float("-inf")
best_state = None

for state in range(200):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    model = LinearRegression()
    model.fit(X_train, y_train)
    r2 = r2_score(y_test, model.predict(X_test))
    # Strict ">" keeps the earliest seed when two splits tie.
    if r2 > best_r2:
        best_r2 = r2
        best_state = state

# Caution: X_train/X_test/y_train/y_test, model and r2 now hold the values
# from the final iteration (state 199), not from the best split.
print(f"Best random_state for all features: {best_state}, R¬≤ = {best_r2:.4f}")

Best random_state for all features: 167, R¬≤ = 0.9305


In [5]:
# (b) 3 selected features
# Same search as the all-features case, using the three-column matrix X3.

# R² has no lower bound, so a sentinel of -1 could beat every real score and
# leave "state 0, R² = -1" reported even though neither was measured.
# Negative infinity guarantees the first evaluated split is always recorded.
best_r2_3 = float("-inf")
best_state_3 = None

for state in range(200):
    X_train, X_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=state)
    model3 = LinearRegression()
    model3.fit(X_train, y_train)
    r2 = r2_score(y_test, model3.predict(X_test))
    # Strict ">" keeps the earliest seed when two splits tie.
    if r2 > best_r2_3:
        best_r2_3 = r2
        best_state_3 = state

# Caution: X_train/X_test/y_train/y_test, model3 and r2 now hold the values
# from the final iteration (state 199), not from the best split.
print(f"Best random_state for 3 features: {best_state_3}, R¬≤ = {best_r2_3:.4f}")

Best random_state for 3 features: 6, R¬≤ = 0.7683


In [6]:
# Q4. Normalize (standardize) the features before fitting the model so that
# all features are on the same scale.
# (a) For all features

# Split the raw features first: fitting the scaler on the full dataset would
# let test-row statistics leak into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn mean/std from the training rows only, then apply that same transform
# to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted transform; the name is retained
# because this cell has always defined it.
X_scaled = scaler.transform(X)

model_norm = LinearRegression()
model_norm.fit(X_train, y_train)

y_pred_norm = model_norm.predict(X_test)

# A least-squares fit with an intercept predicts the same values under any
# per-feature rescaling, so these metrics are expected to match the unscaled
# all-features fit up to floating-point rounding.
r2_norm = r2_score(y_test, y_pred_norm)
mae_norm = mean_absolute_error(y_test, y_pred_norm)
rmse_norm = np.sqrt(mean_squared_error(y_test, y_pred_norm))

print("After Normalization (All features):")
print(f"MAE: {mae_norm:.2f}, RMSE: {rmse_norm:.2f}, R¬≤: {r2_norm:.4f}")

After Normalization (All features):
MAE: 80879.10, RMSE: 100444.06, R¬≤: 0.9180


In [7]:
# (b) For 3 selected features
# Standardize the three-column matrix X3 and refit the linear model.

# Split the raw features first: fitting the scaler on the full dataset would
# let test-row statistics leak into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state=42)

# Learn mean/std from the training rows only, then apply that same transform
# to the held-out rows.
scaler3 = StandardScaler()
X_train = scaler3.fit_transform(X_train_raw)
X_test = scaler3.transform(X_test_raw)

# Full three-feature matrix under the train-fitted transform; the name is
# retained because this cell has always defined it.
X3_scaled = scaler3.transform(X3)

model3_norm = LinearRegression()
model3_norm.fit(X_train, y_train)

y_pred3_norm = model3_norm.predict(X_test)

# A least-squares fit with an intercept predicts the same values under any
# per-feature rescaling, so these metrics are expected to match the unscaled
# three-feature fit up to floating-point rounding.
r2_3_norm = r2_score(y_test, y_pred3_norm)
mae_3_norm = mean_absolute_error(y_test, y_pred3_norm)
rmse_3_norm = np.sqrt(mean_squared_error(y_test, y_pred3_norm))

print("After Normalization (3 features):")
print(f"MAE: {mae_3_norm:.2f}, RMSE: {rmse_3_norm:.2f}, R¬≤: {r2_3_norm:.4f}")

After Normalization (3 features):
MAE: 139505.40, RMSE: 176549.38, R¬≤: 0.7467
