In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [36]:
import pandas as pd

import os

# Location of the housing CSV. The default is the original machine-specific
# path; set the HOUSING_CSV environment variable to load the file from
# somewhere else without editing this cell.
DATA_PATH = os.environ.get('HOUSING_CSV', r'C:\Users\Pakistan\Downloads\Housing.csv')
data = pd.read_csv(DATA_PATH)

# Quick structural overview: dimensions, column names, and a sample of rows.
print(data.shape)

print(data.columns.tolist())

print('first 10 dataset:\n')
print(data.head(10))


# Separate the regression target from the predictor columns.
target = 'price'
X = data.drop(columns=[target])
y = data[target]
print(target)

print('data description:\n')
print(data.describe())

print('data info:\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

(545, 13)
['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
first 10 dataset:

      price   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000   7420         4          2        3      yes        no       no   
1  12250000   8960         4          4        4      yes        no       no   
2  12250000   9960         3          2        2      yes        no      yes   
3  12215000   7500         4          2        2      yes        no      yes   
4  11410000   7420         4          1        2      yes       yes      yes   
5  10850000   7500         3          3        1      yes        no      yes   
6  10150000   8580         4          3        4      yes        no       no   
7  10150000  16200         5          3        2      yes        no       no   
8   9870000   8100         4          1        2      yes       yes      yes   

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Partition the feature columns by dtype so each group gets its own transformer.
num_col = X.select_dtypes(include='number').columns
cat_col = X.select_dtypes(include=['object', 'string']).columns

# Numeric columns are standardized; categorical columns are one-hot encoded
# with the first level of each column dropped.
numeric_step = ('num', StandardScaler(), num_col)
categorical_step = ('cat', OneHotEncoder(drop='first'), cat_col)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Baseline: preprocessing followed by ordinary least squares.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Show the first five predictions next to the first five actual prices.
print(y_pred[:5])
print(y_test.iloc[:5].values)


[5372312.6161464  7069241.00525806 3099290.78784024 4526446.61703974
 3281573.61192656]
[4060000 6650000 3710000 6440000 2800000]


I print the first few actual values to quickly compare them with predictions and verify that the model output is reasonable without overwhelming the console.

In [38]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the baseline's held-out predictions with all three regression metrics
# in one pass; the names MSE, MAE and R2 are bound in that order.
MSE, MAE, R2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

From my understanding, MAE measures the average absolute difference between the predicted and actual values.
MSE squares each error, so it penalizes large errors more heavily and is sensitive to outliers.
R² shows the proportion of the variance in the target variable that the model explains.

In [44]:
# Report the three test-set metrics, one per line, rounded to three decimals.
print(
    f"Mean Squared Error (MSE): {MSE:.3f}",
    f"Mean Absolute Error (MAE): {MAE:.3f}",
    f"R² Score: {R2:.3f}",
    sep="\n",
)


Mean Squared Error (MSE): 1523019469501.291
Mean Absolute Error (MAE): 920392.940
R² Score: 0.646


The model explains 64.6% of the variance in house prices on the test set.
Linear regression assumes a strictly linear relationship between the features and the price, and it is sensitive to outliers.

In [57]:
from sklearn.linear_model import Ridge

from sklearn.base import clone

# Ridge (L2-regularized) regression on the same preprocessing recipe.
# The pipeline gets its own unfitted copy of `preprocessor`: a Pipeline fits
# its steps in place, so passing the shared instance would refit the very
# transformer object that the baseline pipeline already holds.
ridge_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", Ridge(alpha=1.0))
    ]
)

ridge_model.fit(X_train, y_train)
ridge_pred = ridge_model.predict(X_test)

# Three-decimal formatting, matching how the other models' metrics are printed.
print(f"Ridge MAE: {mean_absolute_error(y_test, ridge_pred):.3f}")
print(f"Ridge MSE: {mean_squared_error(y_test, ridge_pred):.3f}")
print(f"Ridge R2: {r2_score(y_test, ridge_pred):.3f}")


Ridge MAE: 921133.602824988
Ridge MSE: 1527112104794.8528
Ridge R²: 0.6453847247915563


Ridge's R² (0.645) is marginally lower than, and essentially the same as, that of plain linear regression (0.646), so the added regularization causes at most very mild underfitting.
Ridge did not significantly improve performance, indicating that multicollinearity was not a major issue.

In [61]:
from sklearn.linear_model import Lasso

from sklearn.base import clone

# Lasso (L1-regularized) regression on the same preprocessing recipe.
# The pipeline gets its own unfitted copy of `preprocessor`: a Pipeline fits
# its steps in place, so passing the shared instance would refit the very
# transformer object that the other pipelines already hold.
# NOTE(review): the target is in the millions, so alpha=1.0 is likely a very
# weak penalty here; consider tuning alpha before drawing conclusions.
lasso_model = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("regressor", Lasso(alpha=1.0, max_iter=1000))
    ]
)

lasso_model.fit(X_train, y_train)
lasso_pred = lasso_model.predict(X_test)

# Labels use "name: value"; the stray commas left over from the
# print("label:", value) form have been removed.
print(f'Lasso MAE: {mean_absolute_error(y_test, lasso_pred):.3f}')
print(f'Lasso MSE: {mean_squared_error(y_test, lasso_pred):.3f}')
print(f'Lasso R2: {r2_score(y_test, lasso_pred):.3f}')

Lasso MAE: 920393.041
Lasso MSE:,1523021569941.372
Lasso R2:, 0.646


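Lasso's metrics are practically identical to the baseline's, which suggests that no meaningful feature elimination occurred and that most features contribute to the prediction. Lasso therefore offers no advantage over the baseline here, and sparse feature selection was unnecessary. (Note that `alpha=1.0` is a very weak penalty relative to prices in the millions, so a larger `alpha` would be needed to see real shrinkage.)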

In [63]:
from sklearn.preprocessing import PolynomialFeatures

# Preprocessing for the polynomial model: numeric columns are standardized and
# then expanded with degree-2 polynomial terms; categorical columns are one-hot
# encoded as in the baseline. include_bias=False leaves out the constant
# column, since LinearRegression fits its own intercept by default.
poly_preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("scaler", StandardScaler()),
            ("poly", PolynomialFeatures(degree=2, include_bias=False))
        ]), num_col),
        ("cat", OneHotEncoder(drop="first"), cat_col)
    ]
)

poly_model = Pipeline(
    steps=[
        ("preprocessor", poly_preprocessor),
        ("regressor", LinearRegression())
    ]
)

poly_model.fit(X_train, y_train)
poly_pred = poly_model.predict(X_test)

# Labels use "name: value"; the stray commas left over from the
# print("label:", value) form have been removed.
print(f"Polynomial MAE: {mean_absolute_error(y_test, poly_pred):.3f}")
print(f"Polynomial MSE: {mean_squared_error(y_test, poly_pred):.3f}")
print(f"Polynomial R²: {r2_score(y_test, poly_pred):.3f}")


Polynomial MAE:, 924978.767
Polynomial MSE:, 1599478343411.180
Polynomial R²:, 0.629


Polynomial regression increased the model's complexity without improving predictive power: test R² dropped from 0.646 to 0.629, which points to overfitting.

# Ranking (best to worst)

1  Linear regression/lasso - Best balance in performance and simplicity

2    Ridge regression      -  No benefit over the baseline; only adds slight regularization


3    Polynomial Regression        - Overfitting, reduced generalization

Linear regression performed best overall, explaining approximately 65% of the variance, while polynomial regression underperformed due to increased complexity and overfitting. For further improvement, ensemble methods would be a good next step.