## **1. Import & Load Data**

In [54]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [55]:
# Location of the raw housing CSV (looks like a Colab working-directory path;
# adjust here when running elsewhere).
DATA_PATH = "/content/Housing.csv"

# Read the dataset into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## **2. EDA**

In [56]:
# List the column labels to see which features are available.
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [57]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(545, 13)

In [58]:
# Count missing values per column to check whether imputation is needed.
df.isnull().sum()

Unnamed: 0,0
price,0
area,0
bedrooms,0
bathrooms,0
stories,0
mainroad,0
guestroom,0
basement,0
hotwaterheating,0
airconditioning,0


In [59]:
# Inspect per-column dtypes: object columns hold strings and need encoding
# before they can be fed to the regressors.
df.dtypes

Unnamed: 0,0
price,int64
area,int64
bedrooms,int64
bathrooms,int64
stories,int64
mainroad,object
guestroom,object
basement,object
hotwaterheating,object
airconditioning,object


## **4. Cleaning and Preprocessing**


In [60]:
# Remove the "furnishingstatus" column from the feature set.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
df = df.drop(columns=["furnishingstatus"], errors="ignore")

In [61]:
# Partition the frame by dtype: integer/float columns are already numeric,
# object (string) columns are the ones that must be encoded.
numerical = df.select_dtypes(include=["int64", "float64"])
categorical = df.select_dtypes(include=["object"])

In [62]:
from sklearn.preprocessing import LabelEncoder

In [63]:
# Create a LabelEncoder, which maps each distinct label to an integer code.
le = LabelEncoder()

In [64]:
# Encode every categorical (object-dtype) column as integer codes and keep
# the fitted encoder for each column so the same mapping can be re-applied
# (or inverted) at inference time.
#
# A *new* LabelEncoder is created per column. Re-fitting one shared instance
# would leave every dictionary entry pointing at the same object, which only
# remembers the classes of the last column it was fitted on.
label_encoders = {}

for col in categorical.columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder


In [65]:
# Preview the frame after encoding: the former yes/no columns now hold
# integer codes.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea
0,13300000,7420,4,2,3,1,0,0,0,1,2,1
1,12250000,8960,4,4,4,1,0,0,0,1,3,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1
4,11410000,7420,4,1,2,1,1,1,0,1,2,0


## **3. Split Data**

In [66]:
# Target vector: the sale price.
y = df["price"]

# Feature matrix: every remaining column except the target.
X = df.drop(columns=["price"])

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
# Hold out 33% of the rows for the final test; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## **4. Models & Hyperparameter Grid**

In [69]:
# Candidate regressors and the hyperparameter grid searched for each one.
# Each entry maps a display name to:
#   "model"  - an unfitted estimator instance
#   "params" - the param_grid handed to GridSearchCV (empty dict = defaults only)
#
# Both tree ensembles are seeded so repeated runs of the notebook produce the
# same cross-validation scores and the same selected model.
models = {
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}
    },
    "Ridge": {
        "model": Ridge(),
        "params": {
            "alpha": [0.1, 1, 10, 50]
        }
    },
    "Lasso": {
        "model": Lasso(),
        "params": {
            "alpha": [0.01, 0.1, 1, 10]
        }
    },
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20]
        }
    },
    "GradientBoosting": {
        # Seeded for reproducibility, consistent with RandomForest above.
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    }
}

In [70]:
# Run a 5-fold, R^2-scored grid search for every candidate on the training
# split and collect one summary record (name, best CV score, best params)
# per model.
results = []

for name, config in models.items():
    print(f"Training {name}...")

    grid = GridSearchCV(
        estimator=config["model"],
        param_grid=config["params"],
        scoring="r2",
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid.fit(X_train, y_train)

    results.append(
        dict(
            model=name,
            best_score=grid.best_score_,
            best_params=grid.best_params_,
        )
    )

Training LinearRegression...
Training Ridge...
Training Lasso...
Training RandomForest...
Training GradientBoosting...


## **5. Find The Best Model**

In [71]:
# Tabulate the search summaries, best cross-validated R^2 first.
results_df = pd.DataFrame(results).sort_values(by="best_score", ascending=False)
results_df

Unnamed: 0,model,best_score,best_params
1,Ridge,0.622129,{'alpha': 1}
0,LinearRegression,0.621548,{}
2,Lasso,0.621548,{'alpha': 0.01}
4,GradientBoosting,0.587339,"{'learning_rate': 0.05, 'max_depth': 3, 'n_est..."
3,RandomForest,0.580273,"{'max_depth': 10, 'n_estimators': 200}"


In [72]:
# results_df is sorted by descending CV score, so the first row is the winner.
best_model_name = results_df["model"].iloc[0]
print("Best model:", best_model_name)

Best model: Ridge


In [73]:
# Re-run the grid search for the winning model only and keep the estimator
# refit with the best hyperparameters.
best_entry = models[best_model_name]

best_grid = GridSearchCV(
    estimator=best_entry["model"],
    param_grid=best_entry["params"],
    scoring="r2",
    cv=5,
    n_jobs=-1,
)

# fit() returns the fitted search object itself, so the calls can be chained.
best_model = best_grid.fit(X_train, y_train).best_estimator_

## **6. Model Evaluation**

In [74]:
import numpy as np

# Score the selected model on the held-out test split.
y_pred = best_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Test R2:", r2)
print("Test RMSE:", rmse)

Test R2: 0.6485516018159451
Test RMSE: 1227276.9855436825


In [75]:
# Print one "<feature> : <weight>" line per training column.
# The winning model is chosen dynamically, so it is not guaranteed to be
# linear: linear models expose `coef_`, tree ensembles expose
# `feature_importances_` instead. Fall back accordingly rather than raising
# AttributeError when a non-linear model wins.
feature_weights = getattr(best_model, "coef_", None)
if feature_weights is None:
    feature_weights = getattr(best_model, "feature_importances_", None)

if feature_weights is None:
    print("Selected model exposes neither coef_ nor feature_importances_.")
else:
    for col, coef in zip(X_train.columns, feature_weights):
        print(col, ":", coef)

area : 254.24690112241467
bedrooms : 103299.0854975077
bathrooms : 1117369.04622731
stories : 416562.61853521335
mainroad : 447947.96176538186
guestroom : 336832.61438366945
basement : 517019.10392802046
hotwaterheating : 606684.1317257164
airconditioning : 687500.708256548
parking : 309565.13216175814
prefarea : 526225.777444107


## **7. Export as .pkl for deployment**

In [76]:
import joblib

In [77]:
# Persist the artifacts needed to serve predictions outside this notebook:
#   best_model.pkl      - the estimator selected and refit by the grid search
#   label_encoders.pkl  - the label_encoders mapping built during preprocessing
#   model_columns.pkl   - training feature names, in the order the model expects
# joblib.dump returns the list of written filenames; the last call's return
# value is what the cell displays.
joblib.dump(best_model, "best_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(X_train.columns.tolist(), "model_columns.pkl")

['model_columns.pkl']