### **Import Libraries and Load Dataset**

In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR

In [2]:
# Load the dataset.
# The path is written as a raw string because it mixes '/' and '\' separators:
# without the r-prefix, '\S', '\M', '\d' and '\H' are invalid escape sequences
# (a SyntaxWarning on recent Python versions). The resulting string is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a location
# relative to the notebook so the notebook runs on other machines.
path = r'C:/Users/PRATIK PAL/Desktop\Self Study\ML\datasets\Housing.csv'
df = pd.read_csv(path)

# Display the first few rows
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### **EDA and Preprocessing**

In [3]:
# Summarize the columns: dtype and non-null count for each
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [4]:
# Count the missing values in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [5]:
# Keep an untouched copy of the loaded data; `df` itself is modified in later cells
df_org = df.copy()

In [6]:
# Encode categorical variables as integer codes.
# A separate encoder is fitted for every column and kept in `label_encoders`,
# so the same text -> code mapping can be reapplied to new inputs later
# (a single shared encoder would only remember the last column it was fitted on).
# The text values are read from the untouched copy `df_org`, which makes this
# cell safe to re-run: `df` no longer holds object columns after the first run.
label_encoders = {}
for column in df_org.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df_org[column])
    label_encoders[column] = le

In [7]:
# Separate the target (price) from the predictor columns
y = df['price']
X = df.drop(columns=['price'])

In [8]:
# Hold out 20% of the rows for testing; the seed is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Standardize the features: fit the scaler on the training split only,
# then apply that same fitted scaler to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### **Model Training and Hyperparameter Tuning**

In [10]:
# Candidate regressors, keyed by the name that is also used in `params` below.
models = {
    # Fixed seed so the forest, and therefore the grid-search outcome and the
    # reported MSE, are the same on every run.
    'RandomForest': RandomForestRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
}

# Hyperparameter grid for each model; the keys must match those of `models`.
params = {
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [10, 20]},
    # Empty grid: the model is only cross-validated with its default settings.
    'LinearRegression': {},
    'SVR': {'C': [1, 10], 'kernel': ['linear', 'rbf']},
}

In [11]:
# Grid-search every candidate with 5-fold cross-validation (scored by negative
# MSE) and keep the best estimator found for each model name
best_models = {}

for model_name, estimator in models.items():
    grid = GridSearchCV(
        estimator,
        params[model_name],
        cv=5,
        scoring='neg_mean_squared_error',
    )
    grid.fit(X_train, y_train)
    best_models[model_name] = grid.best_estimator_
    print(f"Best parameters for {model_name}: {grid.best_params_}")


Best parameters for RandomForest: {'max_depth': 10, 'n_estimators': 100}
Best parameters for LinearRegression: {}
Best parameters for SVR: {'C': 10, 'kernel': 'linear'}


In [12]:
# Report the mean squared error of each tuned model on the test split
for model_name, fitted_model in best_models.items():
    y_pred = fitted_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} MSE: {mse}")

RandomForest MSE: 2017750177913.0476
LinearRegression MSE: 1771751116594.0405
SVR MSE: 5542167205507.651


### **Save the Best Model**

In [13]:
# Choose the model to persist by its test-set MSE instead of hard-coding a name,
# so 'best_model.pkl' really holds the best-performing candidate.
test_mse = {
    name: mean_squared_error(y_test, fitted.predict(X_test))
    for name, fitted in best_models.items()
}
best_model_name = min(test_mse, key=test_mse.get)
model = best_models[best_model_name]
print(f"Saving {best_model_name} (test MSE: {test_mse[best_model_name]})")

# Save the model.
# NOTE(review): the model was trained on label-encoded, standardized features,
# so whatever loads this file must apply the same encoding and the fitted
# `scaler` to its inputs before calling predict — consider persisting those too.
with open('best_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [14]:
# Preview the original (un-encoded) data
df_org.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [17]:
# List the distinct values found in the parking column
df_org['parking'].unique()

array([2, 3, 0, 1], dtype=int64)