<a href="https://colab.research.google.com/github/nitaisutradhar/SM-A-7/blob/main/Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/datasets/Assignment 7/Housing.csv")

In [6]:
# Shape (Rows and columns )
print(f"Shapes number of rows and columns {df.shape}")
print(f"Number of houses: {df.shape[0]}")

#First three houses
print("\nFirst 3 houses : \n")
print(df[0:3])

# Columns Name
print("\nColumns Name: \n", df.columns.to_list())

# Data types
print('\nData Types: \n',df.dtypes)

Shapes number of rows and columns (545, 13)
Number of houses: 545

First 3 houses : 

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  

Columns Name: 
 ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

Data Types: 
 price                int64
area                 int64
bedrooms             int64
bathrooms            int64
s

# Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

# for yes/no
label_encoder = LabelEncoder()

# Yes/No Categorical columns
# binary_columns = ['mainroad', 'guestroom', 'basement', 'airconditioning', 'prefarea', 'furnishingstatus']

# 2nd method conditional finding of categorical columns
binary_columns = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns: ", binary_columns)


# Apply LabelEncoder
for col in binary_columns:
  df[col] = label_encoder.fit_transform(df[col])

# print("Current values:\n", df.head())
print("After encoding values:\n", df.head())

Categorical Columns:  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
After encoding values:
       price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   f

# Train-Test Split

In [8]:
# Separate input(X) and output (y)
X = df.drop('price', axis = 1) # Everything except price
y = df['price'] # Only price

# Split: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size= 0.2,
    random_state= 32
)

print(f"Now we will train with: {len(X_train)} houses")
print(f"We will test with: {len(X_test)} houses")

Now we will train with: 436 houses
We will test with: 109 houses


# Why Scale?

In [9]:
# Problem: Some numbers are big, some are small
print("Before scaling different ranges: ")
print(f"Area: smallest = {X_train['area'].min()} , biggest= {X_train['area'].max()}")
print(f"Bedrooms: smallest = {X_train['bedrooms'].min()}, biggest= {X_train['bedrooms'].max()}")
print("This confuses the model!")

Before scaling different ranges: 
Area: smallest = 1650 , biggest= 16200
Bedrooms: smallest = 1, biggest= 6
This confuses the model!


# StandardScaler

In [15]:
# create scaler
scaler = StandardScaler()

# Learn from training data and scale it
X_train_scaled = scaler.fit_transform(X_train)

# Scale test data (just transform, don't fit)
X_test_scaled = scaler.transform(X_test)

print("\nAfter scaling: ")
print("All features now have similar range!")
print(f"Example - First house's area before scaling: {X_train.values[0][1]}")
print(f"Example - First house's area after scaling: {X_train_scaled[0][1]:.2f}")


After scaling: 
All features now have similar range!
Example - First house's area before scaling: 3
Example - First house's area after scaling: 0.02


# Random Forest Regressor

In [17]:
from sklearn.ensemble import RandomForestRegressor

# n_estimators build 100 individual trees then takes the average of all 100 predictions
# "random" selection is the same every time
model = RandomForestRegressor(n_estimators=100, random_state = 42)
model.fit(X_train_scaled, y_train)
print("Model Trained!")

# Check accuracy on training data
train_score = model.score(X_train_scaled, y_train)
print(f"Training accuracy: {train_score:.1%}")

# Check accuracy on test data
test_score = model.score(X_test_scaled, y_test)
print(f"Test accuracy: {test_score:.1%}")


Model Trained!
Training accuracy: 94.4%
Test accuracy: 74.5%


# Save Everything

In [18]:
import joblib

# Save the model
joblib.dump(model, 'my_model.pkl')
print("✅ Model saved as 'my_model.pkl'")

# Save the Scaler
joblib.dump(scaler, 'my_scaler.pkl')
print("✅ Scaler saved as 'my_scaler.pkl'")

# Save scaled data as CSV
pd.DataFrame(X_train_scaled).to_csv('train_data_scaled.csv', index = False)
print("✅ Scaled data saved as 'train_data_scaled.csv'")

✅ Model saved as 'my_model.pkl'
✅ Scaler saved as 'my_scaler.pkl'
✅ Scaled data saved as 'train_data_scaled.csv'


# Use Saved Model for New House Data

In [24]:
# Load saved model and scaler
model = joblib.load('my_model.pkl')
scaler = joblib.load('my_scaler.pkl')

# New house details:
# area=5000, bedrooms=3, bathrooms=2, stories=2,
# mainroad=yes, guestroom=no, basement=yes,
# hotwaterheating=no, airconditioning=yes,
# parking=2, prefarea=yes, furnishingstatus=furnished
print("\nNew House Information: ")
print("area=5000, bedrooms=3, bathrooms=2, stories=2,")
print("mainroad=yes, guestroom=no, basement=yes,")
print("hotwaterheating=no, airconditioning=yes,")
print("parking=2, prefarea=yes, furnishingstatus=furnished\n")

new_house = [[5000, 3, 2, 2, 1, 0, 1, 0, 1, 2, 1, 0]]

# Scale the new house data
new_house_scaled = scaler.transform(new_house)

# Make prediction
prediction = model.predict(new_house_scaled)

print(f"The predicted price is : {prediction[0]}")


New House Information: 
area=5000, bedrooms=3, bathrooms=2, stories=2,
mainroad=yes, guestroom=no, basement=yes,
hotwaterheating=no, airconditioning=yes,
parking=2, prefarea=yes, furnishingstatus=furnished

The predicted price is : 7639648.8


