# Real Estate Price Prediction - Exploratory Analysis & Modeling

This notebook documents the data exploration, preprocessing, and model training process behind the Streamlit app (`app.py`). The goal is to estimate house prices based on bedroom count, bathroom count, square footage, and location (state).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import streamlit as st

In [2]:
df = pd.read_csv('cleaned_df.csv')
df.head()

Unnamed: 0,State,City,Street,Zipcode,Bedroom,Bathroom,Area,PPSq,LotArea,MarketEstimate,RentEstimate,Latitude,Longitude,ListedPrice
0,AL,Saraland,Scott Dr,36571.0,4.0,2.0,1614.0,148.636927,0.3805,240600.0,1599.0,30.819534,-88.09596,239900.0
1,AL,Robertsdale,Cowpen Creek Rd,36567.0,3.0,2.0,1800.0,144.388889,3.2,,,30.590004,-87.580376,259900.0
2,AL,Gulf Shores,Spinnaker Dr #201,36542.0,2.0,2.0,1250.0,274.0,,,,30.284956,-87.74792,342500.0
3,AL,Chelsea,Mallet Way,35043.0,3.0,3.0,2224.0,150.629496,0.26,336200.0,1932.0,33.357986,-86.6087,335000.0
4,AL,Huntsville,Turtlebrook Ct,35811.0,3.0,2.0,1225.0,204.081633,,222700.0,1679.0,34.775517,-86.4407,250000.0


##  Preprocessing Pipeline

In [3]:
X = df[['Bedroom', 'Bathroom', 'Area', 'State']]
y = df['ListedPrice']

numeric_features = ['Bedroom', 'Bathroom', 'Area']
categorical_features = ['State']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

##  Model Training

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=20, random_state=42))
])

model.fit(X_train, y_train)

## Save Model for Streamlit App


In [5]:
import joblib
joblib.dump(model, "rf_model.joblib")

['rf_model.joblib']