# House Price Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Data Preprocessing

In [2]:
# Load the raw housing data and take a first look at it.
# NOTE(review): this is a machine-specific absolute path; point it at a
# project-relative location (e.g. a DATA_DIR constant) before sharing.
df = pd.read_csv("C:/Users/mahi/Documents/data.csv")
display(df.head())
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


(4600, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dt

In [3]:
# Drop the date and address columns so they are not used as model inputs.
# `df` is rebound in place, so without errors="ignore" a second run of this
# cell raised KeyError (the columns were already gone). Ignoring missing
# labels makes the cell safe to re-run.
COLUMNS_TO_DROP = ['date', 'street', 'city', 'statezip', 'country']
df = df.drop(columns=COLUMNS_TO_DROP, errors="ignore")
print("After dropping columns:")
display(df.head())

After dropping columns:


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992


In [4]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [5]:
# Split the frame into the regression target (price) and its predictors.
TARGET_COLUMN = 'price'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [6]:
# Names of the object-dtype columns, which are treated as categorical.
categorical_features = X.select_dtypes(include='object').columns

In [7]:
# One transformer per column family: one-hot encoding for categorical columns
# (categories unseen at fit time are ignored) and standard scaling for the rest.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

# Every column that is not object-dtype is treated as numerical.
numerical_features = X.select_dtypes(exclude='object').columns

In [8]:
# Route each column family to its transformer.
column_pipelines = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [9]:
# Fit the preprocessing statistics on the training rows only, then transform
# every row. Fitting on the full frame (as before) lets the held-out rows
# influence the scaler's mean/variance, i.e. leaks test data into training.
#
# Without stratification, train_test_split chooses rows from the sample count
# and random_state alone, so splitting row positions with the same
# test_size/random_state as the "Data Splitting" cell selects exactly the rows
# that end up in X_train there. Keep these two arguments in sync with it.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
preprocessor.fit(X.iloc[train_positions])
X_preprocessed = preprocessor.transform(X)

# Data Splitting

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Selection and Training

In [11]:
# Ordinary least-squares baseline fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

# Model Evaluation

In [12]:
# Predict prices for the held-out rows with the linear baseline.
y_pred = lr_model.predict(X=X_test)

In [13]:
# Score the linear baseline on the held-out rows: R^2 and mean squared error.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [14]:
# Report the linear-regression test metrics on one line.
lr_summary = f'Linear Regression - MSE: {mse}, R2: {r2}'
print(lr_summary)

Linear Regression - MSE: 987019289801.7692, R2: 0.032188231692223845


In [15]:
# Small feed-forward regressor: two ReLU hidden layers (64 and 32 units) and a
# single linear output unit for the predicted price.
#
# The input width is declared with an explicit Input object. Passing
# `input_dim` to the first Dense layer is deprecated in Keras 3 and produced
# the UserWarning previously captured under this cell.
nn_model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Train with mean-squared-error loss using the Adam optimizer.
nn_model.compile(
    loss='mse',
    optimizer='adam',
)

In [17]:
# Train for 100 epochs in mini-batches of 32, holding out 20% of the training
# data for validation.
nn_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
)

Epoch 1/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 450788130816.0000 - val_loss: 401504960512.0000
Epoch 2/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 465692393472.0000 - val_loss: 401444274176.0000
Epoch 3/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 444765929472.0000 - val_loss: 401233969152.0000
Epoch 4/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 446875435008.0000 - val_loss: 400782426112.0000
Epoch 5/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 442443268096.0000 - val_loss: 399983640576.0000
Epoch 6/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 433474535424.0000 - val_loss: 398786625536.0000
Epoch 7/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 462093877248.0000 - val_loss: 397124763648.0000
Epoch 8/100


<keras.src.callbacks.history.History at 0x25cd2640170>

In [18]:
# Predict prices for the held-out rows with the trained network.
y_pred_nn = nn_model.predict(x=X_test)

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


In [19]:
# Score the network on the held-out rows with the same two metrics used for
# the linear baseline, then report them.
r2_nn = r2_score(y_true=y_test, y_pred=y_pred_nn)
mse_nn = mean_squared_error(y_true=y_test, y_pred=y_pred_nn)
nn_summary = f'Neural Network - MSE: {mse_nn}, R2: {r2_nn}'
print(nn_summary)

Neural Network - MSE: 1018343830947.9628, R2: 0.0014732701191668696


In [20]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import joblib

# Persist the fitted scikit-learn objects (linear model and preprocessing
# pipeline) so the GUI cell can reload them from disk.
joblib.dump(lr_model, 'linear_regression_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

# Legacy HDF5 export of the network; kept so the .h5 artefact is still produced.
nn_model.save('house_price_nn_model.h5')

# Loss objects handed to load_model under the names 'mse' and
# 'mean_absolute_error' so they can be resolved when the HDF5 file is read back.
custom_objects = {
    'mse': tf.keras.losses.MeanSquaredError(),
    'mean_absolute_error': tf.keras.losses.MeanAbsoluteError()
}

# Read the HDF5 file back. If this fails, nn_model keeps pointing at the
# trained in-memory network, which is still a valid model to export below.
try:
    nn_model = load_model('house_price_nn_model.h5', custom_objects=custom_objects)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")

# Export in the native .keras format.
# * The previous `'nn_model' in locals()` guard was always true (nn_model is
#   already bound before this point), so its else-branch could never run; the
#   export is therefore done unconditionally, which is what actually happened.
# * The format is inferred from the ".keras" suffix; the explicit
#   `save_format` argument is deprecated in Keras 3 and has been removed.
try:
    nn_model.save('house_price_nn_model.keras')
    print("Model re-saved in the new .keras format.")
except Exception as e:
    print(f"Error saving model in the new format: {e}")




Model loaded successfully.
Model re-saved in the new .keras format.


# Creating a GUI with Tkinter

In [21]:
import tkinter as tk
from tkinter import messagebox
import numpy as np
import joblib
from tensorflow.keras.models import load_model

# Load the pre-trained models and preprocessor.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load .pkl files that this notebook itself produced.
preprocessor = joblib.load('preprocessor.pkl')
lr_model = joblib.load('linear_regression_model.pkl')
# The GUI only calls predict(), so the optimizer/loss state does not need to be
# restored. compile=False skips it (the warning previously captured under this
# cell points at variable restoration and is presumably the optimizer state).
nn_model = load_model('house_price_nn_model.keras', compile=False)

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Feature columns, in the order used during training; the GUI builds one
# input field per entry and passes the values on under these names.
column_names = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "sqft_above",
    "sqft_basement",
    "yr_built",
    "yr_renovated",
]

# Define the GUI
class HousePricePredictor(tk.Tk):
    """Tkinter window that collects house features and shows a predicted price.

    One text entry is created per name in the module-level ``column_names``
    list. Two buttons run the entered values through the module-level
    ``preprocessor`` and then through either ``lr_model`` or ``nn_model``;
    the formatted result is written to a label at the bottom of the window.
    """

    def __init__(self):
        """Build the entry fields, the two prediction buttons and the result label."""
        super().__init__()
        self.title("House Price Predictor")

        # Labels and input fields for each feature
        self.labels = column_names

        # Maps feature name -> its tk.Entry widget.
        self.entries = {}
        for label in self.labels:
            frame = tk.Frame(self)
            frame.pack(padx=10, pady=5, fill='x')
            lbl = tk.Label(frame, text=label.capitalize(), width=20, anchor='w')
            lbl.pack(side=tk.LEFT)
            entry = tk.Entry(frame)
            entry.pack(side=tk.RIGHT, expand=True, fill='x')
            self.entries[label] = entry

        # Buttons for prediction
        frame = tk.Frame(self)
        frame.pack(padx=10, pady=10, fill='x')
        self.predict_lr_btn = tk.Button(frame, text="Predict with LR", command=self.predict_lr)
        self.predict_lr_btn.pack(side=tk.LEFT, padx=5)
        self.predict_nn_btn = tk.Button(frame, text="Predict with NN", command=self.predict_nn)
        self.predict_nn_btn.pack(side=tk.RIGHT, padx=5)

        # Result label
        self.result_label = tk.Label(self, text="")
        self.result_label.pack(padx=10, pady=10)

    def get_input_features(self):
        """Read every entry field and return the values as a one-row DataFrame.

        Returns:
            pandas.DataFrame | None: a single row whose columns are
            ``self.labels`` in order, or ``None`` (after an error dialog has
            been shown) when a field is empty, non-numeric or not finite.
        """
        features = []
        for label in self.labels:
            raw_value = self.entries[label].get()
            try:
                value = float(raw_value)
            except ValueError as e:
                # Name the offending field so the user knows what to correct.
                messagebox.showerror("Input Error", f"Invalid input for '{label}': {e}")
                return None
            # float() also parses "nan" and "inf"; such values would flow
            # through the models and yield a meaningless price, so reject them.
            if not np.isfinite(value):
                messagebox.showerror(
                    "Input Error",
                    f"Invalid input for '{label}': value must be a finite number",
                )
                return None
            features.append(value)

        # Convert to DataFrame with column names
        return pd.DataFrame([features], columns=self.labels)

    def _show_prediction(self, model_tag, predict):
        """Preprocess the current inputs, call ``predict`` and display the price.

        Args:
            model_tag: short model name shown in the result text ("LR" / "NN").
            predict: callable taking the preprocessed feature matrix and
                returning an array whose first element is the predicted price.
        """
        df_features = self.get_input_features()
        if df_features is None:
            return

        try:
            preprocessed_features = preprocessor.transform(df_features)
            prediction = predict(preprocessed_features)
        except Exception as e:
            # Tkinter only prints an exception raised in a callback as a
            # traceback and shows nothing in the window; surface it instead.
            messagebox.showerror("Prediction Error", f"Could not compute a prediction: {e}")
            return

        # Works for both a 1-D result (LR) and an (n, 1) result (NN).
        price = float(np.ravel(prediction)[0])
        self.result_label.config(text=f"Predicted Price ({model_tag}): ${price:,.2f}")

    def predict_lr(self):
        """Predict with the linear-regression model and show the result."""
        self._show_prediction("LR", lambda features: lr_model.predict(features))

    def predict_nn(self):
        """Predict with the neural network and show the result."""
        # verbose=0 keeps Keras from printing a progress bar on every click.
        self._show_prediction("NN", lambda features: nn_model.predict(features, verbose=0))

# Run the GUI
# Build the window and enter Tk's event loop. mainloop() does not return until
# the window is closed, so in a notebook this cell stays busy for that time.
if __name__ == "__main__":
    app = HousePricePredictor()
    app.mainloop()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
