In [16]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [4]:
df= pd.read_csv("Model_Ready_Real_estate_data.csv")

In [6]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Transaction Type,Registration type,Area,Property Type,Property Sub Type,Amount,Property Size (sq.m),Bedrooms,Parking,Nearest Metro,Nearest Mall,Nearest Landmark,Project,Latitude_Project,Longitude_Project,Price per sq.m
0,0,Sales,Off-Plan,BUSINESS BAY,Unit,Flat,2631000.0,105.75,2,Yes,Business Bay Metro Station,Dubai Mall,Downtown Dubai,AYKON CITY 3,25.11072,55.38869,24879.432624


In [8]:
# Removing unnamed columns using drop function
df.drop(df.columns[df.columns.str.contains(
    'unnamed', case=False)], axis=1, inplace=True)

In [10]:
# Define numerical and categorical features
numerical_features = ['Property Size (sq.m)', 'Bedrooms', 'Latitude_Project', 'Longitude_Project', 'Price per sq.m']
categorical_features = ['Transaction Type', 'Registration type', 'Property Type', 'Property Sub Type', 
                        'Parking', 'Nearest Metro', 'Nearest Mall', 'Nearest Landmark', 'Project']

# Create the preprocessing pipeline for numerical and categorical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())                 # Standardize numerical values
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent
    ('onehot', OneHotEncoder(handle_unknown='ignore'))      # Apply one-hot encoding
])

# Combine the numerical and categorical pipelines into one full pipeline
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])


In [12]:
# Define the full pipeline that combines preprocessing and model
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [18]:
# Load your data (assuming data is in a pandas DataFrame)
# df = pd.read_csv('your_data.csv')

# Define features and target
X = df.drop('Amount', axis=1)  # Features (everything except Amount)
y = df['Amount']  # Target (Amount)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [22]:
# Predict on test set
y_pred = model_pipeline.predict(X_test)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 66218.49640295004


In [44]:
# Calculate R² score
r2 = r2_score(y_test, y_pred)
print(f'R² Score: {r2}')

R² Score: 0.7389059262195697


In [42]:
def predict_amount(user_input):
    # Example user input: a dictionary with the feature names as keys
    user_input_df = pd.DataFrame([user_input])

    # Make prediction using the trained model
    predicted_amount = model_pipeline.predict(user_input_df)
    return predicted_amount[0]

# Example user input (replace with actual user input)
user_input = {
    'Transaction Type': 'Sales',
    'Registration type': 'Off-Plan',
    'Area': 'PALM JUMEIRAH',
    'Property Type': 'Unit',
    'Property Sub Type': 'Flat',
    'Property Size (sq.m)': 125.75,
    'Bedrooms': 2,
    'Parking': 'Yes',
    'Nearest Metro': 'Palm Jumeirah',
    'Nearest Mall': 'Marina Mall',
    'Nearest Landmark': 'Burj Al Arab',
    'Project': 'Palm Beach Towers -3',
    'Latitude_Project': 25.11072,
    'Longitude_Project': 55.38869,
    'Price per sq.m': 37929.432624
}

# Get the predicted amount
predicted_amount = predict_amount(user_input)
print(f"Predicted Amount: {predicted_amount}")


Predicted Amount: 4762466.32


In [38]:
X_test.head()

Unnamed: 0,Transaction Type,Registration type,Area,Property Type,Property Sub Type,Property Size (sq.m),Bedrooms,Parking,Nearest Metro,Nearest Mall,Nearest Landmark,Project,Latitude_Project,Longitude_Project,Price per sq.m
2081,Sales,Off-Plan,PALM JUMEIRAH,Unit,Flat,125.99,2,Yes,Palm Jumeirah,Marina Mall,Burj Al Arab,Palm Beach Towers -3,25.12901,55.13413,37929.994444
25196,Sales,Ready,INTERNATIONAL CITY PH 2 & 3,Unit,Flat,73.98,1,Yes,Rashidiya Metro Station,City Centre Mirdif,,RITZ RESIDENCE,25.0595,55.21613,5001.351717
8781,Sales,Off-Plan,JUMEIRAH VILLAGE CIRCLE,Unit,Flat,73.65,1,Yes,Nakheel Metro Station,Marina Mall,Sports City Swimming Academy,HAMILTON HOUSE,25.04535,55.24504,12977.976918
44180,Gifts,Ready,PALM JUMEIRAH,Unit,Flat,210.52,3,Yes,Knowledge Village,Marina Mall,Burj Al Arab,TIARA RESIDENCE,25.11512,55.13978,12479.9924
1331,Sales,Off-Plan,Warsan First,Unit,Flat,58.73,1,Yes,Rashidiya Metro Station,City Centre Mirdif,Dubai International Airport,Olivz Residence,25.0595,55.21613,9024.348714


In [40]:
y_test

2081     4778800.0
25196     370000.0
8781      955828.0
44180    2627288.0
1331      530000.0
           ...    
5278      969000.0
27312    2100000.0
6062     1237888.0
5694     2267888.0
8910     2900000.0
Name: Amount, Length: 8949, dtype: float64

In [46]:
# Get feature importances from the trained RandomForestRegressor
importances = model_pipeline.named_steps['regressor'].feature_importances_

# Get the feature names (columns)
column_names = numerical_features + model_pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()

# Create a DataFrame to display feature importances
import pandas as pd
feature_importance_df = pd.DataFrame({
    'Feature': column_names,
    'Importance': importances
})

# Sort the features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                                      Feature    Importance
0                        Property Size (sq.m)  6.495898e-01
4                              Price per sq.m  3.268549e-01
2                            Latitude_Project  5.020266e-03
1                                    Bedrooms  4.062568e-03
293         Project_Bulgari Lighthouse Dubai   3.571698e-03
...                                       ...           ...
665                          Project_ME DO RE  1.093610e-13
1136  Project_The Royal Estate Plot 2 Phase B  3.413497e-14
800                    Project_PARK VILLA'S 6  0.000000e+00
579                       Project_LA VISTA 06  0.000000e+00
198                  Project_Azizi Riviera 27  0.000000e+00

[1213 rows x 2 columns]


In [58]:
%matplotlib inline

In [62]:
# import matplotlib.pyplot as plt

# # Plot feature importances
# plt.figure(figsize=(10, 6))
# plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
# plt.xlabel('Importance')
# plt.title('Feature Importance')
# plt.gca().invert_yaxis()  # Invert the y-axis to show the most important feature at the top
# plt.show()


In [66]:
import pandas as pd

# Function to get input from the user
def get_user_input():
    user_input = {}

    # Asking for input for each feature required by the model
    user_input['Transaction Type'] = input("Enter Transaction Type (e.g., Sales): ")
    user_input['Registration type'] = input("Enter Registration Type (e.g., Off-Plan): ")
    user_input['Area'] = input("Enter Area (e.g., BUSINESS BAY): ")
    user_input['Property Type'] = input("Enter Property Type (e.g., Flat): ")
    user_input['Property Sub Type'] = input("Enter Property Sub Type (e.g., Unit): ")
    
    user_input['Property Size (sq.m)'] = float(input("Enter Property Size (in sq.m): "))
    user_input['Bedrooms'] = int(input("Enter Number of Bedrooms: "))
    user_input['Parking'] = input("Is there parking? (Yes/No): ")
    
    user_input['Nearest Metro'] = input("Enter nearest metro station: ")
    user_input['Nearest Mall'] = input("Enter nearest mall: ")
    user_input['Nearest Landmark'] = input("Enter nearest landmark: ")
    
    user_input['Project'] = input("Enter Project Name: ")
    user_input['Latitude_Project'] = float(input("Enter Latitude of Project: "))
    user_input['Longitude_Project'] = float(input("Enter Longitude of Project: "))
    user_input['Price per sq.m'] = float(input("Enter Price per sq.m: "))

    return user_input

# Function to make predictions
def predict_amount(user_input):
    # Convert the user input into a DataFrame to match the model's input format
    user_input_df = pd.DataFrame([user_input])

    # Make prediction using the trained model pipeline
    predicted_amount = model_pipeline.predict(user_input_df)
    return predicted_amount[0]

# Get the input from the user
user_input = get_user_input()

# Get the predicted amount
predicted_amount = predict_amount(user_input)

# Display the predicted amount
print(f"Predicted Amount: {predicted_amount}")

Enter Transaction Type (e.g., Sales):  Sales
Enter Registration Type (e.g., Off-Plan):  Off-Plan
Enter Area (e.g., BUSINESS BAY):  BUSINESS BAY
Enter Property Type (e.g., Flat):  Flat
Enter Property Sub Type (e.g., Unit):  Unit
Enter Property Size (in sq.m):  105.75
Enter Number of Bedrooms:  2
Is there parking? (Yes/No):  Yes
Enter nearest metro station:  Business Bay Metro Station
Enter nearest mall:  Dubai Mall
Enter nearest landmark:  Downtown Dubai
Enter Project Name:  AYKON CITY 3
Enter Latitude of Project:  25
Enter Longitude of Project:  55
Enter Price per sq.m:  24879


Predicted Amount: 2632989.17


In [68]:
import pickle

# Assuming `model` is your trained model
with open('model_pipeline.pkl', 'wb') as file:
    pickle.dump(model, file)


NameError: name 'model' is not defined