In [2]:
# !pip install xgboost
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
path=r'C:/Users/NovaSoft\Downloads/Video/python-app--main/houses_data_v2.csv'
data = pd.read_csv(path)
print(data.shape)
print(data.head())



(23347, 10)
  House_Type  Size  Bedrooms  Bathrooms   Floor Furnished For_rent  \
0  Apartment   170         3          2       9        No       No   
1  Apartment   104         2          1       7        No       No   
2  Apartment   160         3          2       1        No       No   
3  Apartment   160         3          3  Ground        No       No   
4  Apartment   145         3          2       3        No       No   

                   Region   City    Price  
0         Zahraa Al Maadi  Cairo  1546400  
1               Nasr City  Cairo   950000  
2          Mostakbal City  Cairo  2100000  
3  New Cairo - El Tagamoa  Cairo  3994232  
4  New Cairo - El Tagamoa  Cairo   370000  


In [3]:
# Separate numeric and categorical columns
numeric_cols = data.select_dtypes(include='number').columns
categorical_cols = data.select_dtypes(exclude='number').columns

# Impute missing values for numeric columns
numeric_imputer = SimpleImputer(strategy='mean')  # Impute with mean value
data[numeric_cols] = numeric_imputer.fit_transform(data[numeric_cols])

# Impute missing values for categorical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')  # Impute with most frequent value (mode)
data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])
print(data.isnull().sum())

House_Type    0
Size          0
Bedrooms      0
Bathrooms     0
Floor         0
Furnished     0
For_rent      0
Region        0
City          0
Price         0
dtype: int64


In [4]:

# Perform data preprocessing (handle missing values, encode categorical variables)
def encode_floor(floor):
    if floor == 'Ground':
        floor = 0
    if floor == '10+':
        floor = 11
    if floor == 'Highest':
        floor = 12
    return int(floor)

data['Floor'] = data['Floor'].apply(encode_floor)

In [5]:
# Assuming these columns are numeric
numeric_cols = ['Size', 'Bedrooms', 'Bathrooms', 'Floor', 'Price']
# Separating out the numeric columns for normalization
data_numeric = data[numeric_cols]


In [6]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the numeric columns
data_numeric_scaled = scaler.fit_transform(data_numeric)

# Convert back to a DataFrame
data_numeric_scaled = pd.DataFrame(data_numeric_scaled, columns=numeric_cols)


In [7]:
# Drop the original numeric columns from the original data
data_non_numeric = data.drop(columns=numeric_cols)

# Concatenate the scaled numeric data and non-numeric data
data_preprocessed = pd.concat([data_numeric_scaled, data_non_numeric.reset_index(drop=True)], axis=1)

In [8]:

# Perform one-hot encoding for categorical variables

data_preprocessed = pd.get_dummies(data_preprocessed , columns=['House_Type', 'Furnished', 'For_rent','Region', 'City'])


In [9]:

# Split data into features and target
X = data_preprocessed.drop('Price', axis=1)  # Assume 'Price' is the target
y = data_preprocessed['Price']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [10]:

from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
# from sklearn.ensemble import  AdaBoostRegressor, ExtraTreesRegressor
# from sklearn.svm import SVR
# from sklearn.linear_model import LinearRegression

# Initialize models

rf_regressor = RandomForestRegressor(random_state=42)
gb_regressor = GradientBoostingRegressor( random_state=42)
xgb_regressor = XGBRegressor( random_state=42)
# ada_regressor = AdaBoostRegressor(random_state=42)
# et_regressor = ExtraTreesRegressor(random_state=42)
# svm_regressor = SVR(kernel='linear')

# rf_model = RandomForestRegressor(random_state=42)
# gb_model = GradientBoostingRegressor(random_state=42)
# xgb_model = XGBRegressor(random_state=42)

# Initialize VotingRegressor
voting_model = VotingRegressor([
    ('rf', rf_regressor),
    ('gb', gb_regressor),
    ('xgb', xgb_regressor),
#      ('ada', ada_regressor),
#      ('et', et_regressor),
])

# Fit the model
voting_model.fit(X_train, y_train)

# Make predictions
voting_predictions = voting_model.predict(X_test)

# Evaluate the model
voting_mse = mean_squared_error(y_test, voting_predictions)
print('Voting Regressor MSE:', voting_mse)

Voting Regressor MSE: 0.03143954782994205


In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# House_Type	Size	Bedrooms	Bathrooms	Floor	Furnished	For_rent	Region	City
# Apartment	170	3	2	9	No	No	Zahraa Al Maadi	Cairo
# Apartment	104	2	1	7	No	No	Nasr City	Cairo
# Apartment	160	3	2	1	No	No	Mostakbal City	Cairo

# Assuming 'new_data' is your new dataset
# Preprocess the new data
new_data = pd.DataFrame({
    'Size': [170],  # Example sizes in square meters
    'Bedrooms': [3],  # Example number of bedrooms
    'Bathrooms': [2],  # Example number of bathrooms
    'Floor': [9],  # Example floor types
    'House_Type': ['Apartment'],  # Example house types
    'Furnished': ['No'],  # Example furnished status
    'For_rent': ['No'],  # Example for rent status
    'Region': ['Zahraa Al Maadi'],  # Example regions
    'City': ['Cairo']  # Example cities
})
# Apartment	175	3	2	6	No	No	Miami	Alexandria
# new_data = pd.DataFrame({
#     'Size': [175],  # Example sizes in square meters
#     'Bedrooms': [3],  # Example number of bedrooms
#     'Bathrooms': [2],  # Example number of bathrooms
#     'Floor': [6],  # Example floor types
#     'House_Type': ['Apartment'],  # Example house types
#     'Furnished': ['No'],  # Example furnished status
#     'For_rent': ['No'],  # Example for rent status
#     'Region': ['Miami'],  # Example regions
#     'City': ['Alexandria']  # Example cities
#})
# Apartment	122	3	2	2	No	No	New Cairo - El Tagamoa	Cairo
# Apartment	122	2	2	3	No	Yes	Shubra	Cairo
# Apartment	135	3	2	6	Yes	No	Heliopolis	Cairo	1350000
# Apartment	174	3	2	3	Yes	Yes	New Cairo - El Tagamoa	Cairo	15000
# Apartment	170	2	2	3	No	Yes	New Cairo - El Tagamoa	Cairo	9000
# Apartment	130	3	2	5	No	Yes	New Cairo - El Tagamoa	Cairo	4000
# Apartment	162	3	3	4	No	Yes	Rehab City	Cairo	12000

# new_data = pd.DataFrame({
#     'House_Type': ['Apartment'],
#     'Size': [174],
#     'Bedrooms': [3],
#     'Bathrooms': [2],
#     'Floor': [3],  # 'Ground' replaced by 0
#     'Furnished': ['Yes'],
#     'For_rent': ['Yes'],
#     'Region': ['New Cairo-El Tagamoa'],
#     'City': ['Cairo']
# })


new_data['Floor'] = new_data['Floor'].apply(encode_floor)

# Assuming these columns are numeric
numeric_cols_new = ['Size', 'Bedrooms', 'Bathrooms', 'Floor']

# Separating out the numeric columns for normalization
new_data_numeric = new_data[numeric_cols_new]

scaler_new = MinMaxScaler()

# Scale the numeric columns
new_data_numeric_scaled = scaler_new.fit_transform(new_data_numeric)

# Convert back to a DataFrame
new_data_numeric_scaled = pd.DataFrame(new_data_numeric_scaled, columns=numeric_cols_new)



# Drop the original numeric columns from the original data
new_data_non_numeric = new_data.drop(columns=numeric_cols_new)

# Concatenate the scaled numeric data and non-numeric data
new_data_preprocessed = pd.concat([new_data_numeric_scaled, new_data_non_numeric.reset_index(drop=True)], axis=1)

# Perform one-hot encoding for categorical columns
new_data_preprocessed = pd.get_dummies(new_data_preprocessed, columns=['House_Type', 'Furnished', 'For_rent', 'Region', 'City'])

missing_cols = set(X_train.columns) - set(new_data_preprocessed.columns)
for col in missing_cols:
    new_data_preprocessed[col] = 0

# Reorder columns to match X_train
new_data_preprocessed = new_data_preprocessed[X_train.columns]

import numpy as np
# Make predictions using the trained model
new_data_predictions = voting_model.predict(new_data_preprocessed)

new_data_predictions= (new_data_predictions *(np.max(data['Price'])-np.min(data['Price'])))+np.min(data['Price'])
rounded_data_predictions = np.round(new_data_predictions).astype(int)
if rounded_data_predictions<0:
  rounded_data_predictions = abs(rounded_data_predictions)
# Print or use new_data_predictions as needed
print(rounded_data_predictions)



[883621]


In [12]:
import joblib
from joblib import dump

# Save the trained model
dump(voting_model, 'voting_model.joblib')

['voting_model.joblib']