In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pickle
import statsmodels.api as sm


In [45]:
# Load the dataset
df = pd.read_csv('C:/Users/muniza.hashim/Desktop/senior/FYP/FYP progress/House Prices/house_kg_10K_ads.csv')

# Feature Engineering
df['date_year'] = pd.to_datetime(df['date']).dt.year
df.loc[df['rooms'] == "свободная планировка", 'rooms'] = 0
df.loc[df['rooms'] == "6 и более", 'rooms'] = 6
df["rooms"] = df["rooms"].astype(int)
df['is_top_floor'] = (df['floor'] == df['floors']) & (df['floor'] != 1)
df['is_bottom_floor'] = (df['floor'] == 1) & (df['floor'] != df['floors'])
df['max_price_micro_district'] = df.groupby('micro_district')['price'].transform('max')
df['max_price_micro_district'].fillna(0, inplace=True)

# Encoding categorical data
labelencoder = LabelEncoder()
df['district_encoded'] = labelencoder.fit_transform(df['district'])
df['micro_district_encoded'] = labelencoder.fit_transform(df['micro_district'])
df['building_type_encoded'] = labelencoder.fit_transform(df['building_type'])
df['source_encoded'] = labelencoder.fit_transform(df['source'])
df['condition_encoded'] = labelencoder.fit_transform(df['condition'])

In [46]:

# Define independent and dependent variables
independent_variables = ["square", "rooms", "floors", "floor", "date_year", 'max_price_micro_district',
                         'district_encoded', 'micro_district_encoded', 'building_type_encoded', 
                         'source_encoded', 'condition_encoded']
dependent_variable = "price"

# Standardize the selected columns
df[independent_variables] = (df[independent_variables] - df[independent_variables].mean()) / df[independent_variables].std()
df[dependent_variable] = (df[dependent_variable] - df[dependent_variable].mean()) / df[dependent_variable].std()

# Set up X and y based on dependent and independent variables
X = df[independent_variables]
X = sm.add_constant(X)
y = df[dependent_variable]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

# Instantiate and fit a Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)


In [47]:
# Make predictions on the test set
y_pred = regressor.predict(X_test)



In [48]:
# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Linear Regression R-squared:", r2)
print("Linear Regression Mean Squared Error (MSE):", mse)


Linear Regression R-squared: 0.7455314201761754
Linear Regression Mean Squared Error (MSE): 0.2760602122400217


In [49]:

# Random Forest Regressor
regressor_rf = RandomForestRegressor(n_estimators=100, random_state=42)
regressor_rf.fit(X_train, y_train)
y_pred_rf = regressor_rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("Random Forest R-squared:", r2_rf)
print("Random Forest Mean Squared Error (MSE):", mse_rf)

Random Forest R-squared: 0.8676713152794032
Random Forest Mean Squared Error (MSE): 0.1435567597960504


In [50]:

# Decision Tree Regressor
regressor_dt = DecisionTreeRegressor(random_state=42)
regressor_dt.fit(X_train, y_train)
y_pred_dt = regressor_dt.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
print("Decision Tree R-squared:", r2_dt)
print("Decision Tree Mean Squared Error (MSE):", mse_dt)



Decision Tree R-squared: 0.7771678811924448
Decision Tree Mean Squared Error (MSE): 0.24173940081127482


In [51]:

# Save the trained Decision Tree Regressor model using pickle
with open('DecisionTreeRegressor.pkl', 'wb') as file:
    pickle.dump(regressor_dt, file)