In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
# Location of the pre-encoded dataset (one-hot encoded categorical columns).
ENCODED_DATA_PATH = 'data/final_encoded_data.csv'

df_encoded = pd.read_csv(ENCODED_DATA_PATH)

# List the available columns so the feature/target names are visible below.
print(df_encoded.columns)

Index(['Superficie', 'Salles de bains', 'Chambres', 'Prix',
       'Type de bien_Appartement', 'Type de bien_Maison', 'Type de bien_Villa',
       'Localisation_Ariana', 'Localisation_Ben_Arous', 'Localisation_Bizerte',
       'Localisation_Bja', 'Localisation_Gabs', 'Localisation_Gafsa',
       'Localisation_Jendouba', 'Localisation_Kairouan',
       'Localisation_Kasserine', 'Localisation_La_Manouba',
       'Localisation_Le_Kef', 'Localisation_Mahdia', 'Localisation_Mdenine',
       'Localisation_Monastir', 'Localisation_Nabeul', 'Localisation_Sfax',
       'Localisation_Sidi_Bouzid', 'Localisation_Siliana',
       'Localisation_Sousse', 'Localisation_Tozeur', 'Localisation_Tunis',
       'Localisation_Zaghouan'],
      dtype='object')


Testing a RandomForestRegressor

In [10]:
# Every column except the target is used as a feature; 'Prix' is the target.
X = df_encoded.drop(columns=['Prix'])
y = df_encoded['Prix']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest (seeded) on the training split only.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluate on the held-out split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report every metric that was computed (R-squared was previously calculated
# but never shown).
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Absolute Error (MAE): 145321.93
Mean Squared Error (MSE): 38403206857.67


In [11]:
# Describe the property by the features that are "on"; every other feature
# (all remaining one-hot columns) defaults to 0.
feature_overrides = {
    'Superficie': 150,  # 150 square meters
    'Salles de bains': 2,  # 2 bathrooms
    'Chambres': 3,  # 3 bedrooms
    'Type de bien_Appartement': 1,  # Appartement
    'Localisation_Tunis': 1,  # in Tunis
}

# Fail loudly on a misspelled column instead of silently predicting for an
# all-zero category.
unknown_columns = set(feature_overrides) - set(X_train.columns)
if unknown_columns:
    raise KeyError(f"Unknown feature columns: {sorted(unknown_columns)}")

# Build the single-row input from the training columns so that the names and
# their order always match what the model was fitted on.
new_data = {column: [feature_overrides.get(column, 0)] for column in X_train.columns}

predicted = rf_model.predict(pd.DataFrame(new_data))
print(f"Predicted price: {predicted[0]:.2f} TND")

Predicted price: 431753.60 TND


Testing a multivariate linear regression model

In [5]:
# Working with the same features/target as above.
# The design matrix gets its own name so that `X` keeps holding the raw
# features; rebinding `X` here would leak the added 'const' column into any
# cell that reuses `X` afterwards.
# NOTE(review): every one-hot level of 'Type de bien' and 'Localisation' is
# kept alongside the intercept, so the design matrix is presumably rank
# deficient (dummy-variable trap) and the individual dummy coefficients may
# not be uniquely identified - confirm, e.g. by dropping one level per category.
X_ols = sm.add_constant(X)
model = sm.OLS(y, X_ols).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Prix   R-squared:                       0.539
Model:                            OLS   Adj. R-squared:                  0.532
Method:                 Least Squares   F-statistic:                     73.65
Date:                Wed, 25 Dec 2024   Prob (F-statistic):          6.35e-253
Time:                        13:10:10   Log-Likelihood:                -22514.
No. Observations:                1663   AIC:                         4.508e+04
Df Residuals:                    1636   BIC:                         4.523e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [None]:
# Coefficients copied by hand from the OLS regression summary. They are a
# snapshot: if the regression is refitted, these values must be updated too.
OLS_CONST = 1.276e+05
OLS_COEF_SUPERFICIE = 277.0918
OLS_COEF_SALLES_DE_BAINS = 4.342e+04
OLS_COEF_CHAMBRES = 1.465e+04

# Additive effect of each 'Type de bien' level.
OLS_TYPE_COEFFICIENTS = {
    'Appartement': 1.625e+04,
    'Maison': -6.122e+04,
    'Villa': 1.725e+05,
}

# Additive effect of each 'Localisation' level.
OLS_LOCALISATION_COEFFICIENTS = {
    'Ariana': 9.234e+04,
    'Ben_Arous': 3.145e+04,
    'Bizerte': -8188.7637,
    'Bja': -1.22e+05,
    'Gabs': -1.324e+05,
    'Gafsa': -1.378e+05,
    'Jendouba': -1.325e+04,
    'Kairouan': -1662.6989,
    'Kasserine': 6.396e+04,
    'La_Manouba': -1.701e+04,
    'Le_Kef': -1.4e+05,
    'Mahdia': 2.401e+04,
    'Mdenine': -3.877e+04,
    'Monastir': -1.939e+05,
    'Nabeul': 1.217e+05,
    'Sfax': -2.105e+04,
    'Sidi_Bouzid': -4.462e+04,
    'Siliana': 1.335e+05,
    'Sousse': 1.323e+05,
    'Tozeur': 3.665e+05,
    'Tunis': 9.422e+04,
    'Zaghouan': -6.176e+04,
}


def predict_prix(superficie, salles_de_bains, chambres, type_de_bien, localisation, strict=False):
    """Predict a price from the hard-coded OLS coefficients.

    The prediction is the intercept plus the linear contribution of the three
    numeric features, plus the additive effect of the property type and of the
    location when they are recognised.

    Args:
        superficie: Surface area of the property (multiplied by the
            'Superficie' coefficient).
        salles_de_bains: Number of bathrooms.
        chambres: Number of bedrooms.
        type_de_bien: One of 'Appartement', 'Maison' or 'Villa'.
        localisation: A key of ``OLS_LOCALISATION_COEFFICIENTS``
            (e.g. 'Tunis', 'Ben_Arous').
        strict: When False (default, the historical behaviour) an unrecognised
            ``type_de_bien`` or ``localisation`` contributes nothing to the
            price. When True such a value raises ``ValueError`` so that typos
            or wrong casing cannot silently produce a misleading price.

    Returns:
        The predicted price, in the units of the target the OLS model was
        fitted on.

    Raises:
        ValueError: If ``strict`` is True and ``type_de_bien`` or
            ``localisation`` is not a known category.
    """
    # Base price from the intercept and the numeric features.
    price = (OLS_CONST +
             OLS_COEF_SUPERFICIE * superficie +
             OLS_COEF_SALLES_DE_BAINS * salles_de_bains +
             OLS_COEF_CHAMBRES * chambres)

    # Add the categorical effects in the same order as before (type first,
    # then location) so the floating-point result is unchanged.
    categorical_inputs = (
        ('type_de_bien', type_de_bien, OLS_TYPE_COEFFICIENTS),
        ('localisation', localisation, OLS_LOCALISATION_COEFFICIENTS),
    )
    for argument_name, value, coefficients in categorical_inputs:
        if value in coefficients:
            price += coefficients[value]
        elif strict:
            raise ValueError(
                f"Unknown {argument_name} {value!r}; expected one of {sorted(coefficients)}"
            )

    return price

# Example usage: price estimate for a 3-bedroom, 2-bathroom apartment in Tunis
# with a surface of 150.
example_property = dict(
    superficie=150,
    salles_de_bains=2,
    chambres=3,
    type_de_bien='Appartement',
    localisation='Tunis',
)
predicted_price = predict_prix(**example_property)
print(f"Predicted price: {predicted_price}")


Predicted price: 410423.77


In [14]:
# Polynomial regression: expand the features, standardise them, then fit an
# ordinary least-squares model. The degree is the knob to tune (2 or higher
# adds complexity).
# NOTE(review): the recorded run shows a hugely negative test R-squared next to
# a reasonable training score. A degree-4 expansion presumably produces far
# more columns than there are training rows (assuming ~28 input features),
# so an unregularised fit is ill-conditioned - consider a lower degree or a
# regularised model before trusting these numbers.
poly = PolynomialFeatures(degree=4, include_bias=False)
scaler = StandardScaler()

# Both transformers are fitted on the training split only and then applied
# unchanged to the test split.
X_train_poly = poly.fit_transform(X_train)
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly = poly.transform(X_test)
X_test_poly_scaled = scaler.transform(X_test_poly)

model = LinearRegression().fit(X_train_poly_scaled, y_train)

y_train_pred = model.predict(X_train_poly_scaled)
y_test_pred = model.predict(X_test_poly_scaled)

# Compute the metrics first, then report them in a fixed order.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Training RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("R-squared (Training):", train_r2)
print("R-squared (Test):", test_r2)

Training RMSE: 144733.05206039408
Test RMSE: 8.908083359419872e+19
R-squared (Training): 0.707507424913705
R-squared (Test): -1.0122792943404074e+29
