In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/final_data.csv')

# Feature Engineering
df['Area_x_Bathrooms'] = df['Area'] * df['#Bathrooms']
df['Log_Area'] = np.log1p(df['Area'])
df['Log_Price'] = np.log1p(df['Price'])
if '#Bedrooms' in df.columns and (df['#Bedrooms'] > 0).all():
    df['Bathrooms_per_Bedroom'] = df['#Bathrooms'] / df['#Bedrooms']
else:
    df['Bathrooms_per_Bedroom'] = np.nan
df = df.drop(columns=['Area', '#Bathrooms', 'Price'], errors='ignore')

# Clean and convert categorical columns for PyCaret
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].str.replace(' ', '_')  # Replace spaces with underscores in values
    df[col] = df[col].astype('category')

In [3]:
from pycaret.regression import setup, compare_models

reg_setup = setup(
    data=df,
    target='Log_Price',
    session_id=123,
    feature_selection=True,
    pca=True,                # Enable PCA
    pca_components=0.95,     # Keep enough components to explain 95% variance
    normalize=True,          # Normalize the data
    verbose=False
)

best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ada,AdaBoost Regressor,0.488,0.3996,0.6224,0.2855,0.0438,0.0373,0.11
lightgbm,Light Gradient Boosting Machine,0.4925,0.3969,0.6219,0.2849,0.0438,0.0375,0.19
gbr,Gradient Boosting Regressor,0.5027,0.4158,0.638,0.2478,0.0449,0.0383,0.09
knn,K Neighbors Regressor,0.5127,0.4294,0.6488,0.2218,0.0456,0.039,0.096
ridge,Ridge Regression,0.5602,0.5146,0.7116,0.0596,0.0499,0.0429,0.094
lr,Linear Regression,0.5602,0.5147,0.7116,0.0595,0.0499,0.0429,0.657
lar,Least Angle Regression,0.5602,0.5147,0.7116,0.0595,0.0499,0.0429,0.084
omp,Orthogonal Matching Pursuit,0.5602,0.5147,0.7116,0.0595,0.0499,0.0429,0.081
br,Bayesian Ridge,0.561,0.5156,0.7123,0.0579,0.0499,0.0429,0.066
en,Elastic Net,0.5934,0.5379,0.7301,0.0181,0.0516,0.0454,0.071


In [4]:
from pycaret.regression import predict_model, tune_model, evaluate_model

# Fine-tune the best model
tuned_model = tune_model(best_model, optimize='R2')  
evaluate_model(tuned_model)

# Predict using PyCaret's pipeline
predictions = predict_model(best_model, data=df)
price_pred = np.expm1(predictions['prediction_label'])
result = df.reset_index(drop=True).copy()
result['Predicted Price'] = price_pred

# Display the first 10 rows with related info and formatted price
pd.options.display.float_format = '{:,.0f}'.format
print(result.head(10)[['Predicted Price'] + [col for col in ['Region', 'Type', '#Bedrooms', 'Area_x_Bathrooms', 'Log_Area','Bathrooms_per_Bedroom']]])

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4156,0.2709,0.5205,0.3549,0.0365,0.0312
1,0.6063,0.5024,0.7088,0.1944,0.0507,0.0472
2,0.5717,0.5644,0.7513,0.1417,0.0522,0.043
3,0.3988,0.2666,0.5163,0.4302,0.0368,0.0307
4,0.4949,0.4009,0.6332,0.3387,0.0446,0.0378
5,0.6218,0.5773,0.7598,0.1795,0.0534,0.0476
6,0.4668,0.2963,0.5444,0.2562,0.0387,0.036
7,0.494,0.3683,0.6069,0.2807,0.0432,0.0382
8,0.3419,0.2141,0.4627,0.5806,0.0324,0.0258
9,0.4174,0.3099,0.5567,0.477,0.0387,0.0312


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,AdaBoost Regressor,0.4829,0.3788,0.6154,0.3031,0.0434,0.0368


   Predicted Price  Region         Type  #Bedrooms  Area_x_Bathrooms  \
0        1,405,808  Ariana        Villa          4               900   
1        2,150,010   Tunis        Villa          5             6,600   
2          730,500  Nabeul        Villa          4               600   
3          858,039   Tunis        Villa          5             1,000   
4          246,875   Tunis  Appartement          1               400   
5          336,198   Tunis  Appartement          3               400   
6          714,272   Tunis        Villa          5             3,099   
7          645,825   Tunis        Villa          4             1,236   
8          863,230  Sousse        Villa          5             2,000   
9          684,974  Sousse        Villa          4             4,444   

   Log_Area  Bathrooms_per_Bedroom  
0         6                      0  
1         7                      1  
2         6                      0  
3         6                      0  
4         6           