In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import mlflow
import mlflow.sklearn
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
# MLflow setup
# Respect an APP_URI that is already exported in the environment (for example a local
# or staging tracking server) and only fall back to the shared server when it is unset.
# Previously the value was overwritten unconditionally, so the env var could never
# be used to point the notebook somewhere else.
os.environ.setdefault("APP_URI", "https://amaulf-mlflow-server-smartinvest.hf.space")
EXPERIMENT_NAME = "Smart-Invest-Final-Models-TestKmeans"

mlflow.set_tracking_uri(os.environ["APP_URI"])
# Selects the experiment for all subsequent runs in this notebook.
mlflow.set_experiment(EXPERIMENT_NAME)

# Look the experiment up again to report the ID assigned by the tracking server.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
print(f"Experiment ID: {experiment.experiment_id}")
print(f"Experiment name: {experiment.name}")

2025/07/30 17:32:07 INFO mlflow.tracking.fluent: Experiment with name 'Smart-Invest-Final-Models-TestKmeans' does not exist. Creating a new experiment.


Experiment ID: 8
Experiment name: Smart-Invest-Final-Models-TestKmeans


# Feature (X) and target (y) preparation

In [16]:
# Load data
# The CSV location can be overridden through the FINAL_DATA_CSV environment variable so
# the notebook can run on machines other than the one it was written on. When the
# variable is not set, the original path is used unchanged.
DATA_PATH = os.environ.get(
    "FINAL_DATA_CSV",
    '/Users/amaurylefranc/Desktop/Jehda/vscode/finalprojectml/final_data.csv',
)
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
print("Target variable: prix_m2")

Dataset shape: (137917, 24)
Target variable: prix_m2


In [21]:
# Basic data checks
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line after the report.
df.info()
# Total number of missing cells in the whole frame (per-column counts summed up).
print(f"\nMissing values:\n{df.isnull().sum().sum()}")
print("\nTarget variable stats:")
print(df['prix_m2'].describe())


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137917 entries, 0 to 137916
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   surface_reelle_bati        137917 non-null  float64
 1   prix_m2                    137917 non-null  float64
 2   nombre_pieces_principales  137917 non-null  float64
 3   longitude                  137917 non-null  float64
 4   latitude                   137917 non-null  float64
 5   nombre_lots                137917 non-null  int64  
 6   annee                      137917 non-null  int64  
 7   distance_datashop_km       137917 non-null  float64
 8   distance_espace_vert_km    137917 non-null  float64
 9   distance_college_km        137917 non-null  float64
 10  distance_universite_km     137917 non-null  float64
 11  distance_ecole_km          137917 non-null  float64
 12  distance_metro_km          137917 non-null  float64
 13  distance_TER_k

In [None]:
# Remove non-predictive columns
columns_to_drop = ['cle_interop_adr_proche','arrondissement','id','Unnamed: 0']
# errors='ignore' makes this cell idempotent: because it rebinds `df`, a second
# execution (or running it after the columns are already gone) used to raise KeyError.
# Columns that are still present are dropped exactly as before.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [32]:
# Split the frame into the target vector (y) and the feature matrix (X)
target = 'prix_m2'
y = df[target]
X = df.drop(columns=[target])  # every remaining column is used as a feature

print(f"\nFinal feature shape: {X.shape}")
print(f"Features: {list(X.columns)}")


Final feature shape: (137917, 19)
Features: ['surface_reelle_bati', 'nombre_pieces_principales', 'longitude', 'latitude', 'nombre_lots', 'annee', 'distance_datashop_km', 'distance_espace_vert_km', 'distance_college_km', 'distance_universite_km', 'distance_ecole_km', 'distance_metro_km', 'distance_TER_km', 'distance_POI_min_km', 'proche_POI_1km', 'nb_POIs_inf_1km', 'distance_batiment_m', 'annee_construction_dpe', 'zone']


In [33]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Train set: (110333, 19)
Test set: (27584, 19)


In [34]:
# Column groups used to route features to the right preprocessing step.

# Columns handled as continuous / count values.
numeric_features = [
    'surface_reelle_bati',
    'nombre_pieces_principales',
    'longitude',
    'latitude',
    'nombre_lots',
    'distance_datashop_km',
    'distance_espace_vert_km',
    'distance_college_km',
    'distance_universite_km',
    'distance_ecole_km',
    'distance_metro_km',
    'distance_TER_km',
    'distance_POI_min_km',
    'nb_POIs_inf_1km',
    'distance_batiment_m',
    'annee_construction_dpe',
]

# Columns handled as discrete levels; note that 'annee' is listed here rather than
# among the numeric columns.
categorical_features = [
    'proche_POI_1km',
    'annee',
    'zone',
]

In [35]:
# Report the size and content of each column group (one line per group)
print(
    f"\nNumeric features ({len(numeric_features)}): {numeric_features}",
    f"Categorical features ({len(categorical_features)}): {categorical_features}",
    sep="\n",
)


Numeric features (16): ['surface_reelle_bati', 'nombre_pieces_principales', 'longitude', 'latitude', 'nombre_lots', 'distance_datashop_km', 'distance_espace_vert_km', 'distance_college_km', 'distance_universite_km', 'distance_ecole_km', 'distance_metro_km', 'distance_TER_km', 'distance_POI_min_km', 'nb_POIs_inf_1km', 'distance_batiment_m', 'annee_construction_dpe']
Categorical features (3): ['proche_POI_1km', 'annee', 'zone']


# Baseline models

In [36]:
# Numeric columns: fill missing values with the column mean, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode, dropping the first level of each feature
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each column group through its own transformer
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Fit the preprocessing on the training rows only...
print("Performing preprocessing on train set...")
X_train_processed = preprocessor.fit_transform(X_train)
print(f"Train processed shape: {X_train_processed.shape}")

# ...and apply the already-fitted transformers to the test rows
print("Performing preprocessing on test set...")
X_test_processed = preprocessor.transform(X_test)
print(f"Test processed shape: {X_test_processed.shape}")

Performing preprocessing on train set...
Train processed shape: (110333, 27)
Performing preprocessing on test set...
Test processed shape: (27584, 27)


In [37]:
# Model 1: Linear Regression (Baseline)
# Enable scikit-learn autologging before the first run is opened.
mlflow.sklearn.autolog()

with mlflow.start_run(run_name="Linear_Regression_Baseline"):
    # Plain least-squares fit on the preprocessed training matrix
    lr = LinearRegression()
    lr.fit(X_train_processed, y_train)

    # Predictions on both splits, to compare train and test error
    y_train_pred = lr.predict(X_train_processed)
    y_test_pred = lr.predict(X_test_processed)

    # Each metric is computed for train first, then test
    train_r2, test_r2 = (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    )
    train_mae, test_mae = (
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred),
    )
    train_rmse, test_rmse = (
        np.sqrt(mean_squared_error(y_train, y_train_pred)),
        np.sqrt(mean_squared_error(y_test, y_test_pred)),
    )

    # Record the metrics on the active run, one call per metric, in a fixed order
    run_metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Console summary
    print(
        "Linear Regression Results:",
        f"Train R2: {train_r2:.3f}",
        f"Test R2: {test_r2:.3f}",
        f"Train MAE: {train_mae:,.0f}€/m²",
        f"Test MAE: {test_mae:,.0f}€/m²",
        f"Train RMSE: {train_rmse:,.0f}€/m²",
        f"Test RMSE: {test_rmse:,.0f}€/m²",
        sep="\n",
    )



Linear Regression Results:
Train R2: 0.201
Test R2: 0.195
Train MAE: 1,805€/m²
Test MAE: 1,817€/m²
Train RMSE: 2,439€/m²
Test RMSE: 2,458€/m²


2025/07/30 17:41:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run Linear_Regression_Baseline at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8/runs/0bde56e49a80472db802d483c1a8e111.
2025/07/30 17:41:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8.


In [38]:
# Model 2: Random Forest Basic
with mlflow.start_run(run_name="RandomForest_Basic"):
    # 100 depth-limited trees; the seed makes the forest reproducible
    rf_settings = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
    rf_basic = RandomForestRegressor(**rf_settings)
    rf_basic.fit(X_train_processed, y_train)

    # Predictions on both splits, to compare train and test error
    y_train_pred = rf_basic.predict(X_train_processed)
    y_test_pred = rf_basic.predict(X_test_processed)

    # Each metric is computed for train first, then test
    train_r2, test_r2 = (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    )
    train_mae, test_mae = (
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred),
    )
    train_rmse, test_rmse = (
        np.sqrt(mean_squared_error(y_train, y_train_pred)),
        np.sqrt(mean_squared_error(y_test, y_test_pred)),
    )

    # Record the metrics on the active run, one call per metric, in a fixed order
    run_metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Console summary
    print(
        "\nRandom Forest Basic Results:",
        f"Train R2: {train_r2:.3f}",
        f"Test R2: {test_r2:.3f}",
        f"Train MAE: {train_mae:,.0f}€/m²",
        f"Test MAE: {test_mae:,.0f}€/m²",
        f"Train RMSE: {train_rmse:,.0f}€/m²",
        f"Test RMSE: {test_rmse:,.0f}€/m²",
        sep="\n",
    )




Random Forest Basic Results:
Train R2: 0.367
Test R2: 0.313
Train MAE: 1,576€/m²
Test MAE: 1,638€/m²
Train RMSE: 2,171€/m²
Test RMSE: 2,272€/m²


2025/07/30 17:43:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest_Basic at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8/runs/54b44f7c9b3947139aee2442ec7f35bc.
2025/07/30 17:43:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8.


# XGBoost

In [39]:
# Model 3: XGBoost
with mlflow.start_run(run_name="XGBoost_Basic"):
    # Hyperparameters live in one dict so the values used to build the model are
    # exactly the values recorded on the run.
    xgb_params = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'random_state': 42,
    }
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_train_processed, y_train)

    # Log the hyperparameters explicitly, as the GridSearch run does for its best
    # parameters. The only autologging enabled in this notebook is
    # mlflow.sklearn.autolog(), which is documented for scikit-learn estimators, so
    # without this call the run may carry metrics but no parameters.
    # NOTE(review): confirm against the MLflow version in use.
    mlflow.log_params(xgb_params)

    # Predictions on both splits, to compare train and test error
    y_train_pred = xgb_model.predict(X_train_processed)
    y_test_pred = xgb_model.predict(X_test_processed)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # Same metric names as the other runs so they can be compared side by side
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("train_mae", train_mae)
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)

    print("XGBoost Basic Results:")
    print(f"Train R2: {train_r2:.3f}")
    print(f"Test R2: {test_r2:.3f}")
    print(f"Train MAE: {train_mae:,.0f}€/m²")
    print(f"Test MAE: {test_mae:,.0f}€/m²")
    print(f"Train RMSE: {train_rmse:,.0f}€/m²")
    print(f"Test RMSE: {test_rmse:,.0f}€/m²")

XGBoost Basic Results:
Train R2: 0.362
Test R2: 0.325
Train MAE: 1,574€/m²
Test MAE: 1,620€/m²
Train RMSE: 2,180€/m²
Test RMSE: 2,252€/m²


2025/07/30 17:43:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost_Basic at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8/runs/89d0ccff6d7f45f19ec21f0af2799012.
2025/07/30 17:43:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8.


In [40]:
# Model 4: Ridge Regression
with mlflow.start_run(run_name="Ridge_Regression"):
    # L2-regularised linear model with regularisation strength alpha=1.0
    ridge = Ridge(alpha=1.0, random_state=42)
    ridge.fit(X_train_processed, y_train)

    # Predictions on both splits, to compare train and test error
    y_train_pred = ridge.predict(X_train_processed)
    y_test_pred = ridge.predict(X_test_processed)

    # Each metric is computed for train first, then test
    train_r2, test_r2 = (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    )
    train_mae, test_mae = (
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred),
    )
    train_rmse, test_rmse = (
        np.sqrt(mean_squared_error(y_train, y_train_pred)),
        np.sqrt(mean_squared_error(y_test, y_test_pred)),
    )

    # Record the metrics on the active run, one call per metric, in a fixed order
    run_metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Console summary
    print(
        "Ridge Regression Results:",
        f"Train R2: {train_r2:.3f}",
        f"Test R2: {test_r2:.3f}",
        f"Train MAE: {train_mae:,.0f}€/m²",
        f"Test MAE: {test_mae:,.0f}€/m²",
        f"Train RMSE: {train_rmse:,.0f}€/m²",
        f"Test RMSE: {test_rmse:,.0f}€/m²",
        sep="\n",
    )



Ridge Regression Results:
Train R2: 0.201
Test R2: 0.195
Train MAE: 1,805€/m²
Test MAE: 1,817€/m²
Train RMSE: 2,439€/m²
Test RMSE: 2,458€/m²


2025/07/30 17:44:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run Ridge_Regression at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8/runs/1185e00326004e5a999a71f303be677f.
2025/07/30 17:44:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8.


In [41]:
# Step 5: Hyperparameter Tuning on XGBoost
print("Starting GridSearch on XGBoost...")

with mlflow.start_run(run_name="XGBoost_GridSearch"):
    # Base estimator; the searched hyperparameters are filled in by the grid search
    xgb_model = xgb.XGBRegressor(random_state=42)

    # 3 values for each of 4 hyperparameters -> 81 candidate combinations
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [4, 6, 8],
        'learning_rate': [0.05, 0.1, 0.15],
        'min_child_weight': [1, 3, 5],
    }

    # Exhaustive search, 5-fold cross-validation, candidates ranked by R2
    search_options = {'cv': 5, 'scoring': 'r2', 'n_jobs': -1, 'verbose': 1}
    grid_search = GridSearchCV(xgb_model, param_grid, **search_options)
    grid_search.fit(X_train_processed, y_train)

    # Predictions of the best estimator found by the search
    best_model = grid_search.best_estimator_
    y_train_pred = best_model.predict(X_train_processed)
    y_test_pred = best_model.predict(X_test_processed)

    # Each metric is computed for train first, then test
    train_r2, test_r2 = (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    )
    train_mae, test_mae = (
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred),
    )
    train_rmse, test_rmse = (
        np.sqrt(mean_squared_error(y_train, y_train_pred)),
        np.sqrt(mean_squared_error(y_test, y_test_pred)),
    )

    # Log the winning hyperparameters, then the metrics (cross-validation score last)
    mlflow.log_params(grid_search.best_params_)
    run_metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "cv_best_score": grid_search.best_score_,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Console summary
    print(
        "XGBoost GridSearch Results:",
        f"Best parameters: {grid_search.best_params_}",
        f"Best CV score: {grid_search.best_score_:.3f}",
        f"Train R2: {train_r2:.3f}",
        f"Test R2: {test_r2:.3f}",
        f"Train MAE: {train_mae:,.0f}€/m²",
        f"Test MAE: {test_mae:,.0f}€/m²",
        f"Train RMSE: {train_rmse:,.0f}€/m²",
        f"Test RMSE: {test_rmse:,.0f}€/m²",
        sep="\n",
    )

Starting GridSearch on XGBoost...




Fitting 5 folds for each of 81 candidates, totalling 405 fits




XGBoost GridSearch Results:
Best parameters: {'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 300}
Best CV score: 0.335
Train R2: 0.472
Test R2: 0.341
Train MAE: 1,431€/m²
Test MAE: 1,597€/m²
Train RMSE: 1,982€/m²
Test RMSE: 2,225€/m²


2025/07/30 17:49:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost_GridSearch at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8/runs/d52f919209a340bdb4489b3397118b32.
2025/07/30 17:49:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://amaulf-mlflow-server-smartinvest.hf.space/#/experiments/8.


# Final model evaluation

In [42]:
# Step 6: Final Model Evaluation
print("Final Model Analysis...")

# Sanity check: the model must expose one importance per preprocessed column
print(f"Number of features after preprocessing: {X_train_processed.shape[1]}")
print(f"Number of feature importances: {len(best_model.feature_importances_)}")

# Rebuild readable column names for the preprocessed matrix.
# The fitted one-hot encoder lists the levels of each categorical feature; the first
# level of each is skipped because the encoder was built with drop='first'.
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
cat_feature_names = [
    f"{cat_feature}_{cat}"
    for cat_feature, levels in zip(categorical_features, fitted_encoder.categories_)
    for cat in levels[1:]
]

# Numeric columns come first in the ColumnTransformer, followed by the encoded ones
all_feature_names = numeric_features + cat_feature_names

print(f"Total feature names created: {len(all_feature_names)}")

# Pair every name with its importance and rank from most to least important
importance_table = pd.DataFrame({
    'feature': all_feature_names,
    'importance': best_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

Final Model Analysis...
Number of features after preprocessing: 27
Number of feature importances: 27
Total feature names created: 27
Top 10 Most Important Features:
            feature  importance
11  distance_TER_km    0.154016
3          latitude    0.131318
26           zone_6    0.083003
20       annee_2024    0.080271
2         longitude    0.069394
23           zone_3    0.069197
21           zone_1    0.054105
22           zone_2    0.036481
19       annee_2023    0.035283
4       nombre_lots    0.024814


In [43]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

In [45]:
# Importance table for plotting, ranked from most to least important
importance_records = {
    'feature': all_feature_names,  # feature names built in the previous step
    'importance': best_model.feature_importances_,  # from the XGBoost GridSearch model
}
feature_importance_df = pd.DataFrame(importance_records).sort_values(
    'importance', ascending=False
)

# 1. TOP 15 FEATURES - Horizontal Bar Chart
top_15 = feature_importance_df.head(15)
fig1 = px.bar(
    top_15,
    x='importance',
    y='feature',
    orientation='h',
    title='Top 15 Features - Importance dans le Modèle K Mean',
    labels={'importance': 'Importance', 'feature': 'Features'},
    color='importance',
    color_continuous_scale='Viridis',
)

# 'total ascending' orders the bars by value along the category axis
bar_layout = dict(
    height=600,
    yaxis={'categoryorder': 'total ascending'},
    font=dict(size=12),
    title_font_size=16,
)
fig1.update_layout(**bar_layout)

fig1.show()

In [57]:
# 2. TOP 10 FEATURES - Pie chart of their relative shares
top_10 = feature_importance_df.head(10).copy()
# Share of each feature within the top 10 only (the ten shares sum to 100)
top_10_total = top_10['importance'].sum()
top_10['importance_pct'] = top_10['importance'] / top_10_total * 100

pie_options = dict(
    values='importance_pct',
    names='feature',
    title='Répartition des Top 10 Features les plus Importantes',
)
fig2 = px.pie(top_10, **pie_options)

# Show percentage and label inside each slice
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.update_layout(font=dict(size=11))
fig2.show()

In [59]:
# Business-oriented accuracy of the tuned model on the held-out test set.

# Recompute the predictions from best_model instead of relying on whatever is left in
# y_test_pred: that name is reassigned by every model cell above, so the figures below
# would otherwise describe whichever model cell happened to run last.
y_test_pred = best_model.predict(X_test_processed)

# STEP 1: absolute error of each prediction, in €/m² and as a percentage of the true price
abs_errors = np.abs(y_test - y_test_pred)
errors_pct = abs_errors / y_test * 100

# STEP 2: share of predictions (in %) whose relative error falls within each threshold
within_10pct = np.mean(errors_pct <= 10) * 100
within_15pct = np.mean(errors_pct <= 15) * 100
within_20pct = np.mean(errors_pct <= 20) * 100

print("Business Performance Metrics:")
print(f"Predictions within ±10%: {within_10pct:.1f}%")
print(f"Predictions within ±15%: {within_15pct:.1f}%")
print(f"Predictions within ±20%: {within_20pct:.1f}%")

# BONUS: distribution of the errors
print("\nError distribution:")
print(f"Median Absolute Error: {np.median(abs_errors):,.0f}€/m²")
print(f"90th percentile error: {np.percentile(errors_pct, 90):.1f}%")

Business Performance Metrics:
Predictions within ±10%: 47.0%
Predictions within ±15%: 63.8%
Predictions within ±20%: 75.4%

Error distribution:
Median Absolute Error: 1,138€/m²
90th percentile error: 34.2%
