In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the prepared dataset from the working directory.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, so each column's dtype is inferred from the whole column.
csv_path = "model_paris_20_24_final.csv"
df = pd.read_csv(csv_path, low_memory=False)

In [3]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
df.head()

Unnamed: 0,valeur_fonciere,surface_reelle_bati,nombre_pieces_principales,longitude,latitude,nombre_lots,nb_lots_surface,a_plusieurs_lots,prix_m2,annee,...,distance_espace_vert_km,distance_college_km,distance_universite_km,distance_ecole_km,distance_metro_km,distance_TER_km,distance_POI_min_km,proche_POI_1km,nb_POIs_<1km,POI_dominant
0,908200.0,78.0,3.0,2.251124,48.843025,2,1,0,11643.589744,2020,...,2.081811,5.416043,46.067374,5.735281,2.5199,6.776384,1.123883,0,0,distance_datashop_km
1,600000.0,70.0,3.0,2.251902,48.83842,4,1,0,8571.428571,2020,...,2.938028,6.397158,43.448577,8.650574,2.509081,7.242412,0.152864,1,1,distance_datashop_km
2,775125.0,77.0,4.0,2.252431,48.837737,1,0,0,10066.558442,2020,...,2.001488,6.736855,43.623648,9.128142,2.535903,7.148256,0.76922,1,1,distance_datashop_km
3,550000.0,67.0,3.0,2.252871,48.837354,2,1,0,8208.955224,2020,...,1.431667,6.925287,43.909307,9.380456,2.567419,7.088346,0.410568,1,1,distance_datashop_km
4,690500.0,74.0,3.0,2.251366,48.839257,2,1,0,9331.081081,2020,...,4.126177,6.143087,43.407502,8.159701,2.492159,7.164109,0.269765,1,1,distance_datashop_km


In [4]:
# Show the dtype pandas inferred for every column.
df.dtypes

valeur_fonciere              float64
surface_reelle_bati          float64
nombre_pieces_principales    float64
longitude                    float64
latitude                     float64
nombre_lots                    int64
nb_lots_surface                int64
a_plusieurs_lots               int64
prix_m2                      float64
annee                          int64
arrondissement                 int64
distance_datashop_km         float64
distance_espace_vert_km      float64
distance_college_km          float64
distance_universite_km       float64
distance_ecole_km            float64
distance_metro_km            float64
distance_TER_km              float64
distance_POI_min_km          float64
proche_POI_1km                 int64
nb_POIs_<1km                   int64
POI_dominant                  object
dtype: object

In [6]:
# Split the frame into the regression target (y) and the feature matrix (X).
target = 'prix_m2'

# Columns kept out of X: the target itself plus columns judged not useful as
# model inputs.
excluded_columns = [
    target,
    'valeur_fonciere',
    'nb_lots_surface',
    'a_plusieurs_lots',
    'POI_dominant',
]

X = df.drop(columns=excluded_columns)
y = df[target]

In [7]:
# Declare which columns are standardized (numeric) and which are one-hot
# encoded (categorical). The order of numeric_features fixes the order of the
# scaled columns downstream.
numeric_features = [
    # property attributes and location
    'surface_reelle_bati',
    'nombre_pieces_principales',
    'longitude',
    'latitude',
    'nombre_lots',
    # distance columns
    'distance_datashop_km',
    'distance_espace_vert_km',
    'distance_college_km',
    'distance_universite_km',
    'distance_ecole_km',
    'distance_metro_km',
    'distance_TER_km',
    'distance_POI_min_km',
    # POI proximity columns
    'proche_POI_1km',
    'nb_POIs_<1km',
]

# Integer-coded columns that are handled as categories rather than magnitudes.
categorical_features = [
    'annee',
    'arrondissement',
]

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")
print(f"Dataset shape: X={X.shape}, y={y.shape}")

Numeric features: ['surface_reelle_bati', 'nombre_pieces_principales', 'longitude', 'latitude', 'nombre_lots', 'distance_datashop_km', 'distance_espace_vert_km', 'distance_college_km', 'distance_universite_km', 'distance_ecole_km', 'distance_metro_km', 'distance_TER_km', 'distance_POI_min_km', 'proche_POI_1km', 'nb_POIs_<1km']
Categorical features: ['annee', 'arrondissement']
Dataset shape: X=(137917, 17), y=(137917,)


In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Standardize the numeric columns. The scaler is fitted on the training split
# only and then applied unchanged to both splits, so no test-set statistics
# enter the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])

X_train_numeric = scaler.transform(X_train[numeric_features])
X_test_numeric = scaler.transform(X_test[numeric_features])

print(f"Numeric features scaled: {X_train_numeric.shape[1]} features")

Numeric features scaled: 15 features


In [10]:
# One-hot encode the categorical columns. drop='first' removes the first
# category of each feature to avoid multicollinearity. Categories are learned
# from the training split only.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_train[categorical_features])

# Apply the same fitted encoder to both splits and densify the sparse result.
X_train_categorical, X_test_categorical = (
    encoder.transform(split[categorical_features]).toarray()
    for split in (X_train, X_test)
)

print(f"Categorical features encoded: {X_train_categorical.shape[1]} features")


Categorical features encoded: 23 features


In [11]:
# Combine numeric and categorical features into the final design matrices.
# Column order: scaled numeric columns first, then the one-hot columns.
X_train_final = np.hstack([X_train_numeric, X_train_categorical])
X_test_final = np.hstack([X_test_numeric, X_test_categorical])

print("\nFinal preprocessing summary:")
print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_test_final shape: {X_test_final.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Total features: {X_train_final.shape[1]} (numeric: {len(numeric_features)}, categorical: {X_train_categorical.shape[1]})")

# Feature names for reference.
# Ask the fitted encoder for its output column names instead of rebuilding
# them by hand: the encoder already accounts for whichever category it
# dropped, so the names stay aligned with the encoded columns even if the
# `drop` setting is changed later. Names have the form "<feature>_<category>".
categorical_feature_names = list(encoder.get_feature_names_out(categorical_features))
all_feature_names = numeric_features + categorical_feature_names

# Every matrix column must have exactly one name; a mismatch would silently
# mislabel model coefficients downstream.
assert len(all_feature_names) == X_train_final.shape[1], (
    f"{len(all_feature_names)} feature names for {X_train_final.shape[1]} columns"
)

print(f"\nAll feature names ({len(all_feature_names)}):")
for i, name in enumerate(all_feature_names):
    print(f"  {i}: {name}")


Final preprocessing summary:
X_train_final shape: (110333, 38)
X_test_final shape: (27584, 38)
y_train shape: (110333,)
y_test shape: (27584,)
Total features: 38 (numeric: 15, categorical: 23)

All feature names (38):
  0: surface_reelle_bati
  1: nombre_pieces_principales
  2: longitude
  3: latitude
  4: nombre_lots
  5: distance_datashop_km
  6: distance_espace_vert_km
  7: distance_college_km
  8: distance_universite_km
  9: distance_ecole_km
  10: distance_metro_km
  11: distance_TER_km
  12: distance_POI_min_km
  13: proche_POI_1km
  14: nb_POIs_<1km
  15: annee_2021
  16: annee_2022
  17: annee_2023
  18: annee_2024
  19: arrondissement_2
  20: arrondissement_3
  21: arrondissement_4
  22: arrondissement_5
  23: arrondissement_6
  24: arrondissement_7
  25: arrondissement_8
  26: arrondissement_9
  27: arrondissement_10
  28: arrondissement_11
  29: arrondissement_12
  30: arrondissement_13
  31: arrondissement_14
  32: arrondissement_15
  33: arrondissement_16
  34: arrondisse

In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Fit an ordinary least-squares baseline on the preprocessed training matrix.
lr = LinearRegression()
lr.fit(X_train_final, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred = lr.predict(X_train_final)
y_test_pred = lr.predict(X_test_final)

# Evaluate every metric first; the report below is then purely formatting.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("🎯 MODEL RESULTS:")
print(f"Train R² score: {train_r2:.3f}")
print(f"Test R² score: {test_r2:.3f}")
print(f"Train MAE: {train_mae:,.0f}€/m²")
print(f"Test MAE: {test_mae:,.0f}€/m²")
print(f"Test RMSE: {test_rmse:,.0f}€/m²")
print()

# Pair each design-matrix column name with its fitted coefficient and rank by
# absolute magnitude. Numeric columns were standardized while one-hot columns
# are 0/1 indicators, so the magnitudes are not on a common scale.
coefficient_table = pd.DataFrame({
    'feature': numeric_features + categorical_feature_names,
    'coefficient': lr.coef_,
})
feature_coefficients = coefficient_table.sort_values(
    'coefficient', key=abs, ascending=False
)

print("Top 10 most influential features:")
print(feature_coefficients.head(10))
print()

🎯 MODEL RESULTS:
Train R² score: 0.256
Test R² score: 0.252
Train MAE: 1,723€/m²
Test MAE: 1,731€/m²
Test RMSE: 2,370€/m²

Top 10 most influential features:
              feature  coefficient
30  arrondissement_13 -2295.388734
31  arrondissement_14 -1805.241772
35  arrondissement_18 -1671.282112
24   arrondissement_7  1619.715098
32  arrondissement_15 -1432.743842
23   arrondissement_6  1302.705775
21   arrondissement_4  1169.229333
36  arrondissement_19 -1146.796123
18         annee_2024 -1046.616799
27  arrondissement_10 -1028.508956

