In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import pandas as pd

In [16]:
df = pd.read_csv("model_paris_20_24.csv", low_memory=False)

In [17]:
df.head()

Unnamed: 0,valeur_fonciere,surface_reelle_bati,nombre_pieces_principales,longitude,latitude,nombre_lots,nb_lots_surface,a_plusieurs_lots,prix_m2,annee,arrondissement
0,908200.0,78.0,3.0,2.251124,48.843025,2,1,0,11643.589744,2020,16
1,600000.0,70.0,3.0,2.251902,48.83842,4,1,0,8571.428571,2020,16
2,775125.0,77.0,4.0,2.252431,48.837737,1,0,0,10066.558442,2020,16
3,550000.0,67.0,3.0,2.252871,48.837354,2,1,0,8208.955224,2020,16
4,690500.0,74.0,3.0,2.251366,48.839257,2,1,0,9331.081081,2020,16


In [24]:
# Prepare data
target = 'prix_m2'
X = df.drop([target, 'valeur_fonciere','nb_lots_surface'], axis=1)  # Remove both target and valeur_fonciere
y = df[target]


In [25]:
# Separate numeric and categorical features 
numeric_features = ['surface_reelle_bati', 'nombre_pieces_principales', 'longitude', 'latitude', 'nombre_lots']
categorical_features = ['a_plusieurs_lots', 'annee', 'arrondissement']

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")
print(f"Dataset shape: X={X.shape}, y={y.shape}")

Numeric features: ['surface_reelle_bati', 'nombre_pieces_principales', 'longitude', 'latitude', 'nombre_lots']
Categorical features: ['a_plusieurs_lots', 'annee', 'arrondissement']
Dataset shape: X=(137917, 8), y=(137917,)


In [20]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Scale numeric features
scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(X_train[numeric_features])
X_test_numeric = scaler.transform(X_test[numeric_features])

print(f"Numeric features scaled: {X_train_numeric.shape[1]} features")

Numeric features scaled: 6 features


In [22]:
# OneHot encode categorical features
encoder = OneHotEncoder(drop='first')  # drop='first' to avoid multicollinearity
X_train_categorical = encoder.fit_transform(X_train[categorical_features]).toarray()
X_test_categorical = encoder.transform(X_test[categorical_features]).toarray()

print(f"Categorical features encoded: {X_train_categorical.shape[1]} features")


Categorical features encoded: 24 features


In [26]:
# Combine numeric and categorical features
X_train_final = np.hstack([X_train_numeric, X_train_categorical])
X_test_final = np.hstack([X_test_numeric, X_test_categorical])

print(f"\nFinal preprocessing summary:")
print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_test_final shape: {X_test_final.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Total features: {X_train_final.shape[1]} (numeric: {len(numeric_features)}, categorical: {X_train_categorical.shape[1]})")

# Feature names for reference
categorical_feature_names = [f"{cat}_{val}" for cat, vals in zip(categorical_features, encoder.categories_) for val in vals[1:]]
all_feature_names = numeric_features + categorical_feature_names

print(f"\nAll feature names ({len(all_feature_names)}):")
for i, name in enumerate(all_feature_names):
    print(f"  {i}: {name}")


Final preprocessing summary:
X_train_final shape: (110333, 30)
X_test_final shape: (27584, 30)
y_train shape: (110333,)
y_test shape: (27584,)
Total features: 30 (numeric: 5, categorical: 24)

All feature names (29):
  0: surface_reelle_bati
  1: nombre_pieces_principales
  2: longitude
  3: latitude
  4: nombre_lots
  5: a_plusieurs_lots_1
  6: annee_2021
  7: annee_2022
  8: annee_2023
  9: annee_2024
  10: arrondissement_2
  11: arrondissement_3
  12: arrondissement_4
  13: arrondissement_5
  14: arrondissement_6
  15: arrondissement_7
  16: arrondissement_8
  17: arrondissement_9
  18: arrondissement_10
  19: arrondissement_11
  20: arrondissement_12
  21: arrondissement_13
  22: arrondissement_14
  23: arrondissement_15
  24: arrondissement_16
  25: arrondissement_17
  26: arrondissement_18
  27: arrondissement_19
  28: arrondissement_20
