# Linear regression model

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler


df = pd.read_csv("./src/boston.csv")

In [4]:
df_dropped = df.drop(columns=["price", "zone", "river", "rad"])

X = df_dropped
y = df["price"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()

In [5]:
model.fit(X_train, y_train)

In [6]:
y_pred = model.predict(X_test)

In [7]:
score = model.score(X_test, y_test)
print(f"score: {score}")

score: 0.6687594935356245


# Other model

In [8]:
# Séparer les variables indépendantes (X) de la variable cible (y)
X = df_dropped
y = df["price"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestRegressor()

In [9]:
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)

In [11]:
score = model.score(X_test, y_test)
print(f"score: {score}")

score: 0.8711776827100306


# Optuna package with model

In [12]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error

# Fonction de recherche des meilleurs hyperparamètres avec Optuna
def objective(trial):
    # Séparer les variables indépendantes (X) de la variable cible (y)
    X = df_dropped
    y = df["price"]

    # Mise à l'échelle des caractéristiques
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)

    # Diviser les données en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Définir les hyperparamètres à optimiser
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    # Créer le modèle Random Forest avec les hyperparamètres suggérés par Optuna
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # Entraîner le modèle sur les données d'entraînement
    model.fit(X_train, y_train)

    # Faire des prédictions sur les données de test
    y_pred = model.predict(X_test)

    # Calculer l'erreur quadratique moyenne (RMSE) comme métrique
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse

# Lancer l'optimisation des hyperparamètres avec Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Afficher les meilleurs hyperparamètres et la meilleure valeur métrique obtenus
print("Best RMSE:", study.best_value)
print("Best Hyperparameters:", study.best_params)

  from .autonotebook import tqdm as notebook_tqdm
[I 2023-06-13 11:58:51,088] A new study created in memory with name: no-name-06f7a52e-08c0-4009-9f21-d8213600944c
[I 2023-06-13 11:58:51,390] Trial 0 finished with value: 3.519919633034565 and parameters: {'n_estimators': 300, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 0 with value: 3.519919633034565.
[I 2023-06-13 11:58:51,978] Trial 1 finished with value: 3.5340302023076284 and parameters: {'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 0 with value: 3.519919633034565.
[I 2023-06-13 11:58:52,370] Trial 2 finished with value: 3.1866422277311415 and parameters: {'n_estimators': 400, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 2 with value: 3.1866422277311415.
[I 2023-06-13 11:58:53,194] Trial 3 finished with value: 3.3169630367202023 and parameters: {'n_estimators': 800, 'max_depth': 6, 'min_samples_split': 10, 'min_

Best RMSE: 2.919570860256081
Best Hyperparameters: {'n_estimators': 700, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score

# Séparer les variables indépendantes (X) de la variable cible (y)
X = df_dropped
y = df["price"]

# Mise à l'échelle des caractéristiques
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Créer le modèle Random Forest avec les meilleurs hyperparamètres
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=42
)

# Entraîner le modèle sur les données d'entraînement
model.fit(X_train, y_train)

# Obtenir le score du modèle sur les données de test
score = model.r2_score(X_test, y_test)
print("Score:", score)


AttributeError: 'RandomForestRegressor' object has no attribute 'r2_score'

# Optuna Gradient model

In [14]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler

# Fonction de recherche des meilleurs hyperparamètres avec Optuna
def objective(trial):
    # Séparer les variables indépendantes (X) de la variable cible (y)
    X = df_dropped
    y = df["price"]

    # Mise à l'échelle des caractéristiques
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)

    # Diviser les données en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Définir les hyperparamètres à optimiser
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    # Créer le modèle Gradient Boosting avec les hyperparamètres suggérés par Optuna
    model = GradientBoostingRegressor(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # Entraîner le modèle sur les données d'entraînement
    model.fit(X_train, y_train)

    # Obtenir le score du modèle sur les données de test
    score = model.score(X_test, y_test)

    return score

# Lancer l'optimisation des hyperparamètres avec Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Afficher le score du meilleur modèle
print("Best Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)

[I 2023-06-13 12:00:07,266] A new study created in memory with name: no-name-c090108d-849e-4268-a730-ad766ad1aca9
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
[I 2023-06-13 12:00:07,939] Trial 0 finished with value: 0.9152780974031602 and parameters: {'learning_rate': 0.07950792023197624, 'n_estimators': 500, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9152780974031602.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
[I 2023-06-13 12:00:08,234] Trial 1 finished with value: 0.9198813223192597 and parameters: {'learning_rate': 0.034282854509427856, 'n_estimators': 300, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.9198813223192597.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
[I 2023-06-13 12:00:09,181] Trial 2 finished with value: 0.9129916676996095 and parameters: {'learning_rate': 0.04503941180594194, 'n_estimator

Best Score: 0.9415525934427901
Best Hyperparameters: {'learning_rate': 0.022869853499942645, 'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 1}


Best Score: 0.9421339900053678
Best Hyperparameters: {'learning_rate': 0.019150030721405857, 'n_estimators': 800, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 1}

In [34]:
from sklearn.ensemble import GradientBoostingRegressor

X = df_dropped
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(
    learning_rate=0.019150030721405857,
    n_estimators=500,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1
)

# Entraîner le modèle
model.fit(X_train, y_train)

# Évaluer le modèle sur les données de test
score = model.score(X_test, y_test)

# Afficher le score du modèle
print("Model Score:", score.round(3))


Model Score: 0.941


In [32]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

X = df_dropped
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(
    learning_rate=0.019150030721405857,
    n_estimators=500,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1
)

# Entraîner le modèle
model.fit(X_train, y_train)

# Faire des prédictions sur les données de test
y_pred = model.predict(X_test)

# Calculer le score R²
score = r2_score(y_test, y_pred)

# Afficher le score du modèle
print("Model R² Score:", score.round(3))


Model R² Score: 0.942


In [23]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Afficher l'importance de chaque caractéristique
for f in range(X.shape[1]):
    print(f"{X.columns[indices[f]]}: {importances[indices[f]]}")

room: 0.5174642347874622
lstat: 0.2750747773782461
district: 0.07523545594965506
crime: 0.04216574093669256
nox: 0.01990420492121404
black_population: 0.019889913500679184
tax: 0.015605903429430789
ptratio: 0.014825369334410895
age: 0.011362448049438305
indus: 0.004274518708121225
rad: 0.0032604277607313223
river: 0.0005898859886744348
zone: 0.00034711925524386804
