# Linear regression model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler


df = pd.read_csv("./src/boston.csv")

In [2]:
df_dropped = df.drop(columns=["price"])

In [3]:
# Séparer les variables indépendantes (X) de la variable cible (y)
X = df_dropped
y = df["price"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()

In [4]:
model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)

In [6]:
score = model.score(X_test, y_test)
print(f"score: {score}")

score: 0.6687594935356245


# Other model

In [7]:
# Séparer les variables indépendantes (X) de la variable cible (y)
X = df_dropped
y = df["price"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestRegressor()

In [8]:
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
score = model.score(X_test, y_test)
print(f"score: {score}")

score: 0.8752060852725188


# Optuna package with model

In [11]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error

# Fonction de recherche des meilleurs hyperparamètres avec Optuna
def objective(trial):
    # Séparer les variables indépendantes (X) de la variable cible (y)
    X = df_dropped
    y = df["price"]

    # Mise à l'échelle des caractéristiques
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)

    # Diviser les données en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Définir les hyperparamètres à optimiser
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    # Créer le modèle Random Forest avec les hyperparamètres suggérés par Optuna
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # Entraîner le modèle sur les données d'entraînement
    model.fit(X_train, y_train)

    # Faire des prédictions sur les données de test
    y_pred = model.predict(X_test)

    # Calculer l'erreur quadratique moyenne (RMSE) comme métrique
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse

# Lancer l'optimisation des hyperparamètres avec Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Afficher les meilleurs hyperparamètres et la meilleure valeur métrique obtenus
print("Best RMSE:", study.best_value)
print("Best Hyperparameters:", study.best_params)

  from .autonotebook import tqdm as notebook_tqdm
[I 2023-06-12 20:39:18,667] A new study created in memory with name: no-name-788a985e-757a-4133-8113-910488ba44c9
[I 2023-06-12 20:39:18,804] Trial 0 finished with value: 2.9216272038304463 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 0 with value: 2.9216272038304463.
[I 2023-06-12 20:39:19,408] Trial 1 finished with value: 3.423952818380058 and parameters: {'n_estimators': 500, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.9216272038304463.
[I 2023-06-12 20:39:20,133] Trial 2 finished with value: 3.2507622744970393 and parameters: {'n_estimators': 600, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.9216272038304463.
[I 2023-06-12 20:39:21,123] Trial 3 finished with value: 3.348537519451094 and parameters: {'n_estimators': 1000, 'max_depth': 5, 'min_samples_split': 8, 'min

Best RMSE: 2.919570860256081
Best Hyperparameters: {'n_estimators': 700, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler

# Séparer les variables indépendantes (X) de la variable cible (y)
X = df_dropped
y = df["price"]

# Mise à l'échelle des caractéristiques
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Créer le modèle Random Forest avec les meilleurs hyperparamètres
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=1,
    random_state=42
)

# Entraîner le modèle sur les données d'entraînement
model.fit(X_train, y_train)

# Obtenir le score du modèle sur les données de test
score = model.score(X_test, y_test)
print("Score:", score)


Score: 0.8934501570926114


# Optuna Gradient model

In [13]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler

# Fonction de recherche des meilleurs hyperparamètres avec Optuna
def objective(trial):
    # Séparer les variables indépendantes (X) de la variable cible (y)
    X = df_dropped
    y = df["price"]

    # Mise à l'échelle des caractéristiques
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)

    # Diviser les données en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Définir les hyperparamètres à optimiser
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    # Créer le modèle Gradient Boosting avec les hyperparamètres suggérés par Optuna
    model = GradientBoostingRegressor(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # Entraîner le modèle sur les données d'entraînement
    model.fit(X_train, y_train)

    # Obtenir le score du modèle sur les données de test
    score = model.score(X_test, y_test)

    return score

# Lancer l'optimisation des hyperparamètres avec Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Afficher le score du meilleur modèle
print("Best Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)

[I 2023-06-12 20:40:31,491] A new study created in memory with name: no-name-67a29a1f-f39b-441e-bf6c-e86fef4f7a13
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
[I 2023-06-12 20:40:32,933] Trial 0 finished with value: 0.9272555317021665 and parameters: {'learning_rate': 0.01571521537103064, 'n_estimators': 900, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9272555317021665.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
[I 2023-06-12 20:40:34,747] Trial 1 finished with value: 0.9041893554729827 and parameters: {'learning_rate': 0.036061969900027344, 'n_estimators': 1000, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9272555317021665.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
[I 2023-06-12 20:40:34,848] Trial 2 finished with value: 0.9269594199507447 and parameters: {'learning_rate': 0.04869840663939079, 'n_estimat

Best Score: 0.9354126937769047
Best Hyperparameters: {'learning_rate': 0.08830964573474222, 'n_estimators': 300, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 1}


Best Score: 0.9421339900053678
Best Hyperparameters: {'learning_rate': 0.019150030721405857, 'n_estimators': 800, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 1}

In [14]:
from sklearn.ensemble import GradientBoostingRegressor

# Séparer les variables indépendantes (X) de la variable cible (y)
X = df_dropped
y = df["price"]

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Créer le modèle de régression par gradient boosting avec les meilleurs hyperparamètres
model = GradientBoostingRegressor(
    learning_rate=0.019150030721405857,
    n_estimators=800,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1
)

# Entraîner le modèle
model.fit(X_train, y_train)

# Évaluer le modèle sur les données de test
score = model.score(X_test, y_test)

# Afficher le score du modèle
print("Model Score:", score)


Model Score: 0.94097871994676
