In [24]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
import joblib
import pickle
import mlflow
import mlflow.sklearn

In [9]:
df=pd.read_csv('data/data.csv')
df=df.dropna(subset=['SiteEnergyUse(kBtu)'])
df

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,Address,City,State,ZipCode,TaxParcelIdentificationNumber,...,Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,Comments,ComplianceStatus,Outlier,TotalGHGEmissions,GHGEmissionsIntensity
0,1,2016,NonResidential,Hotel,Mayflower park hotel,405 Olive way,Seattle,WA,98101.0,0659000030,...,1.156514e+06,3.946027e+06,12764.529300,1.276453e+06,False,,Compliant,,249.98,2.83
1,2,2016,NonResidential,Hotel,Paramount Hotel,724 Pine street,Seattle,WA,98101.0,0659000220,...,9.504252e+05,3.242851e+06,51450.816410,5.145082e+06,False,,Compliant,,295.86,2.86
2,3,2016,NonResidential,Hotel,5673-The Westin Seattle,1900 5th Avenue,Seattle,WA,98101.0,0659000475,...,1.451544e+07,4.952666e+07,14938.000000,1.493800e+06,False,,Compliant,,2089.28,2.19
3,5,2016,NonResidential,Hotel,HOTEL MAX,620 STEWART ST,Seattle,WA,98101.0,0659000640,...,8.115253e+05,2.768924e+06,18112.130860,1.811213e+06,False,,Compliant,,286.43,4.67
4,8,2016,NonResidential,Hotel,WARWICK SEATTLE HOTEL (ID8),401 LENORA ST,Seattle,WA,98121.0,0659000970,...,1.573449e+06,5.368607e+06,88039.984380,8.803998e+06,False,,Compliant,,505.01,2.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3371,50222,2016,Nonresidential COS,Office,Horticulture building,1600 S Dakota St,Seattle,WA,,1624049080,...,1.536550e+05,5.242709e+05,3254.750244,3.254750e+05,True,,Error - Correct Default Data,,20.94,1.70
3372,50223,2016,Nonresidential COS,Other,International district/Chinatown CC,719 8th Ave S,Seattle,WA,,3558300000,...,1.162210e+05,3.965461e+05,5537.299805,5.537300e+05,False,,Compliant,,32.17,2.01
3373,50224,2016,Nonresidential COS,Other,Queen Anne Pool,1920 1st Ave W,Seattle,WA,,1794501150,...,5.252517e+05,1.792159e+06,39737.390630,3.973739e+06,False,,Compliant,,223.54,16.99
3374,50225,2016,Nonresidential COS,Mixed Use Property,South Park Community Center,8319 8th Ave S,Seattle,WA,,7883603155,...,1.022480e+05,3.488702e+05,3706.010010,3.706010e+05,False,,Compliant,,22.11,1.57


In [13]:
def replace_building_type(df):
    df['BuildingType'] = df['BuildingType'].replace({'Nonresidential COS': 'Nonresidential', 'Nonresidential WA': 'Nonresidential'})
    return df


In [14]:
supprimer= [
    'Address',
    'City',
    'Comments',
    'ComplianceStatus',
    'CouncilDistrictCode',
    'DataYear',
    'DefaultData',
    'Electricity(kBtu)',
    'Electricity(kWh)',
    'ENERGYSTARScore',
    'GHGEmissionsIntensity',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'Latitude',
    'ListOfAllPropertyUseTypes',
    'Longitude',
    'NaturalGas(kBtu)',
    'NaturalGas(therms)',
    'Neighborhood',
    'OSEBuildingID',
    'Outlier',
    'PropertyGFAParking',
    'PropertyGFATotal',
    'PropertyName',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'SiteEnergyUseWN(kBtu)',
    'SiteEUI(kBtu/sf)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'State',
    'SteamUse(kBtu)',
    'TaxParcelIdentificationNumber',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'TotalGHGEmissions',
    'YearsENERGYSTARCertified',
    'ZipCode'
]

#thresholds = [1.0e6, 2.5e6, 5.0e6, 1.0e7]

In [15]:
df = df.drop(supprimer, axis=1, inplace=True)
df

AttributeError: 'NoneType' object has no attribute 'drop'

In [7]:
df

In [5]:

categorical_features=['BuildingType', 'PrimaryPropertyType']
numeric_features=['YearBuilt', 'NumberofBuildings','NumberofFloors', 'PropertyGFABuilding(s)']

In [6]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),
    ('scaler', RobustScaler())
])
categorical_transformer = Pipeline(steps=[
    ('replace_building_type', FunctionTransformer(replace_building_type, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' 
)

# Création du pipeline final
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [7]:
pipeline

In [8]:
# Définir la target et les features après preprocessing
y = df['SiteEnergyUse(kBtu)'].astype(float)
X = df.drop(columns=['SiteEnergyUse(kBtu)'])


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Entraîner le modèle et prédire
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Calculer le coefficient de détermination R^2
r2 = r2_score(y_test, y_pred)
mae=mean_absolute_error(y_test, y_pred)

print(f"Coefficient de détermination R^2 : {r2:.4f}")
print(f"MAE : {mae:.4f}")
model_path = 'trained_model.pkl'

# Exportation du modèle avec pickle
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Modèle exporté avec succès dans : {model_path}")

Coefficient de détermination R^2 : 0.8274
MAE : 3188614.2721
Modèle exporté avec succès dans : trained_model.pkl


In [9]:
# Définir la target et les features après preprocessing
y = df['SiteEnergyUse(kBtu)'].astype(float)
X = df.drop(columns=['SiteEnergyUse(kBtu)'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Démarrer un run dans MLflow
with mlflow.start_run():

    # Entraîner le modèle et prédire
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calculer le coefficient de détermination R^2
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Log des paramètres et des métriques
    mlflow.log_param("model_type", "XGBoost")  # Exemple, vous pouvez ajouter plus de paramètres
    mlflow.log_metric("R2_score", r2)
    mlflow.log_metric("MAE", mae)

    # Enregistrer le modèle dans MLflow
    mlflow.sklearn.log_model(pipeline, "model")

    # Exportation du modèle avec pickle
    model_path = 'trained_model.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(pipeline, f)

    # Log de l'artefact (modèle exporté)
    mlflow.log_artifact(model_path)

    print(f"Modèle exporté avec succès dans : {model_path}")
    print(f"R^2 : {r2:.4f}")
    print(f"MAE : {mae:.4f}")




Modèle exporté avec succès dans : trained_model.pkl
R^2 : 0.8274
MAE : 3188614.2721


In [10]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

# Démarrer MLflow Tracking
mlflow.set_experiment("Regression_Models")

# Models to test
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet()
}

# Parameter grids for hyperparameter tuning
param_grids = {
    'Ridge': {'regressor__alpha': [0.1, 1.0, 10.0]},
    'Lasso': {'regressor__alpha': [0.1, 1.0, 10.0]},
    'ElasticNet': {
        'regressor__alpha': [0.1, 1.0, 10.0],
        'regressor__l1_ratio': [0.25, 0.5, 0.75]
    }
}

# Create an empty dictionary to store results
results = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        
        # Mise à jour du modèle dans le pipeline
        pipeline.steps[-1] = ('regressor', model)
        
        # Grid search pour optimiser les hyperparamètres
        grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='r2')
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        test_score = grid_search.score(X_test, y_test)
        
        # Stocker les résultats
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'test_score': test_score
        }
        
        # Enregistrement des paramètres et métriques dans MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("best_r2", grid_search.best_score_)
        mlflow.log_metric("test_r2", test_score)
        
        # Enregistrement du modèle entraîné
        mlflow.sklearn.log_model(best_model, f"{model_name}_model")

# Affichage des résultats
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best cross-validation score: {result['best_score']:.4f}")
    print(f"Test score: {result['test_score']:.4f}")
    print(f"Best parameters: {result['best_params']}")
    print()

print("Les résultats sont enregistrés dans MLflow. Lancez 'mlflow ui' pour les visualiser.")


2025/03/11 16:01:13 INFO mlflow.tracking.fluent: Experiment with name 'Regression_Models' does not exist. Creating a new experiment.
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Model: Ridge
Best cross-validation score: 0.5677
Test score: 0.8499
Best parameters: {'regressor__alpha': 1.0}

Model: Lasso
Best cross-validation score: 0.5615
Test score: 0.8275
Best parameters: {'regressor__alpha': 10.0}

Model: ElasticNet
Best cross-validation score: 0.4837
Test score: 0.9275
Best parameters: {'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.75}

Les résultats sont enregistrés dans MLflow. Lancez 'mlflow ui' pour les visualiser.


In [8]:
!mlflow ui

[2025-04-09 15:44:34 +0200] [408744] [INFO] Starting gunicorn 22.0.0
[2025-04-09 15:44:34 +0200] [408744] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2025-04-09 15:44:34 +0200] [408744] [ERROR] Retrying in 1 second.
[2025-04-09 15:44:35 +0200] [408744] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2025-04-09 15:44:35 +0200] [408744] [ERROR] Retrying in 1 second.
[2025-04-09 15:44:36 +0200] [408744] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2025-04-09 15:44:36 +0200] [408744] [ERROR] Retrying in 1 second.
[2025-04-09 15:44:37 +0200] [408744] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2025-04-09 15:44:37 +0200] [408744] [ERROR] Retrying in 1 second.
[2025-04-09 15:44:38 +0200] [408744] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2025-04-09 15:44:38 +0200] [408744] [ERROR] Retrying in 1 second.
[2025-04-09 15:44:39 +0200] [408744] [ERROR] Can't connect to ('127.0.0.1', 5000)
Running the mlflow server failed. Please see the logs above for details.


In [12]:
# Utiliser un modèle de forêt aléatoire pour la sélection des caractéristiques
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Utiliser SelectFromModel pour sélectionner les caractéristiques importantes
selector = SelectFromModel(model, prefit=True)
X_selected = selector.transform(X)

# Vérifier les caractéristiques sélectionnées
selected_features = poly_features.columns[selector.get_support()]
print(selected_features)

ValueError: could not convert string to float: 'NonResidential'