In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
import joblib
import pickle
import mlflow
import mlflow.sklearn

In [None]:
df=pd.read_csv('data/data.csv')
df=df.dropna(subset=['SiteEnergyUse(kBtu)'])

In [None]:
def replace_building_type(df):
    df['BuildingType'] = df['BuildingType'].replace({'Nonresidential COS': 'Nonresidential', 'Nonresidential WA': 'Nonresidential'})
    return df


In [None]:
supprimer= [
    'Address',
    'City',
    'Comments',
    'ComplianceStatus',
    'CouncilDistrictCode',
    'DataYear',
    'DefaultData',
    'Electricity(kBtu)',
    'Electricity(kWh)',
    'ENERGYSTARScore',
    'GHGEmissionsIntensity',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'Latitude',
    'ListOfAllPropertyUseTypes',
    'Longitude',
    'NaturalGas(kBtu)',
    'NaturalGas(therms)',
    'Neighborhood',
    'OSEBuildingID',
    'Outlier',
    'PropertyGFAParking',
    'PropertyGFATotal',
    'PropertyName',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'SiteEnergyUseWN(kBtu)',
    'SiteEUI(kBtu/sf)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'State',
    'SteamUse(kBtu)',
    'TaxParcelIdentificationNumber',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
    'TotalGHGEmissions',
    'YearsENERGYSTARCertified',
    'ZipCode'
]

thresholds = [1.0e6, 2.5e6, 5.0e6, 1.0e7]

In [None]:

categorical_features=['BuildingType', 'PrimaryPropertyType']
numeric_features=['YearBuilt', 'NumberofBuildings','NumberofFloors', 'PropertyGFABuilding(s)']

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=1)),
    ('scaler', RobustScaler())
])
categorical_transformer = Pipeline(steps=[
    ('replace_building_type', FunctionTransformer(replace_building_type, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' 
)

# Création du pipeline final
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [None]:
pipeline

In [8]:
# Définir la target et les features après preprocessing
y = df['SiteEnergyUse(kBtu)'].astype(float)
X = df.drop(columns=['SiteEnergyUse(kBtu)'])


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Entraîner le modèle et prédire
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Calculer le coefficient de détermination R^2
r2 = r2_score(y_test, y_pred)
mae=mean_absolute_error(y_test, y_pred)

print(f"Coefficient de détermination R^2 : {r2:.4f}")
print(f"MAE : {mae:.4f}")
model_path = 'trained_model.pkl'

# Exportation du modèle avec pickle
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

print(f"Modèle exporté avec succès dans : {model_path}")

Coefficient de détermination R^2 : 0.8274
MAE : 3188614.2721
Modèle exporté avec succès dans : trained_model.pkl


In [None]:
# Définir la target et les features après preprocessing
y = df['SiteEnergyUse(kBtu)'].astype(float)
X = df.drop(columns=['SiteEnergyUse(kBtu)'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Démarrer un run dans MLflow
with mlflow.start_run():

    # Entraîner le modèle et prédire
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calculer le coefficient de détermination R^2
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Log des paramètres et des métriques
    mlflow.log_param("model_type", "XGBoost")  # Exemple, vous pouvez ajouter plus de paramètres
    mlflow.log_metric("R2_score", r2)
    mlflow.log_metric("MAE", mae)

    # Enregistrer le modèle dans MLflow
    mlflow.sklearn.log_model(pipeline, "model")

    # Exportation du modèle avec pickle
    model_path = 'trained_model.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(pipeline, f)

    # Log de l'artefact (modèle exporté)
    mlflow.log_artifact(model_path)

    print(f"Modèle exporté avec succès dans : {model_path}")
    print(f"R^2 : {r2:.4f}")
    print(f"MAE : {mae:.4f}")


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

# Démarrer MLflow Tracking
mlflow.set_experiment("Regression_Models")

# Models to test
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet()
}

# Parameter grids for hyperparameter tuning
param_grids = {
    'Ridge': {'regressor__alpha': [0.1, 1.0, 10.0]},
    'Lasso': {'regressor__alpha': [0.1, 1.0, 10.0]},
    'ElasticNet': {
        'regressor__alpha': [0.1, 1.0, 10.0],
        'regressor__l1_ratio': [0.25, 0.5, 0.75]
    }
}

# Create an empty dictionary to store results
results = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        
        # Mise à jour du modèle dans le pipeline
        pipeline.steps[-1] = ('regressor', model)
        
        # Grid search pour optimiser les hyperparamètres
        grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='r2')
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        test_score = grid_search.score(X_test, y_test)
        
        # Stocker les résultats
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'test_score': test_score
        }
        
        # Enregistrement des paramètres et métriques dans MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("best_r2", grid_search.best_score_)
        mlflow.log_metric("test_r2", test_score)
        
        # Enregistrement du modèle entraîné
        mlflow.sklearn.log_model(best_model, f"{model_name}_model")

# Affichage des résultats
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Best cross-validation score: {result['best_score']:.4f}")
    print(f"Test score: {result['test_score']:.4f}")
    print(f"Best parameters: {result['best_params']}")
    print()

print("Les résultats sont enregistrés dans MLflow. Lancez 'mlflow ui' pour les visualiser.")


In [None]:
!mlflow ui

In [None]:
!mlflow ui

In [12]:
# Utiliser un modèle de forêt aléatoire pour la sélection des caractéristiques
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Utiliser SelectFromModel pour sélectionner les caractéristiques importantes
selector = SelectFromModel(model, prefit=True)
X_selected = selector.transform(X)

# Vérifier les caractéristiques sélectionnées
selected_features = poly_features.columns[selector.get_support()]
print(selected_features)

ValueError: could not convert string to float: 'NonResidential'