# Imports

In [1]:
from regression_model_comparison import RegressionModelComparison
import numpy as np
import pandas as pd
import mlflow

# Data

In [4]:
# Feature table: semicolon-delimited CSV, column names on the first row,
# '.' as the decimal mark.
dataX = pd.read_csv(
    'engie_X.csv',
    sep=';',
    decimal='.',
    header=0,
)
# Call dataX.info() here to inspect dtypes and non-null counts if needed.

In [5]:
# Target table: semicolon-delimited CSV, column names on the first row,
# '.' as the decimal mark.
dataY = pd.read_csv(
    'engie_Y.csv',
    sep=';',
    decimal='.',
    header=0,
)
# Print a summary of columns, dtypes and non-null counts.
dataY.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617386 entries, 0 to 617385
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      617386 non-null  int64  
 1   TARGET  617386 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 9.4 MB


In [6]:
# Join features and targets on ID; the inner join keeps only IDs present in both tables.
data_raw = dataX.merge(dataY, how='inner', on='ID')
print("SHAPE = ", data_raw.shape)
# Preview the first three merged rows.
data_raw.head(3)

SHAPE =  (617386, 79)


Unnamed: 0,ID,MAC_CODE,Date_time,Pitch_angle,Pitch_angle_min,Pitch_angle_max,Pitch_angle_std,Hub_temperature,Hub_temperature_min,Hub_temperature_max,...,Rotor_speed_min,Rotor_speed_max,Rotor_speed_std,Rotor_bearing_temperature,Rotor_bearing_temperature_min,Rotor_bearing_temperature_max,Rotor_bearing_temperature_std,Absolute_wind_direction_c,Nacelle_angle_c,TARGET
0,1,WT3,1.0,92.470001,92.470001,92.470001,0.0,7.0,7.0,7.0,...,0.0,0.0,0.0,2.4,2.4,2.4,0.0,294.19,294.23999,-0.703
1,2,WT3,2.0,92.470001,92.470001,92.470001,0.0,7.0,7.0,7.0,...,0.0,0.0,0.0,2.4,2.4,2.4,0.0,297.82999,294.23999,-0.747
2,3,WT3,3.0,92.470001,92.470001,92.470001,0.0,7.0,7.0,7.0,...,0.0,0.0,0.0,2.4,2.4,2.4,0.0,322.20999,294.23999,-0.791


In [7]:
# Work on the first `sample_size` rows only, to keep prototyping runs fast.
sample_size = 100
# Drop the identifiers AND the label: leaving TARGET in X would leak the
# value being predicted into the feature matrix.
# NOTE(review): .values discards the column names, while the preprocessing
# step further down selects columns by name — confirm whether
# RegressionModelComparison expects a DataFrame rather than an array.
X = data_raw.drop(columns=['ID', 'MAC_CODE', 'TARGET']).values[:sample_size]
Y = data_raw['TARGET'].values[:sample_size]

## Séparation des éoliennes

In [13]:
# Number of rows recorded for each turbine (one group per MAC_CODE value).
(
    data_raw
    .groupby('MAC_CODE')
    .agg(nrows=('MAC_CODE', 'count'))
)

Unnamed: 0_level_0,nrows
MAC_CODE,Unnamed: 1_level_1
WT1,154707
WT2,154791
WT3,154253
WT4,153635


# Modélisation

In [8]:
# Set up the model comparison on the sampled arrays, scored with MAE and MSE.
# test_size and seed drive the dataset split reported in the cell output.
comparison = RegressionModelComparison(
    X, Y,
    scorings=['mae', 'mse'],
    test_size=0.1,
    seed=3,
)

##### Splitting Dataset with test_size = 0.1 and random_state = 3


In [9]:
# Numerical feature names passed to the preprocessing step.
# Most sensors expose four columns: the base reading plus its _min, _max and
# _std variants. 'Date_time' and 'Absolute_wind_direction' are single columns.
# The resulting order is identical to the explicit list this replaces.
numerical_features = (
    ['Date_time']
    + [
        f'{sensor}{suffix}'
        for sensor in (
            'Pitch_angle',
            'Hub_temperature',
            'Generator_converter_speed',
            'Generator_speed',
            'Generator_bearing_1_temperature',
            'Generator_bearing_2_temperature',
            'Generator_stator_temperature',
            'Gearbox_bearing_1_temperature',
            'Gearbox_bearing_2_temperature',
            'Gearbox_inlet_temperature',
            'Gearbox_oil_sump_temperature',
            'Nacelle_angle',
            'Nacelle_temperature',
        )
        for suffix in ('', '_min', '_max', '_std')
    ]
    + ['Absolute_wind_direction']
    + [
        f'{sensor}{suffix}'
        for sensor in (
            'Outdoor_temperature',
            'Grid_frequency',
            'Grid_voltage',
            'Rotor_speed',
            'Rotor_bearing_temperature',
        )
        for suffix in ('', '_min', '_max', '_std')
    ]
)

# Remaining feature columns, handed to the preprocessing step as a separate group.
other_features = ['Absolute_wind_direction_c', 'Nacelle_angle_c']

In [10]:
# Prepare the preprocessors from the two feature-name groups declared above.
comparison.preprocessing(
    numerical_features, other_features,
    nknots=4, poly_order=2,
    scaler='standard',
)

##### Preprocessors prepared : 
## 'base' :  ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['Date_time', 'Pitch_angle', 'Pitch_angle_min',
                                  'Pitch_angle_max', 'Pitch_angle_std',
                                  'Hub_temperature', 'Hub_temperature_min',
                                  'Hub_temperature_max', 'Hub_temperature_std',
                                  'Generator_converter_speed',
                                  'Generator_converter_speed_min',
                                  'Generator_c...
                                  'Generator_bearing_1_temperature_std',
                                  'Generator_bearing_2_temperature',
                                  'Generator_bearing_2_temperature_min',
                                  'Generator_bearin

In [11]:
# Compare all candidate models on the 'base' preprocessor with 5 folds.
# Presumably an empty dict means "no hyper-parameter grid for this model" —
# verify against RegressionModelComparison.
# NOTE(review): the saved run of this cell ended with
# "TypeError: 'NoneType' object is not subscriptable"; the cause is inside
# RegressionModelComparison and is not visible from this notebook.
comparison.run_comparison(
    preproc=['base'],
    model_param={
        'linear_regression': {},
        'ridge': {},
        'lasso': {},
        'elasticnet': {},
        'randomforest': {
            # Number of trees in the forest (default: 100).
            'classifier__n_estimators': [100, 200, 300, 1000],
            # Maximum depth of the trees. If None, trees are grown until all
            # leaves are pure or every leaf holds fewer than
            # min_samples_split samples.
            'classifier__max_depth': [None, 10, 20, 30],
            # Maximum number of features considered for each split
            # (division of a node into two child nodes).
            'classifier__max_features': [3, 'sqrt'],
        },
        'grd_boosting': {
            # The grid previously started at 0.0: with a zero learning rate
            # no boosting stage contributes anything, so those fits only
            # produced a constant predictor. Start at a small positive rate.
            'classifier__learning_rate': [.001, .01, .1, 1],
            'classifier__max_depth': [3, 7, 9],
            'classifier__subsample': [0.5, 0.7, 1],
            'classifier__n_estimators': [100, 200, 300, 1000],
        },
    },
    nfolds=5,
    verbose=False,
)

###### Start comparison ######
Using preprocessor : ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['Date_time', 'Pitch_angle', 'Pitch_angle_min',
                                  'Pitch_angle_max', 'Pitch_angle_std',
                                  'Hub_temperature', 'Hub_temperature_min',
                                  'Hub_temperature_max', 'Hub_temperature_std',
                                  'Generator_converter_speed',
                                  'Generator_converter_speed_min',
                                  'Generator_c...
                                  'Generator_bearing_1_temperature_std',
                                  'Generator_bearing_2_temperature',
                                  'Generator_bearing_2_temperature_min',
                                  'Generator

TypeError: 'NoneType' object is not subscriptable