In [1]:
import pandas as pd
from deap import base, creator, tools, algorithms
import logging
import random
import numpy as np
import sys
from typing import List, Tuple
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

# pacotes personalizados
import util
from class_manipulate_data import ManipulateData
from class_control_panel import ControlPanel

import warnings

warnings.filterwarnings("ignore")

In [2]:
# region: parâmetros necessários para uso do logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setFormatter(console_format)
console_handler.setLevel(logging.INFO)
logger.addHandler(console_handler)
# endregion

logger.info(util.init())

__main__ - INFO - Módulo util importado com sucesso.


In [3]:
manipulate_data = ManipulateData()
path_preprocessing_output = manipulate_data.get_path_preprocessing_output()

# tamanho da região de interesse (RUL abaixo de LENGHT_ROI)
LENGHT_ROI = 125

control_panel = ControlPanel(rolling_mean=False,
                                 window_mean=None,
                                 use_validation_data=False,
                                 number_units_validation=None,
                                 use_optuna=True,
                                 use_savgol_filter=False,
                                 use_roi=True)

control_panel.set_roi(LENGHT_ROI)

# todas as entradas
input_model = ['time',
    'setting_1', 'setting_2', 'setting_3',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
    'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11',
    'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16',
    'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21']

output_model = ['RUL']

equipment_name = 'FD001'

In [4]:
logger.info("Lendo os dados de treino.")

path_dataset_train = \
    str(path_preprocessing_output.joinpath(f"train_{equipment_name}.parquet"))

df_train = pd.read_parquet(path_dataset_train)
df_train = control_panel.apply_roi(df_train, output_model[0], LENGHT_ROI)

logger.info("Lendo os dados de teste.")

path_dataset_test = \
    str(path_preprocessing_output.joinpath(f"test_{equipment_name}.parquet"))

df_test = pd.read_parquet(path_dataset_test)
df_test = control_panel.apply_roi(df_test, output_model[0], LENGHT_ROI)

__main__ - INFO - Lendo os dados de treino.
__main__ - INFO - Lendo os dados de teste.


In [5]:
df_train.head()

Unnamed: 0,unit_number,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1.0,1.0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419,125.0
1,1.0,2.0,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392.0,2388.0,100.0,39.0,23.4236,125.0
2,1.0,3.0,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390.0,2388.0,100.0,38.95,23.3442,125.0
3,1.0,4.0,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392.0,2388.0,100.0,38.88,23.3739,125.0
4,1.0,5.0,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393.0,2388.0,100.0,38.9,23.4044,125.0


In [6]:
df_test.head()

Unnamed: 0,unit_number,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1.0,1.0,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,2388.03,8125.55,8.4052,0.03,392.0,2388.0,100.0,38.86,23.3735,125.0
1,1.0,2.0,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,2388.06,8139.62,8.3803,0.03,393.0,2388.0,100.0,39.02,23.3916,125.0
2,1.0,3.0,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,2388.03,8130.1,8.4441,0.03,393.0,2388.0,100.0,39.08,23.4166,125.0
3,1.0,4.0,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,2388.05,8132.9,8.3917,0.03,391.0,2388.0,100.0,39.0,23.3737,125.0
4,1.0,5.0,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,2388.03,8129.54,8.4031,0.03,390.0,2388.0,100.0,38.99,23.413,125.0


In [7]:
y_train = df_train[output_model]
X_train = df_train[input_model]

y_test = df_test[output_model]
X_test = df_test[input_model]

model = LGBMRegressor(max_depth=4, n_estimators=36, boosting_type="gbdt")

pipeline = Pipeline([('std', StandardScaler()), ('regressor', model)])
pipeline = TransformedTargetRegressor(regressor=pipeline,
                                      transformer=StandardScaler())
model = pipeline

# Algoritmo genético

In [8]:
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))

creator.create("Individual", list, fitness=creator.FitnessMin)

ind_size = len(input_model)

In [9]:
toolbox = base.Toolbox()

toolbox.register("attrib_bin", random.randint, 0, 1)

toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attrib_bin, n=ind_size)

toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [10]:
def root_mean_squared_error(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Faz o cálculo do RMSE.

    Parameters
    ----------
    y_true : pd.Series
        Valor real.
    y_pred : pd.Series
        Valor predito pelo modelo.

    Returns
    -------
    float
        Valor do RMSE.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

def evaluate(individual,
             model,
             input_model: List[str],
             X_train: pd.DataFrame,
             y_train: pd.DataFrame,
             X_test: pd.DataFrame,
             y_test: pd.DataFrame,
             control_panel: ControlPanel) -> Tuple[float, None]:
    sum_features = np.sum(individual)
    if sum_features == 0:
        return sys.float_info.max
    else:
        input_model_select = []
        for k in range(0, len(individual)):
            if individual[k] == 1:
                input_model_select.append(input_model[k])
        X_train = X_train[input_model_select]
        model.fit(X_train, y_train)

        X_test_select = X_test[input_model_select]
        y_test_pred = model.predict(X_test_select)

        if control_panel.use_roi:
            X_test_select['REAL'] = y_test.values
            X_test_select['PREDITO'] = y_test_pred
            X_test_select = X_test_select[~(X_test_select['REAL'] == control_panel.LENGHT_ROI)]
            y_test_select = X_test_select['REAL']
            y_test_pred = X_test_select['PREDITO'].values
            X_test_select = X_test_select.drop(columns=['REAL', 'PREDITO'])

        rmse = root_mean_squared_error(y_test_select, y_test_pred)
        return rmse,

In [11]:
# Two points crossover
toolbox.register("mate", tools.cxTwoPoint)

# Bit flip mutation The indpb argument is the probability of each attribute to be flipped
toolbox.register("mutate", tools.mutFlipBit, indpb=0.3)

# Select the best individual among tournsize randomly chosen individuals
toolbox.register("select", tools.selTournament, tournsize=3)

# Register the fitness function defined above in the toolbox
toolbox.register("evaluate",
                 evaluate,
                 model=model,
                 input_model=input_model,
                 X_train=X_train,
                 y_train=y_train,
                 X_test=X_test,
                 y_test=y_test,
                 control_panel=control_panel)

# Define the statistics to be shown during the algorithm run.
# We have selected minimum, maximum and average accuracy for each generation of run
# Decision will, however, be taken based on maximum accuracy as defined earlier
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("Mean", np.mean)
stats.register("Max", np.max)
stats.register("Min", np.min)

In [12]:
#Define initial population with its size. Here we are defining low population size (only 20) for demonstration
pop = toolbox.population(n=100)
# Below HallOfFame argument saves k best individual across all the generations
# We will use this to determine our final variable combination. 
# We are taking only the first best one. It is possible to save k number of best solution in hof
hof = tools.HallOfFame(1)
# Now, let's run the algorithm with the most simple in-built method available in DEAP (eaSimple).
# The arguments are as below:
# pop: Population defined earlier
# toolbox: toolbox containing all the operator defined
# cxpb: The probability of mating two individuals.
# mutpb: The probability of mutating an individual. We are keeping it high to show the impact
# ngen: The number of generation.
pop, log = algorithms.eaSimple(
    pop, toolbox, cxpb=0.5, mutpb=0.8, ngen=30, halloffame=hof, stats=stats, verbose=True
)
# Get the best individual
best = hof.items[0]
#Print the best individual and it's accuracy
print("Best Individual = ", best)
print("Best Fitness = ", best.fitness.values[0])

gen	nevals	Mean  	Max    	Min    
0  	100   	21.682	25.6984	19.1005
1  	90    	20.7166	25.5567	19.1005
2  	88    	20.6863	25.6433	19.1005
3  	84    	20.5454	25.1219	19.1053
4  	89    	20.4956	24.7643	19.1053
5  	88    	20.215 	25.6332	19.0464
6  	92    	20.4645	25.0769	19.0522
7  	84    	20.3722	25.1956	19.1049
8  	94    	20.5604	26.1477	19.1008
9  	93    	20.3798	25.2925	19.0403
10 	95    	20.8865	26.7415	19.0464
11 	90    	20.1401	24.955 	19.0425
12 	92    	20.5168	25.7658	19.098 
13 	88    	20.6191	25.5856	19.0931
14 	93    	20.4868	25.7484	19.0938
15 	80    	20.1866	25.4188	19.0607
16 	92    	20.5474	25.7647	19.0982
17 	88    	20.6375	25.6335	19.069 
18 	89    	20.4019	25.1712	19.098 
19 	89    	20.5092	25.4361	19.1166
20 	93    	20.428 	25.3827	19.1221
21 	91    	20.6655	25.8735	19.0797
22 	90    	20.4989	25.5015	19.1422
23 	90    	20.6002	25.4367	19.1099
24 	90    	20.319 	25.2611	19.0644
25 	88    	20.3966	25.9009	19.0644
26 	86    	20.2133	25.2808	19.0879
27 	89    	20.2401	25.

In [13]:
input_model_select = []
for i in range(0, len(best)):
    if best[i] == 1:
        input_model_select.append(input_model[i])
print(input_model_select)

['time', 'setting_1', 'setting_2', 'sensor_1', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_21']


In [14]:
len(input_model_select)

18

In [15]:
X_train[input_model_select]

Unnamed: 0,time,setting_1,setting_2,sensor_1,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_21
0,1.0,-0.0007,-0.0004,518.67,1589.70,1400.60,14.62,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392.0,23.4190
1,2.0,0.0019,-0.0003,518.67,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392.0,23.4236
2,3.0,-0.0043,0.0003,518.67,1587.99,1404.20,14.62,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390.0,23.3442
3,4.0,0.0007,0.0000,518.67,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392.0,23.3739
4,5.0,-0.0019,-0.0002,518.67,1582.85,1406.22,14.62,21.61,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393.0,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,196.0,-0.0004,-0.0003,518.67,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397.0,22.9735
20627,197.0,-0.0016,-0.0005,518.67,1604.50,1433.58,14.62,21.61,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395.0,23.1594
20628,198.0,0.0004,0.0000,518.67,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398.0,22.9333
20629,199.0,-0.0011,0.0003,518.67,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395.0,23.0640


In [16]:
X_train_select = X_train[input_model_select]
model.fit(X_train_select, y_train)
y_train_pred = model.predict(X_train_select)

if control_panel.use_roi:
    X_train_select['REAL'] = y_train.values
    X_train_select['PREDITO'] = y_train_pred
    X_train_select = X_train_select[~(X_train_select['REAL'] == control_panel.LENGHT_ROI)]
    y_train_select = X_train_select['REAL']
    y_train_pred = X_train_select['PREDITO'].values
    X_train_select = X_train_select.drop(columns=['REAL', 'PREDITO'])

rmse = root_mean_squared_error(y_train_select, y_train_pred)
rmse

17.346961214085574

In [17]:
X_test_select = X_test[input_model_select]
y_test_pred = model.predict(X_test_select)

if control_panel.use_roi:
    X_test_select['REAL'] = y_test.values
    X_test_select['PREDITO'] = y_test_pred
    X_test_select = X_test_select[~(X_test_select['REAL'] == control_panel.LENGHT_ROI)]
    y_test_select = X_test_select['REAL']
    y_test_pred = X_test_select['PREDITO'].values
    X_test_select = X_test_select.drop(columns=['REAL', 'PREDITO'])

rmse = root_mean_squared_error(y_test_select, y_test_pred)
rmse

19.040274576562826

# Features selecionadas

['time',
'setting_1',
'setting_2',
'sensor_1',
'sensor_3',
'sensor_4',
'sensor_5',
'sensor_6',
'sensor_7',
'sensor_8',
'sensor_9',
'sensor_11',
'sensor_12',
'sensor_13',
'sensor_14',
'sensor_15',
'sensor_17',
'sensor_21']


Train:
17.346961214085574

Teste: 19.040274576562826