In [1]:
import pandas as pd
from deap import base, creator, tools, algorithms
import logging
import random
import numpy as np
import sys
from typing import List, Tuple
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

# pacotes personalizados
import util
from class_manipulate_data import ManipulateData

In [2]:
# region: parâmetros necessários para uso do logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setFormatter(console_format)
console_handler.setLevel(logging.INFO)
logger.addHandler(console_handler)
# endregion

logger.info(util.init())

manipulate_data = ManipulateData()
path_preprocessing_output = manipulate_data.get_path_preprocessing_output()

__main__ - INFO - Módulo util importado com sucesso.


In [3]:
# todas as entradas
input_model = ['time',
    'setting_1', 'setting_2', 'setting_3',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
    'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11',
    'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16',
    'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21']

output_model = ['RUL']

equipment_name = 'FD001'

In [4]:
logger.info("Lendo os dados de treino.")

path_dataset_train = \
    str(path_preprocessing_output.joinpath(f"train_{equipment_name}.parquet"))

df_train = pd.read_parquet(path_dataset_train)

logger.info("Lendo os dados de teste.")

path_dataset_test = \
    str(path_preprocessing_output.joinpath(f"test_{equipment_name}.parquet"))

df_test = pd.read_parquet(path_dataset_test)

__main__ - INFO - Lendo os dados de treino.
__main__ - INFO - Lendo os dados de teste.


In [5]:
df_train.head()

Unnamed: 0,unit_number,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1.0,1.0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392.0,2388.0,100.0,39.06,23.419,191.0
1,1.0,2.0,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392.0,2388.0,100.0,39.0,23.4236,190.0
2,1.0,3.0,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390.0,2388.0,100.0,38.95,23.3442,189.0
3,1.0,4.0,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392.0,2388.0,100.0,38.88,23.3739,188.0
4,1.0,5.0,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393.0,2388.0,100.0,38.9,23.4044,187.0


In [6]:
df_test.head()

Unnamed: 0,unit_number,time,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1.0,1.0,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,2388.03,8125.55,8.4052,0.03,392.0,2388.0,100.0,38.86,23.3735,142.0
1,1.0,2.0,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,2388.06,8139.62,8.3803,0.03,393.0,2388.0,100.0,39.02,23.3916,141.0
2,1.0,3.0,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,2388.03,8130.1,8.4441,0.03,393.0,2388.0,100.0,39.08,23.4166,140.0
3,1.0,4.0,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,2388.05,8132.9,8.3917,0.03,391.0,2388.0,100.0,39.0,23.3737,139.0
4,1.0,5.0,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,2388.03,8129.54,8.4031,0.03,390.0,2388.0,100.0,38.99,23.413,138.0


In [7]:
y_train = df_train[output_model]
X_train = df_train[input_model]

y_test = df_test[output_model]
X_test = df_test[input_model]

model = RidgeCV(alphas=np.logspace(-6, 6, 100))
pipeline = Pipeline([('std', StandardScaler()), ('regressor', model)])
model = TransformedTargetRegressor(regressor=pipeline,
                                   transformer=StandardScaler())

# Algoritmo genético

In [8]:
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))

creator.create("Individual", list, fitness=creator.FitnessMin)

ind_size = len(input_model)

In [9]:
toolbox = base.Toolbox()

toolbox.register("attrib_bin", random.randint, 0, 1)

toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attrib_bin, n=ind_size)

toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [10]:
def evaluate(individual,
             model,
             input_model: List[str],
             X_train: pd.DataFrame,
             y_train: pd.DataFrame,
             X_test: pd.DataFrame,
             y_test: pd.DataFrame) -> Tuple[float, None]:
    sum_features = np.sum(individual)
    if sum_features == 0:
        return sys.float_info.max
    else:
        input_model_select = []
        for k in range(0, len(individual)):
            if individual[k] == 1:
                input_model_select.append(input_model[k])
        X_train = X_train[input_model_select]
        model.fit(X_train, y_train)

        X_test = X_test[input_model_select]
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=True)
        return rmse,

In [11]:
# Two points crossover
toolbox.register("mate", tools.cxTwoPoint)

# Bit flip mutation The indpb argument is the probability of each attribute to be flipped
toolbox.register("mutate", tools.mutFlipBit, indpb=0.3)

# Select the best individual among tournsize randomly chosen individuals
toolbox.register("select", tools.selTournament, tournsize=3)

# Register the fitness function defined above in the toolbox
toolbox.register("evaluate",
                 evaluate,
                 model=model,
                 input_model=input_model,
                 X_train=X_train,
                 y_train=y_train,
                 X_test=X_test,
                 y_test=y_test)

# Define the statistics to be shown during the algorithm run.
# We have selected minimum, maximum and average accuracy for each generation of run
# Decision will, however, be taken based on maximum accuracy as defined earlier
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("Mean", np.mean)
stats.register("Max", np.max)
stats.register("Min", np.min)

In [12]:
#Define initial population with its size. Here we are defining low population size (only 20) for demonstration
pop = toolbox.population(n=100)
# Below HallOfFame argument saves k best individual across all the generations
# We will use this to determine our final variable combination. 
# We are taking only the first best one. It is possible to save k number of best solution in hof
hof = tools.HallOfFame(1)
# Now, let's run the algorithm with the most simple in-built method available in DEAP (eaSimple).
# The arguments are as below:
# pop: Population defined earlier
# toolbox: toolbox containing all the operator defined
# cxpb: The probability of mating two individuals.
# mutpb: The probability of mutating an individual. We are keeping it high to show the impact
# ngen: The number of generation.
pop, log = algorithms.eaSimple(
    pop, toolbox, cxpb=0.5, mutpb=0.8, ngen=30, halloffame=hof, stats=stats, verbose=True
)
# Get the best individual
best = hof.items[0]
#Print the best individual and it's accuracy
print("Best Individual = ", best)
print("Best Fitness = ", best.fitness.values[0])

gen	nevals	Mean   	Max    	Min    
0  	100   	2220.75	2912.74	1841.82
1  	97    	2077.83	3178.36	1843.56
2  	92    	2007.73	2922.59	1841.35
3  	94    	2034.08	2605.83	1841.35
4  	90    	1990.91	2758.05	1840.11
5  	88    	1981.34	2665.54	1840.11
6  	93    	2045.55	2710.92	1842.43
7  	94    	1978.18	2613.47	1841.1 
8  	89    	2006.22	2798.61	1841.02
9  	94    	2002.6 	2604.16	1842.49
10 	81    	1990.1 	2558.65	1842.43
11 	87    	1978.74	2571.65	1840.23
12 	94    	1970.8 	2762.12	1840.47
13 	93    	1997.07	2713.4 	1840.53
14 	93    	2000.75	2704.06	1840.53
15 	94    	2067.38	2790.25	1839.36
16 	88    	2026.06	2747.67	1840.62
17 	84    	2024.47	2594.75	1840.62
18 	94    	2027.83	2837.32	1839.81
19 	87    	1984.57	2747.2 	1839.81
20 	89    	1962.99	2570.63	1839.81
21 	93    	1986.27	2491.26	1839.81
22 	91    	1984.06	2619.72	1840.62
23 	92    	2021.65	2722.44	1842.04
24 	92    	2007.33	3006.26	1839.96
25 	89    	2091.16	2720.29	1840.24
26 	86    	2021.06	2714.19	1840.6 
27 	86    	1985.78	2

In [13]:
input_model_select = []
for i in range(0, len(best)):
    if best[i] == 1:
        input_model_select.append(input_model[i])
print(input_model_select)

['time', 'setting_1', 'setting_3', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_18', 'sensor_20', 'sensor_21']


In [14]:
X_train = X_train[input_model_select]
model.fit(X_train, y_train)
X_test = X_test[input_model_select]
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=True)