In [1]:
import json

import h2o
import pandas as pd
import numpy as np
from h2o import H2OFrame
from h2o.automl import H2OAutoML
from tpot import TPOTRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import KFold

PLANTS_PARAM = json.load(open("../../resources/solar_plants.json"))

SOLAR_PLANT = "Hélio"

## Remoção de _outliers_

In [2]:
table = pd.read_parquet(
    "../../" + PLANTS_PARAM[SOLAR_PLANT]["datawarehouse"]["loss_table"]
)
table["Data"] = pd.to_datetime(table["Data"])
table = table.rename(columns={"Angulação (°)": "Ângulo"})

print(f"Tabela original: {table.shape}")

for col in table.columns[1:]:
    # Calcular limites de outliers
    q1 = table[col].quantile(0.25)
    q3 = table[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = round(q1 - 2 * iqr, 2)
    upper_bound = round(q3 + 2 * iqr, 2)

    table = table[table[col].between(lower_bound, upper_bound)]

print(f"Tabela sem outliers: {table.shape}")

Tabela original: (8725, 4)
Tabela sem outliers: (8668, 4)


## Separação do conjunto de treinamento e teste

In [3]:
months = {
    "summer": [1, 2, 12],
    "fall": [3, 4, 5],
    "winter": [6, 7, 8],
    "spring": [9, 10, 11],
}

# Meses com melhores resultados para teste [2, 4, 7, 10]
# test_months = [np.random.choice(months[i]) for i in months.keys()]
test_months = [2, 4, 7, 10]
training_months = [i for i in range(1, 13) if i not in test_months]

print(f"Meses escolhidos para teste: {test_months}")

test_data = table[table["Data"].dt.month.isin(test_months)]
training_data = table[table["Data"].dt.month.isin(training_months)]

test_data = test_data.drop(columns=["Data"])
training_data = training_data.drop(columns=["Data"])

X_test, y_test = (
    test_data.drop(columns=["Perda (%)"]),
    test_data["Perda (%)"],
)
X_training, y_training = (
    training_data.drop(columns=["Perda (%)"]),
    training_data["Perda (%)"],
)

Meses escolhidos para teste: [2, 4, 7, 10]


## Regressão linear

In [4]:
model = LinearRegression()
model.fit(X_training, y_training)

y_pred = model.predict(X_test)

r2 = round(r2_score(y_test, y_pred), 2)
rmse = round(root_mean_squared_error(y_test, y_pred), 2)

print(f"R2: {r2}")
print(f"RMSE: {rmse}")

a = model.coef_[0]
b = model.coef_[1]
c = model.intercept_

variables = X_training.columns

print(f"Perda estimada = ({a:.2f} * {variables[0]}) + ({b:.2f} * {variables[1]}) + {c:.2f}")

score_df = pd.DataFrame(
    {
        "Modelo": ["LinearRegression"],
        "R2": [r2],
        "RMSE": [rmse],
    }
)

R2: 0.21
RMSE: 8.77
Perda estimada = (19.43 * CSI) + (0.06 * Ângulo) + 10.37


## Regressão polinomial com TPOT

In [5]:
custom_config = {
    # Regressão Polinomial Graus 2, 3 e 4
    "sklearn.preprocessing.PolynomialFeatures": {"degree": [2, 3, 4]},
    "sklearn.linear_model.LinearRegression": {},
}

model = TPOTRegressor(
    generations=50,
    population_size=50,
    verbosity=2,
        random_state=42,
    n_jobs=-1,
    scoring="r2",
    cv=KFold(n_splits=5),
    config_dict=custom_config,
)

model = model.fit(X_training, y_training)

y_pred = model.predict(X_test)

r2 = round(model.score(X_test, y_test), 2)
rmse = round(root_mean_squared_error(y_test, y_pred), 2)

print(f"R2: {r2}")
print(f"RMSE: {rmse}")

best_pipeline = model.fitted_pipeline_

poly = best_pipeline.named_steps["polynomialfeatures"]
linear_model = best_pipeline.named_steps["linearregression"]

# Coeficientes e intercepto
coefficients = linear_model.coef_
intercept = linear_model.intercept_

# Nomes das features
feature_names = poly.get_feature_names_out(X_training.columns)

equation = f"Perda estimada = {intercept:.2f} "

for i, coef in enumerate(coefficients):
    if coef == 0:
        continue

    sig = "+" if coef >= 0 else "-"

    if " " in str(feature_names[i]):
        f1, f2 = str(feature_names[i]).split(" ")
        feature = f"{f1} * {f2}"
    else:
        feature = feature_names[i]

    equation += f"{sig} ({abs(coef):.2f} * {feature}) "

print(equation)

new_row = pd.DataFrame(
    {
        "Modelo": ["PolynomialRegression"],
        "R2": [r2],
        "RMSE": [rmse],
    }
)

score_df = pd.concat([score_df, new_row], ignore_index=True)

model.export("best_pipelines/polynomial_pipeline.py")

Optimization Progress:   0%|          | 0/2550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.6178947830844614

Generation 2 - Current best internal CV score: 0.6178947830844614

Generation 3 - Current best internal CV score: 0.6178947830844614

Generation 4 - Current best internal CV score: 0.6178947830844614

Generation 5 - Current best internal CV score: 0.6178947830844614

Generation 6 - Current best internal CV score: 0.6178947830844614

Generation 7 - Current best internal CV score: 0.6178947830844614

Generation 8 - Current best internal CV score: 0.6178947830844614

Generation 9 - Current best internal CV score: 0.6178947830844614

Generation 10 - Current best internal CV score: 0.6178947830844614

Generation 11 - Current best internal CV score: 0.6178947830844614

Generation 12 - Current best internal CV score: 0.6178947830844614

Generation 13 - Current best internal CV score: 0.6178947830844614

Generation 14 - Current best internal CV score: 0.6178947830844614

Generation 15 - Current best internal CV score: 0.617894

## Regressão deixando o TPOT escolher o melhor modelo

In [6]:
model = TPOTRegressor(
    generations=50,
    population_size=50,
    verbosity=2,
    random_state=42,
    n_jobs=-1,
    cv=KFold(n_splits=5),
    scoring="r2",
)

model = model.fit(X_training, y_training)

y_pred = model.predict(X_test)

r2 = round(model.score(X_test, y_test), 2)
rmse = round(root_mean_squared_error(y_test, y_pred), 2)

print(f"R2: {r2}")
print(f"RMSE: {rmse}")

best_pipeline = model.fitted_pipeline_

new_row = pd.DataFrame(
    {
        "Modelo": [(str)(best_pipeline[-1]).split("(")[0]],
        "R2": [r2],
        "RMSE": [rmse],
    }
)

score_df = pd.concat([score_df, new_row], ignore_index=True)

model.export("best_pipelines/tpot_pipeline.py")

Optimization Progress:   0%|          | 0/2550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.6228742131029736

Generation 2 - Current best internal CV score: 0.6228742131029736

Generation 3 - Current best internal CV score: 0.6228742131029736

Generation 4 - Current best internal CV score: 0.6228742131029736

Generation 5 - Current best internal CV score: 0.6228742131029736

Generation 6 - Current best internal CV score: 0.6228742131029736

Generation 7 - Current best internal CV score: 0.6228742131029736

Generation 8 - Current best internal CV score: 0.6231400734909024

Generation 9 - Current best internal CV score: 0.6256893049466943

Generation 10 - Current best internal CV score: 0.6256893049466943

Generation 11 - Current best internal CV score: 0.6265818349932286

Generation 12 - Current best internal CV score: 0.6268023747092363

Generation 13 - Current best internal CV score: 0.6268023747092363

Generation 14 - Current best internal CV score: 0.6268023747092363

Generation 15 - Current best internal CV score: 0.627288



## Regressão usando o H2O

In [7]:
h2o.init()

test_data_h2o = H2OFrame(test_data)

training_data_h2o = H2OFrame(training_data)

model = H2OAutoML(
    seed=42,
    nfolds=5,
    keep_cross_validation_predictions=True,
    sort_metric="R2",
    max_models=50,
)

x = ["CSI", "Ângulo"]
y = "Perda (%)"

model.train(x=x, y=y, training_frame=training_data_h2o)

y_pred = model.predict(test_data_h2o)

r2 = round(model.leader.model_performance(test_data_h2o).r2(), 2)
rmse = round(model.leader.model_performance(test_data_h2o).rmse(), 2)

print(f"R2: {r2}")
print(f"RMSE: {rmse}")

new_row = pd.DataFrame(
    {
        "Modelo": [model.leader.algo],
        "R2": [r2],
        "RMSE": [rmse],
    }
)

score_df = pd.concat([score_df, new_row], ignore_index=True)

model.leader.download_mojo("best_pipelines/h2o_pipeline.zip")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.25" 2024-10-15; OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
  Starting server from /home/aluno/TCC/.venv/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp1svwro54
  JVM stdout: /tmp/tmp1svwro54/h2o_aluno_started_from_python.out
  JVM stderr: /tmp/tmp1svwro54/h2o_aluno_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,22 days
H2O_cluster_name:,H2O_from_python_aluno_j3ivq4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.887 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
R2: 0.2
RMSE: 8.83


'/home/aluno/TCC/services/notebooks/best_pipelines/h2o_pipeline.zip'

## Tabela comparativa entre as regressões

In [8]:
score_df

Unnamed: 0,Modelo,R2,RMSE
0,LinearRegression,0.21,8.77
1,PolynomialRegression,0.73,5.08
2,KNeighborsRegressor,0.73,5.14
3,glm,0.2,8.83
