# Configurações iniciais

In [None]:
from __future__ import annotations

import io
import logging
import os
import warnings
from typing import List, Optional

import fastf1
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from dotenv import load_dotenv
from fastf1 import plotting
from fastf1.core import Laps
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [None]:
# Directory handed to the FastF1 cache calls below; created on first use.
CACHE_DIR = './fastf1_cache'
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)
    print(f"Diretório de cache '{CACHE_DIR}' criado")

# NOTE(review): the cache is cleared on every run before being enabled, which
# presumably forces every session to be fetched again — confirm this is intended.
fastf1.Cache.clear_cache(CACHE_DIR) 
print(f"Cache do FastF1 em '{CACHE_DIR}' limpo")

fastf1.Cache.enable_cache(CACHE_DIR)

# Suppress FutureWarning (from any source) and pandas PerformanceWarning.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# Module-level logger used by the functions defined below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Criação DataBase

In [None]:

# Track-status digits that flag a disrupted lap (the 2-6 range the original
# integer comparison was aiming for).
_DISRUPTED_STATUS_CODES = frozenset("23456")


def _sc_flag(track_status):
    """Return 1 if any status code of the lap is in 2-6, otherwise 0.

    FastF1 documents ``TrackStatus`` as a string holding every status code
    that occurred during the lap (e.g. ``'12'``), so the value is inspected
    character by character instead of being compared with integers (which
    never matched and left the flag permanently at 0).
    """
    return int(any(ch in _DISRUPTED_STATUS_CODES for ch in str(track_status)))


def generate_main_dataset(year=2024, event='British Grand Prix', CACHE_DIR='./fastf1_cache', driver_filter=None, years_filter=None):
    """Build the per-lap dataset for the race session of one event.

    Args:
        year: Season of the event.
        event: Event name passed to ``fastf1.get_session``.
        CACHE_DIR: Directory used for the FastF1 cache.
        driver_filter: Optional driver to keep. Accepts either the driver
            number as listed in ``session.drivers`` or the driver
            abbreviation, matching what ``generate_deltas_dataset`` accepts.
        years_filter: Optional collection of seasons; when given and ``year``
            is not in it, an empty DataFrame is returned without loading data.

    Returns:
        One row per quick lap with stint, pace, fuel-estimate, speed-trap and
        weather columns, or an empty DataFrame when the session cannot be
        loaded or no lap qualifies.
    """
    # Same guard generate_deltas_dataset applies; previously the argument was
    # accepted but silently ignored here.
    if years_filter and year not in years_filter:
        return pd.DataFrame()

    fastf1.Cache.enable_cache(CACHE_DIR)

    try:
        session = fastf1.get_session(year, event, 'R')
        print(f"Carregando sessão: {event} {year} - Corrida (R)")
        session.load(telemetry=True, weather=True)
    except Exception as e:
        print(f"Erro ao carregar sessão: {e}")
        return pd.DataFrame()

    # Weather data, averaged into one-minute buckets.
    weather_data = session.weather_data
    if not weather_data.empty:
        # Work on a copy so the session's own weather table is left untouched.
        weather_data = weather_data.copy()
        weather_data['Time'] = weather_data['Time'].dt.round('s')
        weather_data_agg = weather_data.resample('1min', on='Time').mean().reset_index()
    else:
        weather_data_agg = pd.DataFrame()

    race_id = f"{year}_{event.replace(' ', '_')}"
    full_data = []

    for driver in session.drivers:
        if driver_filter:
            drv_code = session.get_driver(driver)["Abbreviation"]
            if driver_filter not in (driver, drv_code):
                continue

        laps = session.laps.pick_driver(driver).pick_quicklaps()
        if laps.empty:
            continue

        laps = laps.sort_values(by='LapNumber').reset_index(drop=True)

        # Stint bookkeeping: position inside the stint, stint length and the
        # gap of every lap to the best lap of its stint.
        laps['stint_id'] = laps['Stint']
        laps['s_lap'] = laps.groupby('stint_id').cumcount() + 1
        laps['s_total'] = laps.groupby('stint_id')['LapNumber'].transform('count')
        laps['s_pct'] = laps['s_lap'] / laps['s_total']
        laps['best_s_lap'] = laps.groupby('stint_id')['LapTime'].transform('min').dt.total_seconds()
        laps['delta_best'] = laps['LapTime'].dt.total_seconds() - laps['best_s_lap']
        # TODO: consider leaving the first lap as NaN instead of 0.
        laps['delta_var'] = laps['delta_best'].diff().fillna(0)

        # Fuel estimate: linear burn from a full tank, floored at zero.
        fuel_cons = 1.8
        full_tank = 110
        laps['fuel_kg'] = (full_tank - laps['LapNumber'] * fuel_cons).clip(lower=0)

        laps['race_id'] = race_id
        laps['year'] = year
        laps['race'] = event
        laps['sc_active'] = laps['TrackStatus'].apply(_sc_flag)

        # Weather: attach the nearest one-minute bucket to each lap start.
        if not weather_data_agg.empty and 'LapStartTime' in laps.columns:
            try:
                laps['LapStartTime_rounded'] = laps['LapStartTime'].dt.round('s')
                laps = pd.merge_asof(
                    laps.sort_values('LapStartTime_rounded'),
                    weather_data_agg.sort_values('Time'),
                    left_on='LapStartTime_rounded',
                    right_on='Time',
                    direction='nearest'
                )
                cols_to_drop = [col for col in ['LapStartTime_rounded', 'Time', 'Time_y'] if col in laps.columns]
                laps = laps.drop(columns=cols_to_drop)
            except Exception as e:
                # Best effort: without weather the temperature columns below
                # fall back to NaN.
                logger.warning(f"⚠️ Erro ao mesclar clima para {driver}: {e}")

        df = pd.DataFrame({
            'race_id': laps['race_id'],
            'year': laps['year'],
            'race': laps['race'],
            'drv': laps['Driver'],
            'team': laps['Team'],
            'lap': laps['LapNumber'],
            's_lap': laps['s_lap'],
            's_pct': laps['s_pct'],
            'tyre': laps['Compound'],
            'lap_time': laps['LapTime'].dt.total_seconds(),
            'delta_best': laps['delta_best'],
            'delta_var': laps['delta_var'],
            'fuel_kg': laps['fuel_kg'],
            'sc_active': laps['sc_active'],
            'stint_id': laps['stint_id'],
            'fresh_tyre': laps['FreshTyre'],
            'speed_i1': laps['SpeedI1'],
            'speed_i2': laps['SpeedI2'],
            'speed_fl': laps['SpeedFL'],
            'speed_st': laps['SpeedST'],
            'air_temp': laps['AirTemp'] if 'AirTemp' in laps.columns else np.nan,
            'track_temp': laps['TrackTemp'] if 'TrackTemp' in laps.columns else np.nan,
            'humidity': laps['Humidity'] if 'Humidity' in laps.columns else np.nan,
            'best_s_lap': laps['best_s_lap'],
        })

        full_data.append(df)

    if not full_data:
        print("Nenhum dado disponível para criar o dataset")
        return pd.DataFrame()

    return pd.concat(full_data, ignore_index=True)

def _mean_gap_to_car_ahead(laps):
    """Return the per-lap mean ``DistanceToDriverAhead`` for one driver's laps.

    The result has the columns ``lap`` and ``delta_s``. ``None`` is returned
    when there is no telemetry, or none of it falls inside a lap window.
    """
    tel = laps.get_telemetry()
    if tel.empty:
        return None

    # add_driver_ahead() hands back the extended telemetry, so its result has
    # to be kept; the call is skipped when the channel is already present.
    if "DistanceToDriverAhead" not in tel.columns:
        tel = tel.add_driver_ahead()

    laps_time = laps[["LapNumber", "LapStartTime", "Time"]].rename(
        columns={"Time": "LapEndTime"}
    )
    tel = tel.sort_values("SessionTime").reset_index(drop=True)
    laps_time = laps_time.sort_values("LapStartTime").reset_index(drop=True)

    # Tag every telemetry sample with the most recent lap start, then keep
    # only the samples that fall inside that lap's time window.
    tel = pd.merge_asof(
        tel,
        laps_time,
        left_on="SessionTime",
        right_on="LapStartTime",
        direction="backward",
    )
    inside_lap = (tel["SessionTime"] >= tel["LapStartTime"]) & (
        tel["SessionTime"] <= tel["LapEndTime"]
    )
    tel = tel[inside_lap]
    if tel.empty:
        return None

    return (
        tel.groupby("LapNumber")["DistanceToDriverAhead"]
        .mean()
        .reset_index()
        .rename(columns={"LapNumber": "lap", "DistanceToDriverAhead": "delta_s"})
    )


def generate_deltas_dataset(
    *,
    year: int,
    event: str,
    CACHE_DIR: str = CACHE_DIR,
    driver_filter: Optional[str] = None,
    years_filter: Optional[List[int]] = None,
) -> pd.DataFrame:
    """Build the per-lap "distance to the car ahead" dataset for one race.

    Args:
        year: Season of the event.
        event: Event name passed to ``fastf1.get_session``.
        CACHE_DIR: Directory used for the FastF1 cache.
        driver_filter: Optional driver abbreviation; other drivers are skipped.
        years_filter: Optional list of seasons; when given and ``year`` is not
            in it, an empty DataFrame is returned without loading data.

    Returns:
        A DataFrame with ``race_id``, ``drv``, ``lap`` and ``delta_s`` (mean
        ``DistanceToDriverAhead`` over the lap, NaN when telemetry is missing
        or could not be processed). Empty when the session cannot be loaded
        or no driver has quick laps.
    """
    if years_filter and year not in years_filter:
        return pd.DataFrame()

    fastf1.Cache.enable_cache(CACHE_DIR)

    try:
        session = fastf1.get_session(year, event, "R")
        session.load(telemetry=True)
    except Exception as exc:
        logger.error("Erro ao carregar telemetria %s %d: %s", event, year, exc)
        return pd.DataFrame()

    race_id = f"{year}_{event.replace(' ', '_')}"
    all_delta: List[pd.DataFrame] = []

    for drv_num in session.drivers:
        drv_code = session.get_driver(drv_num)["Abbreviation"]
        if driver_filter and drv_code != driver_filter:
            continue

        laps = session.laps.pick_driver(drv_code).pick_quicklaps()
        if laps.empty:
            continue

        base_df = pd.DataFrame(
            {
                "race_id": race_id,
                "drv": drv_code,
                "lap": laps["LapNumber"],
            }
        )

        try:
            gaps = _mean_gap_to_car_ahead(laps)
            if gaps is None:
                # Keep the schema identical for every driver.
                base_df["delta_s"] = np.nan
            else:
                base_df = base_df.merge(gaps, on="lap", how="left")
        except Exception as exc:
            # Best effort: a driver with broken telemetry still contributes
            # rows, just without a gap value.
            logger.warning("Erro ao processar telemetria %s: %s", drv_code, exc)
            base_df["delta_s"] = np.nan

        all_delta.append(base_df)

    return pd.concat(all_delta, ignore_index=True) if all_delta else pd.DataFrame()

def merge_fastf1_dataframes(tyre_df: pd.DataFrame, delta_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the delta columns onto the tyre dataset.

    Rows are matched on ``race_id``, ``drv`` and ``lap``. An empty
    ``tyre_df`` yields a new empty DataFrame; an empty ``delta_df`` yields
    ``tyre_df`` itself, unchanged. Both cases log a warning.
    """
    join_keys = ["race_id", "drv", "lap"]

    if not tyre_df.empty and not delta_df.empty:
        return pd.merge(tyre_df, delta_df, how="left", on=join_keys)

    if tyre_df.empty:
        logger.warning("DataFrame de pneus vazio.")
        return pd.DataFrame()

    logger.warning("DataFrame de delta vazio – retornando apenas pneus.")
    return tyre_df


def add_analysis_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with derived analysis columns.

    Added columns:
        * ``delta_adjusted_fuel`` – ``delta_best`` minus ``fuel_kg`` times a
          fixed 0.03 factor.
        * ``avg_delta_best_circuit_tyre`` – mean ``delta_best`` per
          (``race_id``, ``tyre``).
        * ``avg_delta_best_driver_tyre`` – mean ``delta_best`` per
          (``drv``, ``tyre``).
        * ``is_stint_fastest_lap`` – 1 where ``lap_time`` equals
          ``best_s_lap`` (NaN when ``best_s_lap`` is not available).
        * ``track_condition`` – label derived from ``tyre`` and ``track_temp``.

    An empty ``df`` is returned as is.
    """
    if df.empty:
        return df

    fuel_impact = 0.03  # s/kg
    enriched = df.copy()
    enriched["delta_adjusted_fuel"] = enriched["delta_best"] - enriched["fuel_kg"] * fuel_impact

    # Group averages of delta_best, joined back onto every row.
    averages = (
        (["race_id", "tyre"], "avg_delta_best_circuit_tyre"),
        (["drv", "tyre"], "avg_delta_best_driver_tyre"),
    )
    for keys, column_name in averages:
        group_mean = enriched.groupby(keys)["delta_best"].mean().reset_index()
        group_mean = group_mean.rename(columns={"delta_best": column_name})
        enriched = enriched.merge(group_mean, on=keys, how="left")

    if "best_s_lap" not in enriched.columns:
        enriched["is_stint_fastest_lap"] = np.nan
    else:
        is_fastest = enriched["lap_time"].eq(enriched["best_s_lap"])
        enriched["is_stint_fastest_lap"] = is_fastest.astype(int)

    def _classify(row):
        # Wet-weather compounds win over any temperature reading; a missing
        # temperature on a dry compound cannot be classified.
        temperature = row["track_temp"]
        if row["tyre"] in ("INTERMEDIATE", "WET"):
            return "WET_OR_INTERMEDIATE"
        if pd.isna(temperature):
            return "UNKNOWN_TEMP"
        if temperature > 25:
            return "DRY_HOT"
        if temperature < 15:
            return "DRY_COLD"
        return "DRY_NORMAL"

    enriched["track_condition"] = enriched.apply(_classify, axis=1)
    logger.info("Features adicionadas. Shape: %s", enriched.shape)
    return enriched

def load_multi_year_data(
    *,
    years: List[int],
    events: List[str],
    driver_filter: Optional[str] = None,
    CACHE_DIR: str = CACHE_DIR,
) -> pd.DataFrame:
    """Collect, merge and enrich the datasets of every (year, event) pair.

    For each combination the main and the delta dataset are generated; the
    non-empty ones are concatenated, merged with
    ``merge_fastf1_dataframes`` and passed through
    ``add_analysis_features``. When nothing is left after the merge an error
    is logged and the empty merge result is returned.
    """
    main_frames = []
    delta_frames = []

    for yr in years:
        for ev in events:
            # Both generators receive exactly the same keyword arguments.
            request = dict(
                year=yr,
                event=ev,
                CACHE_DIR=CACHE_DIR,
                driver_filter=driver_filter,
                years_filter=years,
            )
            main_df = generate_main_dataset(**request)
            delta_df = generate_deltas_dataset(**request)

            if not main_df.empty:
                main_frames.append(main_df)
            if not delta_df.empty:
                delta_frames.append(delta_df)

    if main_frames:
        main_all = pd.concat(main_frames, ignore_index=True)
    else:
        main_all = pd.DataFrame()

    if delta_frames:
        delta_all = pd.concat(delta_frames, ignore_index=True)
    else:
        delta_all = pd.DataFrame()

    combined = merge_fastf1_dataframes(main_all, delta_all)
    if not combined.empty:
        return add_analysis_features(combined)

    logger.error("Nenhum dado disponível após merge")
    return combined
    
def _row_stat(frame, col, keys, stat):
    """Return ``stat`` ('mean', 'std' or 'median') of ``col`` for every row.

    With ``keys`` the statistic is computed per group and broadcast back to
    the rows of that group; without keys the single global value is repeated
    for every row.
    """
    if keys:
        return frame.groupby(keys)[col].transform(stat)
    return pd.Series(getattr(frame[col], stat)(), index=frame.index)


def fill_outliers_with_median(df, col, n_std=2.5, group_by_cols=None):
    """Return a copy of ``df`` with NaNs and outliers of ``col`` replaced.

    Missing values are filled with the median of the row's group, falling
    back to the global median of the column. Afterwards every value further
    than ``n_std`` standard deviations from its group mean is replaced by
    the group median (global median when the group median is NaN).

    Args:
        df: Input frame; it is never modified.
        col: Name of the numeric column to clean.
        n_std: Width of the accepted band, in group standard deviations.
        group_by_cols: Columns that define the groups; defaults to
            ``['drv', 'tyre']``. Names missing from ``df`` are ignored and,
            if none is present, the whole column is treated as one group.

    Returns:
        The cleaned copy. A frame without rows is returned as an unchanged
        copy, so the function can be called on an empty pipeline result.

    Raises:
        KeyError: If ``df`` has rows but no column named ``col``.
    """
    if group_by_cols is None:
        group_by_cols = ['drv', 'tyre']

    cleaned = df.copy()
    if cleaned.empty:
        return cleaned

    keys = [g_col for g_col in group_by_cols if g_col in cleaned.columns]
    # Global median: fallback whenever a group median is itself NaN.
    global_median = cleaned[col].median()

    if not keys:
        print(f"Atenção: Nenhuma das colunas de agrupamento {group_by_cols} encontrada para '{col}'. Processando sem agrupamento.")

    # --- 1. Fill missing values ---
    na_count_before = cleaned[col].isnull().sum()
    if na_count_before > 0:
        group_median = _row_stat(cleaned, col, keys, 'median')
        cleaned[col] = cleaned[col].fillna(group_median).fillna(global_median)
        print(f"  Preenchidos {na_count_before} NA's em '{col}' (com mediana do grupo ou global).")

    # Statistics are taken after the fill so that outlier detection sees the
    # completed column. They are kept as standalone Series: no helper column
    # is added to the frame, so existing columns cannot be overwritten.
    group_mean = _row_stat(cleaned, col, keys, 'mean')
    group_std = _row_stat(cleaned, col, keys, 'std')
    group_median = _row_stat(cleaned, col, keys, 'median')

    # --- 2. Detect and replace outliers ---
    too_high = cleaned[col] > group_mean + n_std * group_std
    too_low = cleaned[col] < group_mean - n_std * group_std
    outlier_condition = too_high | too_low

    num_outliers_before = outlier_condition.sum()
    print(f"Número de outliers identificados em '{col}' (com n_std={n_std}): {num_outliers_before}")

    cleaned.loc[outlier_condition, col] = group_median[outlier_condition].fillna(global_median)

    return cleaned


if __name__ == "__main__":
    # Pipeline configuration: seasons, events and an optional single driver.
    YEARS = [2025]
    EVENTS = ["Spanish Grand Prix"]
    DRIVER = None

    df_final = load_multi_year_data(
        years=YEARS,
        events=EVENTS,
        driver_filter=DRIVER,
    )

    if not df_final.empty:
        # Outlier cleaning needs the 'lap_time' column, so it only runs when
        # the pipeline actually produced rows.
        df_2025 = fill_outliers_with_median(df_final, 'lap_time', n_std=2.5)

        driver_tag = DRIVER or "ALL"
        fname = f"analyzed_fastf1_data_{driver_tag}_{'_'.join(map(str, YEARS))}"
        # NOTE(review): the message says the frame was saved under `fname`,
        # but nothing is written to disk in this block — confirm intent.
        logger.info("Dataframe salvo como '%s' (%d linhas).", fname, len(df_final))
    else:
        # Keep df_2025 defined for the code that reads it further down.
        df_2025 = df_final
        logger.warning("Pipeline terminou sem dados.")


# Integração PostgreSQL

**Enviando o DataFrame**

In [None]:
# Saving the data (the original comment, "SALVANDO OS", looks truncated).

# Presumably loads the PG_* settings read further down from a .env file into
# the process environment — confirm that file exists where this runs.
load_dotenv()

def _build_connection_url(db_type, db_details):
    """Return the SQLAlchemy URL string for ``db_type``, or ``None`` if unsupported.

    User name and password are percent-encoded so that characters such as
    ``@``, ``:`` or ``/`` inside the credentials cannot corrupt the URL.

    Raises:
        KeyError: If ``db_details`` lacks a key the chosen database needs.
    """
    from urllib.parse import quote_plus

    if db_type == 'sqlite':
        return f"sqlite:///{db_details['database']}"
    if db_type not in ('mysql', 'postgresql'):
        return None

    user = quote_plus(str(db_details['user']))
    password = quote_plus(str(db_details['password']))
    host = db_details['host']
    database = db_details['database']

    if db_type == 'mysql':
        # An optional 'port' entry is honoured; without it the URL is the
        # same host-only form as before.
        port = db_details.get('port')
        location = f"{host}:{port}" if port else host
        return f"mysql+mysqlconnector://{user}:{password}@{location}/{database}"

    return (
        f"postgresql+psycopg2://{user}:{password}"
        f"@{host}:{db_details.get('port', 5432)}/{database}"
    )


def save_data_to_sql(df, table_name, db_type, db_details, if_exists='replace', index=False):
    """Write ``df`` to ``table_name`` in a SQLite, MySQL or PostgreSQL database.

    Args:
        df: DataFrame to store.
        table_name: Destination table.
        db_type: ``'sqlite'``, ``'mysql'`` or ``'postgresql'``.
        db_details: Connection settings. ``database`` is always required;
            MySQL and PostgreSQL also need ``user``, ``password`` and
            ``host`` and accept an optional ``port``.
        if_exists: Forwarded to ``DataFrame.to_sql``.
        index: Forwarded to ``DataFrame.to_sql``.

    Returns:
        ``True`` on success. ``False`` when the database type is not
        supported or any error occurs; the reason is printed, never raised.
    """
    engine = None
    try:
        conn_str = _build_connection_url(db_type, db_details)
        if conn_str is None:
            print(f"Erro: Tipo de banco de dados '{db_type}' não suportado")
            return False

        engine = create_engine(conn_str)
        df.to_sql(name=table_name, con=engine, if_exists=if_exists, index=index)
        print(f"Dados salvos com sucesso na tabela '{table_name}' no banco de dados {db_type}")
        return True
    except ImportError as e:
        print(f"Erro: O driver para o banco de dados '{db_type}' não está instalado. Detalhes: {e}")
        return False
    except Exception as e:
        print(f"Erro ao salvar dados no banco de dados {db_type}: {e}")
        return False
    finally:
        # Release the pooled connections of the engine created for this call.
        if engine is not None:
            engine.dispose()

# Connection settings for PostgreSQL, read from the environment.
postgresql_db_details = {
    'user': os.environ.get('PG_USER'),
    'password': os.environ.get('PG_PASSWORD'),
    'host': os.environ.get('PG_HOST'),
    # Fall back to 5432 when PG_PORT is unset, as the loading code further
    # down does; int(None) would raise TypeError here.
    'port': int(os.environ.get('PG_PORT', 5432)),
    'database': os.environ.get('PG_DATABASE')
}

table_name = 'fastf1_analysis_data'

print("1. Carregando dados do DF:")
df_fastf1 = df_final.copy() if df_final is not None else pd.DataFrame()
# A CSV snapshot is written alongside the database upload.
df_fastf1.to_csv('fastf1_analysis_data.csv', index=False)
if df_fastf1 is not None and not df_fastf1.empty:
    print(f"\n2. Tentando salvar o DataFrame na tabela '{table_name}' no PostgreSQL...")
    success = save_data_to_sql(df_fastf1, table_name, 'postgresql', postgresql_db_details, if_exists='replace')

    if success:
        print("\nDados salvos com sucesso no PostgreSQL.")
    else:
        print("\nNão foi possível salvar os dados no PostgreSQL.")
else:
    print("\nO DataFrame não pôde ser carregado ou está vazio. Não é possível prosseguir para salvar no PostgreSQL.")

**Carregando uma tabela em DataFrame**

In [None]:
def load_data_from_postgresql(table_name):
    """Load a whole PostgreSQL table into a DataFrame.

    The connection settings come from the module-level ``PG_USER``,
    ``PG_PASSWORD``, ``PG_HOST``, ``PG_PORT`` and ``PG_DATABASE``.

    Args:
        table_name: Name of the table to read.

    Returns:
        The table as a DataFrame, or ``None`` when connecting or reading
        fails (the error is printed, never raised).
    """
    from urllib.parse import quote_plus

    engine = None
    try:
        # Credentials are percent-encoded so special characters in them
        # cannot corrupt the connection URL.
        conn_str = (
            f"postgresql+psycopg2://{quote_plus(str(PG_USER))}:{quote_plus(str(PG_PASSWORD))}"
            f"@{PG_HOST}:{PG_PORT}/{PG_DATABASE}"
        )
        engine = create_engine(conn_str)
        print("Conexão com o PostgreSQL configurada com sucesso")

        # Load the table into a DataFrame. read_sql_table takes the table
        # name as an identifier instead of splicing it into SQL text.
        print(f"Carregando dados da tabela '{table_name}' para um DataFrame")
        df_from_pg = pd.read_sql_table(table_name, con=engine)

        print("\nDados carregados com sucesso do PostgreSQL para o DataFrame")
        print(f"\nDataframe importado com total de {len(df_from_pg)} linhas")
        return df_from_pg

    except Exception as e:
        print(f"Erro ao conectar ou carregar dados do PostgreSQL: {e}")
        return None
    finally:
        # Release the pooled connections of the engine created for this call.
        if engine is not None:
            engine.dispose()

load_dotenv()

# PostgreSQL connection settings, taken from the process environment.
# Only the port has a fallback (5432); the others are None when unset.
PG_USER = os.getenv('PG_USER')
PG_PASSWORD = os.getenv('PG_PASSWORD')
PG_HOST = os.getenv('PG_HOST')
PG_PORT = int(os.getenv('PG_PORT', 5432))
PG_DATABASE = os.getenv('PG_DATABASE')

table_name = 'fastf1_analysis_data'

# Read the stored analysis table back into memory.
df = load_data_from_postgresql(table_name)


# Análise dos dados

In [None]:
DRIVER_TO_ANALYZE = 'HAM'

# Plot colour per tyre compound; compounds missing here are drawn in black.
compound_colors = {
    'SOFT': '#FF3333',
    'MEDIUM': '#FFCC00',
    'HARD': '#CCCCCC',
    'INTERMEDIATE': '#009900',
    'WET': '#0000FF',
    'UNKNOWN': '#800080',
    'TEST_UNKNOWN': '#808000'
}


def _plot_metric_per_stint(df_year, metric, title, ylabel):
    """Plot ``metric`` against the lap-in-stint, one line per (stint, compound)."""
    plt.figure(figsize=(12, 7))
    for (stint_group_id, tyre), group in df_year.groupby(['stint_group_id', 'tyre']):
        color = compound_colors.get(tyre, '#000000')
        plt.plot(group['s_lap'], group[metric], marker='o', linestyle='-', color=color,
                 label=f'Stint {stint_group_id} ({tyre})')
    plt.title(title)
    plt.xlabel('Volta do Stint')
    plt.ylabel(ylabel)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(title='Stint/Pneu', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


def _plot_mean_lap_time_per_stint(df_year, title):
    """Draw one bar per (stint, compound) with the mean lap time of that stint."""
    plt.figure(figsize=(12, 7))
    stint_avg = df_year.groupby(['stint_group_id', 'tyre'])['lap_time'].mean().reset_index()
    stint_avg['label'] = stint_avg.apply(lambda row: f'Stint {row.stint_group_id} ({row.tyre})', axis=1)
    plt.bar(stint_avg['label'], stint_avg['lap_time'],
            color=[compound_colors.get(c, '#000000') for c in stint_avg['tyre']])
    plt.title(title)
    plt.xlabel('Stint')
    plt.ylabel('Tempo Médio de Volta (s)')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


def _scatter_lap_time_per_tyre(df_year, x_col, title, xlabel, x_scale=None):
    """Scatter lap time against ``x_col`` (optionally scaled), one colour per compound."""
    plt.figure(figsize=(12, 7))
    for tyre, group in df_year.groupby('tyre'):
        color = compound_colors.get(tyre, '#000000')
        x_values = group[x_col] if x_scale is None else group[x_col] * x_scale
        plt.scatter(x_values, group['lap_time'], alpha=0.6, label=tyre, color=color)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Tempo de Volta (s)')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.tight_layout()
    plt.show()


if df_final is not None and not df_final.empty:
    try:
        df_visual = load_data_from_postgresql(table_name)
        if df_visual is None:
            # The loader signals failure with None; surface that explicitly
            # instead of failing later with an unrelated TypeError.
            raise RuntimeError(f"tabela '{table_name}' não pôde ser carregada")

        driver_df_visual = df_visual[df_visual['drv'] == DRIVER_TO_ANALYZE].copy()

        if driver_df_visual.empty:
            print(f"Nenhum dado encontrado para o piloto '{DRIVER_TO_ANALYZE}' para visualização")
            print(f"Pilotos disponíveis no dataset: {df_visual['drv'].unique().tolist()}")
        else:
            print(f"Dados filtrados para visualização do piloto: {DRIVER_TO_ANALYZE}. Total de {len(driver_df_visual)} voltas")
            # A new stint group starts at every row whose lap-in-stint is 1.
            driver_df_visual['stint_group_id'] = (driver_df_visual['s_lap'] == 1).cumsum()

            for year in sorted(driver_df_visual['year'].unique()):
                df_year = driver_df_visual[driver_df_visual['year'] == year]

                print(f"\n Ano: {year} | Voltas: {len(df_year)}")

                # 1. Lap time per stint
                _plot_metric_per_stint(
                    df_year, 'lap_time',
                    f'Tempo de Volta por Stint - {DRIVER_TO_ANALYZE} - {year}',
                    'Tempo de Volta (s)',
                )

                # 2. Degradation (delta_best)
                _plot_metric_per_stint(
                    df_year, 'delta_best',
                    f'Degradação do Pneu (Δ para melhor volta) - {DRIVER_TO_ANALYZE} - {year}',
                    'Delta para Melhor Volta (s)',
                )

                # 3. Mean lap time per stint
                _plot_mean_lap_time_per_stint(
                    df_year,
                    f'Tempo Médio por Stint - {DRIVER_TO_ANALYZE} - {year}',
                )

                # 4. Lap time vs. estimated fuel
                _scatter_lap_time_per_tyre(
                    df_year, 'fuel_kg',
                    f'Tempo de Volta vs Combustível - {DRIVER_TO_ANALYZE} - {year}',
                    'Peso Estimado de Combustível (kg)',
                )

                # 5. Lap time vs. % of the stint
                _scatter_lap_time_per_tyre(
                    df_year, 's_pct',
                    f'Tempo de Volta vs % do Stint - {DRIVER_TO_ANALYZE} - {year}',
                    '% do Stint',
                    x_scale=100,
                )

            print("\nVisualização de análise de desgaste por ano concluída.")

    except Exception as e:
        # The data comes from PostgreSQL, not from a CSV file.
        logger.error("Erro ao carregar ou visualizar os dados do PostgreSQL: %s", e)
else:
    logger.error("Não foi possível carregar o DataFrame para visualização. Verifique se o DataFrame está vazio ou se ocorreu um erro anterior.")

# Algoritmo de ML

**Tratamento de dados**

In [None]:
def process_data(df):
    """Prepare the analysis dataset for model training.

    Steps: replace outliers/NaNs of the numeric lap columns with group
    medians, cast ``fresh_tyre`` to int, drop the ``race`` column, one-hot
    encode ``tyre`` and ``track_condition`` and min-max scale the remaining
    float64/int64 columns except ``year`` and the one-hot columns.

    Args:
        df: Dataset to process; it is left unmodified.

    Returns:
        The processed copy. ``df`` itself is returned when it is ``None`` or
        empty.
    """
    if df is None or df.empty:
        print("Nenhum dado disponível para processamento.")
        return df

    # Work on a copy: the in-place edits below must not leak into the
    # caller's frame when none of the outlier columns is present.
    df = df.copy()

    # Replace outliers with the median for selected columns.
    cols_to_process = ['lap_time', 'delta_best', 'delta_var', 'fuel_kg',
                       'speed_i1', 'speed_i2', 'speed_fl', 'speed_st', 'delta_s']
    for col in cols_to_process:
        if col in df.columns:
            df = fill_outliers_with_median(df, col, n_std=2.5, group_by_cols=['drv', 'tyre'])

    # Convert categorical columns to numeric ones.
    if 'fresh_tyre' in df.columns:
        df['fresh_tyre'] = df['fresh_tyre'].astype(int)
    cols_to_drop = ['race']
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    categorical_cols = ['tyre', 'track_condition']
    df_preprocessed = pd.get_dummies(
        df,
        columns=[col for col in categorical_cols if col in df.columns],
        drop_first=False,
    )
    for col in df_preprocessed.select_dtypes(include='bool').columns:
        df_preprocessed[col] = df_preprocessed[col].astype(int)

    # Scale numeric features, leaving 'year' and the one-hot columns alone.
    encoded_prefixes = tuple(col + '_' for col in categorical_cols)
    numeric_cols = df_preprocessed.select_dtypes(include=['float64', 'int64']).columns
    numerical_cols_to_scale = [
        col for col in numeric_cols
        if col != 'year' and not col.startswith(encoded_prefixes)
    ]
    # MinMaxScaler rejects an empty column selection, so skip it in that case.
    if numerical_cols_to_scale:
        scaler = MinMaxScaler()
        df_preprocessed[numerical_cols_to_scale] = scaler.fit_transform(df_preprocessed[numerical_cols_to_scale])
    return df_preprocessed

# NOTE(review): the frame loaded here is replaced by the next statement before
# it is read — confirm whether process_data was meant to receive it.
df = load_data_from_postgresql(table_name)
# Pre-process df_2025 (the frame produced by the outlier fill on 'lap_time').
df = process_data(df_2025)
# NOTE(review): if_exists='replace' with the same table_name overwrites the
# stored table with this processed frame — confirm that is intended.
save_data_to_sql(df, table_name, 'postgresql', postgresql_db_details, if_exists='replace')

**Arquitetura do Modelo**

In [None]:
def create_sequences(X, y, n_timesteps):
    """Cut ``X`` into sliding windows and pair each with the next target.

    Window ``i`` holds rows ``i .. i + n_timesteps - 1`` of ``X``; its target
    is ``y`` at position ``i + n_timesteps``.

    Returns:
        A tuple ``(windows, targets)`` of NumPy arrays. Both are empty when
        ``X`` has no more than ``n_timesteps`` rows.
    """
    window_starts = range(len(X) - n_timesteps)
    windows = [X.iloc[start:start + n_timesteps].values for start in window_starts]
    targets = [y.iloc[start + n_timesteps] for start in window_starts]
    return np.array(windows), np.array(targets)

def build_lstm_model(n_timesteps, n_features):
    """Build and compile the two-layer LSTM regressor.

    The network stacks two 100-unit ReLU LSTM layers, each followed by 20%
    dropout, and ends in a single-unit dense output. It is compiled with the
    Adam optimizer, MSE loss and the MAE/MSE metrics.

    Args:
        n_timesteps: Number of rows in every input window.
        n_features: Number of features per row.
    """
    lstm_units = 100
    dropout_rate = 0.2
    window_shape = (n_timesteps, n_features)

    network = Sequential()
    # First recurrent layer returns the full sequence for the layer stacked on it.
    network.add(LSTM(units=lstm_units, activation='relu', input_shape=window_shape, return_sequences=True))
    network.add(Dropout(dropout_rate))
    # Second recurrent layer keeps only its final output.
    network.add(LSTM(units=lstm_units, activation='relu', return_sequences=False))
    network.add(Dropout(dropout_rate))
    network.add(Dense(units=1))

    network.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
    return network

DRIVER_TO_ANALYZE = 'VER'
# NOTE(review): this is a tuple and is not used to filter the data below;
# confirm whether a year filter was intended.
YEAR_TO_ANALYZE = 2024, 2023, 2022

# Rows of the selected driver, re-indexed from zero.
df_driver = df[df['drv'] == DRIVER_TO_ANALYZE].copy()
df_driver = df_driver.reset_index(drop=True)

# Identifier columns that are not model inputs.
drop = ['drv', 'year', 'race_id', 'team']
df_driver.drop(columns=[col for col in drop if col in df_driver.columns], inplace=True)

feat = [
    'lap',
    's_lap',
    's_pct',
    'fuel_kg',
    'sc_active',
    'stint_id',
    'fresh_tyre',
    'speed_i1', 'speed_i2', 'speed_fl', 'speed_st',
    'air_temp', 'track_temp', 'humidity',
    'delta_best',
    'delta_var',
    'avg_delta_best_circuit_tyre',
    'avg_delta_best_driver_tyre',
    'is_stint_fastest_lap',
    'tyre_HARD', 'tyre_MEDIUM', 'tyre_SOFT',
    'track_condition_DRY_HOT',
]

target = 'lap_time'

# A one-hot column only exists when its category occurred in the data. A
# missing one means "never observed", so it is added as all zeros instead of
# letting the feature selection below fail with a KeyError.
for col in feat:
    if col not in df_driver.columns and col.startswith(('tyre_', 'track_condition_')):
        df_driver[col] = 0

X_df = df_driver[feat]
y_df = df_driver[target]

N_FEATURES = X_df.shape[1]
N_TIMESTEPS = 5

X_sequences, y_sequences = create_sequences(X_df, y_df, N_TIMESTEPS)

X_train, X_val, y_train, y_val = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Build the network for the window shape prepared above and print its layers.
model = build_lstm_model(n_timesteps=N_TIMESTEPS, n_features=N_FEATURES)
model.summary()

# Train for 50 epochs, tracking the validation split after every epoch.
validation_pair = (X_val, y_val)
history = model.fit(
    X_train,
    y_train,
    validation_data=validation_pair,
    batch_size=32,
    epochs=50,
    verbose=1,
)

print("\nAvaliando o modelo no conjunto de validação:")
# evaluate() yields the loss followed by the compiled metrics (mae, mse).
val_loss, val_mae, val_mse = model.evaluate(*validation_pair, verbose=0)
print(f"Perda de Validação (MSE): {val_loss:.4f}")
print(f"Erro Médio Absoluto de Validação (MAE): {val_mae:.4f}")

plt.figure(figsize=(12, 5))

# One panel per tracked quantity, side by side:
# (subplot position, training key, validation key,
#  training label, validation label, panel title, y-axis label)
history_panels = [
    (1, 'loss', 'val_loss',
     'Perda de Treinamento (MSE)', 'Perda de Validação (MSE)',
     'Perda do Modelo ao Longo das Épocas', 'Perda (MSE)'),
    (2, 'mae', 'val_mae',
     'MAE de Treinamento', 'MAE de Validação',
     'MAE do Modelo ao Longo das Épocas', 'MAE'),
]
for position, train_key, val_key, train_label, val_label, panel_title, y_label in history_panels:
    plt.subplot(1, 2, position)  # 1 row, 2 columns
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Época')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)

plt.tight_layout()  # keep the two panels from overlapping
plt.show()

# Actual vs. predicted values on the validation set.
print("\nGerando previsões no conjunto de validação...")
y_pred = model.predict(X_val).flatten()  # predictions flattened to 1-D

plt.figure(figsize=(10, 6))
plt.scatter(y_val, y_pred, alpha=0.6)
# Ideal line (y = x) spanning the range of the true values.
ideal_range = [y_val.min(), y_val.max()]
plt.plot(ideal_range, ideal_range, 'r--', lw=2)
plt.title(f'Valores Reais vs. Preditos (Piloto: {DRIVER_TO_ANALYZE}, Ano: {YEAR_TO_ANALYZE})')
plt.xlabel('Tempo de Volta Real (Escalado)')
plt.ylabel('Tempo de Volta Predito (Escalado)')
plt.grid(True)
plt.show()


**Testando o modelo na corrida de 2025**


In [None]:
# NOTE(review): `df` is the same frame the training code sliced, so this
# evaluation is presumably not on unseen data — confirm the intended source.
df_2025_driver = df[df['drv'] == DRIVER_TO_ANALYZE].copy()
df_2025_driver = df_2025_driver.reset_index(drop=True)

if df_2025_driver.empty:
    print(f"Aviso: Nenhum dado de 2025 encontrado para o piloto '{DRIVER_TO_ANALYZE}'. Não é possível testar.")
    # exit() is the interactive-shell helper from the site module; raising
    # SystemExit stops execution the same way without depending on it.
    raise SystemExit

df_2025_driver.drop(columns=[col for col in drop if col in df_2025_driver], inplace=True, errors='ignore')

X_test_template = pd.DataFrame(columns=feat)

# A one-hot column that never occurred in this slice is added as all zeros so
# the feature selection below cannot fail with a KeyError.
for col in feat:
    if col not in df_2025_driver.columns and col.startswith(('tyre_', 'track_condition_')):
        df_2025_driver[col] = 0

X_df_test = df_2025_driver[feat]
y_df_test = df_2025_driver[target]

# At least N_TIMESTEPS + 1 laps are needed to form a single window/target pair.
if len(X_df_test) < N_TIMESTEPS + 1:
    print(f"Aviso: Dados de teste insuficientes para criar sequências (necessário pelo menos {N_TIMESTEPS + 1} voltas).")
    X_sequences_test = np.array([])
    y_sequences_test = np.array([])
else:
    X_sequences_test, y_sequences_test = create_sequences(X_df_test, y_df_test, N_TIMESTEPS)
    print(f"Formato das sequências de entrada de teste (X_sequences_test): {X_sequences_test.shape}")
    print(f"Formato das sequências de saída de teste (y_sequences_test): {y_sequences_test.shape}")


if X_sequences_test.shape[0] > 0:
    print("\nFazendo previsões nos dados de teste de 2025...")
    y_pred_test = model.predict(X_sequences_test).flatten()

    test_loss, test_mae, test_mse = model.evaluate(X_sequences_test, y_sequences_test, verbose=0)
    print(f"Perda de Teste (MSE) em 2025: {test_loss:.4f}")
    print(f"Erro Médio Absoluto de Teste (MAE) em 2025: {test_mae:.4f}")

    plt.figure(figsize=(10, 6))
    plt.scatter(y_sequences_test, y_pred_test, alpha=0.6)
    # Ideal line (y = x) spanning the range of the true values.
    ideal_range = [y_sequences_test.min(), y_sequences_test.max()]
    plt.plot(ideal_range, ideal_range, 'r--', lw=2)
    plt.title(f'Valores Reais vs. Preditos (Teste 2025 - Piloto: {DRIVER_TO_ANALYZE})')
    plt.xlabel('Tempo de Volta Real (Escalado)')
    plt.ylabel('Tempo de Volta Predito (Escalado)')
    plt.grid(True)
    plt.show()
else:
    print("Não foi possível gerar previsões ou avaliar o modelo para 2025 devido à falta de dados de teste suficientes.")
