In [None]:
# %pip install scikeras[tensorflow] --user -q
# %pip install PythonMETAR -q
# %pip install xgboost -q
# %pip install opencv-python -q
# %pip install cartopy

In [None]:
from datetime import datetime, timedelta
from joblib import dump, load
from time import time, sleep
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA, SparsePCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from tensorflow import keras, test, random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from keras.losses import MeanSquaredError
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adamax, Adadelta, Adagrad, RMSprop
from sklearn.base import BaseEstimator, TransformerMixin
from PythonMETAR import Metar
from xgboost import XGBRegressor
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, \
    ReduceLROnPlateau, TerminateOnNaN, BackupAndRestore

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import requests as req
import cv2
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os
import re

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

pd.set_option('mode.chained_assignment', None)

In [None]:
# Report whether TensorFlow can see a GPU.
# An empty device name means "no GPU"; a failure while probing is reported the
# same way instead of being raised (and instead of abusing a bare `raise`).
try:
    device_name = test.gpu_device_name()
except Exception:
    device_name = ''

if device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')

## Main variables

In [None]:
# values below 2000 records generate inconsistencies in the logic of feature engineer
training_sample_size = 1 # 0 to 1 (0% to 100%); multiplied by the row count when sub-sampling the training data
fold_cross_validation = 5
ann_epochs = 1000
pca_n_components = 25
pca_n_components_mxgb = 10
xgb_estimators = 1000000
verbose = 0

In [None]:
# Single seed shared by NumPy, TensorFlow and the seeded train/test split.
seed = 25

# NumPy (global legacy RNG)
np.random.seed(seed)

# TensorFlow (`random` here is `tensorflow.random`, not the stdlib module)
random.set_seed(seed)

## Preprocessing

In [None]:
# Raw input columns consumed by the preprocessing pipeline.
categorical_selected_cols = ['origem', 'destino', 'metar', 'hora_ref', 'path', 'snapshot_radar']
numerical_selected_cols = ['troca', 'esperas']

# Order matters: categorical columns first, then numerical ones.
selected_cols = categorical_selected_cols + numerical_selected_cols

In [None]:
# Default query window sent to the data API by getData().
paramsDefault = {
    'idate': '2022-06-02', # Start date of the query | Example : YYYY-MM-DD
    'fdate': '2023-05-11' # End date of the query | Example : YYYY-MM-DD
}

### Auxiliary Functions

In [None]:
def getData(route, paramsWithoutToken = paramsDefault, token = 'a779d04f85c4bf6cfa586d30aaec57c44e9b7173',
            max_retries = None, retry_delay = 1, timeout = None):
    """
    Fetches data from a specific API, retrying on failure, and returns it as a DataFrame.

    :param route: String representing the API endpoint.
    :param paramsWithoutToken: Dictionary containing the request parameters without the token.
        Defaults to paramsDefault. The dictionary is copied, never modified.
    :param token: String containing the authentication token. Defaults to a fixed token.
    :param max_retries: Maximum number of retries after the first failed attempt.
        None (default) retries indefinitely, as the function always did.
    :param retry_delay: Seconds to wait between attempts. Default is 1.
    :param timeout: Timeout in seconds forwarded to the HTTP request. None (default) means no timeout.

    :return: A pandas DataFrame containing the data fetched from the API.

    :raises: The last request/decoding error once max_retries is exhausted.
    """

    # NOTE(security): the default token is a credential committed to the notebook;
    # prefer passing `token` explicitly (e.g. read from an environment variable).
    url = f'http://montreal.icea.decea.mil.br:5002/api/v1/{route}'

    # Work on a copy so the token is not injected into the caller's dictionary
    # (by default the module-level paramsDefault).
    params = dict(paramsWithoutToken or {})
    params['token'] = token

    failures = 0

    while True:
        try:
            response = req.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            break

        # Only network/HTTP/decoding problems are retried; KeyboardInterrupt and
        # programming errors propagate instead of spinning forever.
        except (req.exceptions.RequestException, ValueError):
            failures += 1

            if max_retries is not None and failures > max_retries:
                raise

            sleep(retry_delay)

    return pd.DataFrame(data)

def dropOutlier(X, name_cols, lower_percentile=0.05, upper_percentile=0.95):
    """
    Removes rows whose values fall outside a per-route percentile band.

    The route is 'origem' + '_' + 'destino'. For every column in name_cols the
    lower/upper quantiles are computed per route, and a row is kept only if all
    of those columns lie inside [lower, upper] for the row's own route. Rows
    whose route cannot be built (missing 'origem' or 'destino') are kept.

    :param X: DataFrame with 'origem', 'destino' and the columns in name_cols. It is not modified.
    :param name_cols: Iterable of numeric column names to filter on.
    :param lower_percentile: Lower quantile (0-1). Default is 0.05.
    :param upper_percentile: Upper quantile (0-1). Default is 0.95.

    :return: A new DataFrame containing only the rows that were kept.
    """
    # Route key kept in a local Series so no helper column is added to (or
    # removed from) the caller's frame.
    route = X['origem'] + '_' + X['destino']
    has_no_route = route.isna()
    keep_rows = pd.Series(True, index=X.index)

    for col in name_cols:
        # One quantile pass per route, then broadcast back to the rows.
        per_route = X[col].groupby(route.to_numpy())
        lower_bound = route.map(per_route.quantile(lower_percentile))
        upper_bound = route.map(per_route.quantile(upper_percentile))

        in_band = (X[col] >= lower_bound) & (X[col] <= upper_bound)
        keep_rows &= in_band | has_no_route

    return X[keep_rows]



### Create estimate record based on route and reference time

In [None]:

class EstimatedDuration(BaseEstimator, TransformerMixin):
    """
    Estimator that calculates the estimated duration based on route and time of day.

    fit() learns, from the training targets, the mean duration per (route, hour)
    and per route. transform() adds an 'only_hour' column and an
    'estimated_duration' column looked up from those means.

    Attributes:
        _dammy_estimator (DataFrame): Table with columns
            ['route', 'only_hour', 'mean_hour', 'mean_absolute'].
        _selected_cols (list): Column names used to rebuild the input as a DataFrame.
    """

    # Accepted 'hora_ref' layouts, tried in this order.
    _HOUR_FORMATS = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%H:%M:%S")

    def __init__(self, _selected_cols, _dammy_estimator = None):
        """
        Initializes the estimator with selected columns and optionally a dummy estimator.

        :param _selected_cols: List of selected columns.
        :param _dammy_estimator: Optional pre-computed table of means. Default is None.
        """
        self._dammy_estimator = _dammy_estimator
        self._selected_cols = _selected_cols

    def fit(self, X, y=None):
        """
        Learns the mean duration per (route, hour) and per route.

        :param X: Input data (array or DataFrame) whose columns match _selected_cols.
        :param y: Durations (Series, array or list). If None, nothing is learned.
        :return: Returns the fitted object.
        """

        if y is None:
            return self

        X = pd.DataFrame(X, columns = self._selected_cols)
        # np.asarray accepts Series, arrays and lists alike and assigns by
        # position, ignoring any index carried by y.
        X['duration'] = np.asarray(y)
        X['route'] = X['origem'] + '_' + X['destino']
        X['only_hour'] = X['hora_ref'].map(self._parse_hour)

        mean_by_route_hour = (
            X.groupby(['route', 'only_hour'], as_index=False)['duration']
            .mean()
            .rename(columns={'duration': 'mean_hour'})
        )
        mean_by_route = (
            X.groupby('route', as_index=False)['duration']
            .mean()
            .rename(columns={'duration': 'mean_absolute'})
        )

        # Column order (route, only_hour, mean_hour, mean_absolute) is relied
        # upon by the positional lookups below.
        self._dammy_estimator = mean_by_route_hour.merge(mean_by_route, on=['route'], how='left')

        return self

    def transform(self, X, y=None):
        """
        Adds 'only_hour' and 'estimated_duration' to the data.

        :param X: Input data (array or DataFrame) whose columns match _selected_cols.
        :param y: Ignored.
        :return: DataFrame with the selected columns plus 'only_hour' and 'estimated_duration'.
        """

        X = pd.DataFrame(X, columns = self._selected_cols)
        X['route'] = X['origem'] + '_' + X['destino']
        X['only_hour'] = X['hora_ref'].map(self._parse_hour)

        # Build the lookup tables once instead of filtering the estimator
        # table again for every row.
        lookups = self._build_lookups()
        X['estimated_duration'] = [
            self._dammy_predict(route, hour, lookups)
            for route, hour in zip(X['route'], X['only_hour'])
        ]

        return X.drop('route', axis = 1)

    def _build_lookups(self):
        """
        Converts the estimator table into two dictionaries.

        :return: Tuple (means keyed by (route, hour), means keyed by route).
                 Both are empty when no estimator table is available.
        """

        hour_means, route_means = {}, {}
        table = self._dammy_estimator

        if table is None:
            return hour_means, route_means

        # Positions 2 and 3 hold the per-hour and per-route means; setdefault
        # keeps the first row for a key, matching a "first match wins" lookup.
        for route, hour, mean_hour, mean_absolute in zip(
            table['route'], table['only_hour'], table.iloc[:, 2], table.iloc[:, 3]
        ):
            hour_means.setdefault((route, hour), mean_hour)
            route_means.setdefault(route, mean_absolute)

        return hour_means, route_means

    def _dammy_predict(self, route, hour, lookups=None):
        """
        Predicts the duration based on route and time of day.

        :param route: Desired route.
        :param hour: Hour of the day.
        :param lookups: Optional result of _build_lookups(); built on demand when omitted.
        :return: Mean for (route, hour) if known, else the route mean, else 0.
        """

        if lookups is None:
            lookups = self._build_lookups()

        hour_means, route_means = lookups

        if (route, hour) in hour_means:
            return hour_means[(route, hour)]

        return route_means.get(route, 0)

    def _parse_hour(self, value):
        """
        Extracts the hour from a 'hora_ref' string.

        :param value: Timestamp string in one of the _HOUR_FORMATS layouts.
        :return: Hour as int.
        :raises ValueError: If the string matches none of the known layouts.
        """

        for layout in self._HOUR_FORMATS:
            try:
                return datetime.strptime(value, layout).hour
            except ValueError:
                continue

        raise ValueError(f"Unrecognized 'hora_ref' format: {value!r}")

    def _gethourFromDatetime(self, row):
        """
        Extracts the hour from the 'hora_ref' field of a row.

        :param row: Row from the DataFrame (anything indexable by 'hora_ref').
        :return: Extracted hour.
        """

        return self._parse_hour(row['hora_ref'])

    def get_params(self, deep=True):
        """
        Returns the estimator's parameters.

        :param deep: Ignored. Retained for compatibility with sklearn's interface.
        :return: Dictionary with the parameters.
        """

        return {
            "_dammy_estimator": self._dammy_estimator,
            "_selected_cols": self._selected_cols
        }

    def set_params(self, **params):
        """
        Sets the estimator's parameters.

        :param params: Dictionary with the new parameters.
        :return: Returns the object with updated parameters.
        :raises ValueError: If a key is not one of the estimator's parameters.
        """

        valid_params = self.get_params(deep=True)

        for key, value in params.items():
            if key not in valid_params:
                raise ValueError(f"Invalid parameter {key} for estimator {self.__class__.__name__}. Check the list of available parameters with `estimator.get_params().keys()`.")

            setattr(self, key, value)

        return self

### Obtains geometric information (linearity) of the path taken

In [None]:
class SnapshotRadarHandler(BaseEstimator, TransformerMixin):
    """
    Handler for Radar Snapshots (SnapshotRadar).

    Derives a 'route_linearity' feature from the coordinate pairs found in the
    'snapshot_radar' text column: the R^2 of a straight-line fit through the
    points. It follows the Scikit-Learn transformer interface.

    Methods:
    - fit: No-op; the transformer is stateless.
    - transform: Replaces 'snapshot_radar' by 'route_linearity'.

    Attributes:
        plot (bool): When True, each trajectory is drawn on a map while transforming.
    """

    # "<number> <number>" pairs with a mandatory decimal part.
    _COORD_PATTERN = re.compile(r'(-?\d+\.\d+) (-?\d+\.\d+)')

    def __init__(self, plot=False):
        """
        :param plot: Whether to plot every trajectory (debugging aid). Default is False.
        """
        self.plot = plot

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """
        Transforms the input data by adding the route linearity metric.

        Parameters:
        - X : DataFrame
            Input data containing the 'snapshot_radar' column with coordinates.
            It is not modified.
        - y : array-like, default=None
            Target labels. Not used, but retained for compatibility.

        Returns:
        - DataFrame
            Copy of X with 'route_linearity' appended and 'snapshot_radar' removed.
        """
        # Work on a copy so the caller's frame keeps 'snapshot_radar' and the
        # step can be re-run on the same input.
        X = X.copy()
        X['route_linearity'] = [self._route_linearity(text) for text in X['snapshot_radar']]

        return X.drop('snapshot_radar', axis = 1)

    def _route_linearity(self, multipoint_str):
        """
        Computes the R^2 of a linear fit (first coordinate explained by the second).

        :param multipoint_str: Text containing "<a> <b>" coordinate pairs.
        :return: R^2 score, or 1.0 when fewer than two points are present.
        """
        coords = self._COORD_PATTERN.findall(multipoint_str)

        if len(coords) < 2:
            # Too few points to fit a line: treated as perfectly linear.
            return 1.0

        # NOTE(review): the radians->degrees conversion is kept from the original
        # code; confirm the unit of the stored coordinates.
        points = np.degrees(np.array(coords, dtype=float))
        lon = points[:, 0]
        lat = points[:, 1]

        features = lat.reshape(-1, 1)
        score = LinearRegression().fit(features, lon).score(features, lon)

        if self.plot:
            self._plot_route(lon, lat, score)

        return score

    def _plot_route(self, lon, lat, score):
        """Draws the trajectory points on a map titled with the R^2 score."""
        # Create figure and axes
        fig, ax = plt.subplots(figsize=(7, 7), subplot_kw={'projection': ccrs.PlateCarree()})
        ax.set_extent([-55, -33, -32, 0])
        ax.add_feature(cfeature.BORDERS, linestyle=':')
        ax.add_feature(cfeature.COASTLINE)
        ax.add_feature(cfeature.LAND, edgecolor='black')
        ax.set_title(
            f"Coefficient of Determination (R²): {score:.3f}"
        )

        for n in range(len(lat)):
            ax.plot(lon[n], lat[n], 'ro', markersize=1)

        plt.show()
        plt.close()

### Create features based on METAR translation

In [None]:
class MetarHandler(BaseEstimator, TransformerMixin):
    """
    Transformer that translates METAR information into numerical features.

    For each row the METAR report is parsed and five float columns are added:
    'wind_variation', 'qnh', 'cavok', 'gust' and 'FEW_presenceCB'. The raw
    'metar' column is removed.
    """

    # Order in which the new columns are appended to the frame.
    _FEATURE_COLUMNS = ('wind_variation', 'qnh', 'cavok', 'gust', 'FEW_presenceCB')

    def fit(self, X, y=None):
        """
        Fits the transformer to the dataset. In this case, the method is passive and doesn't perform operations.

        :param X: Input DataFrame.
        :param y: Ignored.
        :return: Returns the fitted object.
        """

        return self

    def transform(self, X, y=None):
        """
        Extracts METAR information and converts it into numerical features.

        :param X: Input DataFrame containing 'destino' and 'metar' columns. It is not modified.
        :param y: Ignored.
        :return: Copy of X with the numerical METAR features appended and 'metar' removed.
        """

        X = X.copy()

        # Collect the features positionally and assign whole columns at once:
        # this avoids per-cell .loc writes (slow, and ambiguous when the index
        # has duplicated labels).
        rows = [
            self._metar_features(destino, metar)
            for destino, metar in zip(X['destino'], X['metar'])
        ]
        values = np.array(rows, dtype=float).reshape(-1, len(self._FEATURE_COLUMNS))

        for position, name in enumerate(self._FEATURE_COLUMNS):
            X[name] = values[:, position]

        return X.drop('metar', axis = 1)

    def _metar_features(self, destino, metar):
        """
        Parses one METAR report.

        :param destino: Airport code passed to the METAR parser.
        :param metar: Raw METAR text.
        :return: Tuple of floats ordered as _FEATURE_COLUMNS.
        """

        report = Metar(destino, metar)

        cavok = float('CAVOK' in metar)
        wind_variation = 0.0
        qnh = 0.0
        few_presence_cb = 0.0

        if report.wind:
            wind_variation = float(report.wind['variation'] is not None)
            gust = float(report.wind['gust'] is not None)
        else:
            # No parsed wind information means a METAR such as VRB07G17KT
            # (variable wind with gust), so gust is flagged.
            gust = 1.0

        if report.qnh:
            qnh = float(report.qnh)

        if report.cloud:
            for formation in report.cloud:
                # When several FEW layers exist, the last one decides the value.
                if formation['code'] == 'FEW':
                    few_presence_cb = float(formation['presenceCB'])

        return wind_variation, qnh, cavok, gust, few_presence_cb

### Obtains meteorology image of the flight path

In [None]:
class SatelliteImageHandler(BaseEstimator, TransformerMixin):
    """
    Handler for satellite images for feature extraction and transformation.

    For every satellite image and every route between the known airports, the
    strip of the image along the route is rectified and the mean intensity of
    its red, yellow, green and blue regions is stored. transform() joins those
    values to the input rows by ('hora_ref', route).

    Attributes:
        _restore (str): Path of a CSV with previously computed route image data.
        _width (int): Width (in pixels) of the strip taken around each route.
        _outputWidth (int): Output image width.
        _outputHeight (int): Output image height.
        _printRoutes (bool): Indicates if the route outlines should be plotted.
        _printImage (bool): Indicates if each extracted strip should be plotted.
        _imagesRouteData (DataFrame): Route image data.
        _itineraries (list): List of itineraries ("ORIGIN_DESTINATION").
        _date (dict): Date range.
        _location (dict): Airport pixel locations on the satellite image.
    """

    # HSV bounds per colour; a colour's mask is the union of its ranges.
    _HSV_RANGES = {
        'red': (((0, 50, 50), (12, 255, 255)), ((150, 50, 50), (180, 255, 255))),
        'yellow': (((22, 50, 50), (30, 255, 255)), ((31, 50, 50), (38, 255, 255))),
        'green': (((40, 50, 50), (70, 255, 255)), ((71, 50, 50), (80, 255, 255))),
        'blue': (((100, 50, 50), (120, 255, 255)), ((121, 50, 50), (140, 255, 255))),
    }

    # Columns of _imagesRouteData, in order.
    _OUTPUT_COLUMNS = ['route', 'hora_ref', 'imageSatelite_red', 'imageSatelite_yellow',
                       'imageSatelite_green', 'imageSatelite_blue']

    def __init__(self, _restore=None, _width=50, _outputWidth=128,
                 _outputHeight=32, _printRoutes=False, _printImage=False):
        """
        Initializes the image handler with the specified parameters.

        :param _restore: Path to restore the route image data.
        :param _width: Width of the strip around each route.
        :param _outputWidth: Output image width.
        :param _outputHeight: Output image height.
        :param _printRoutes: Whether the routes should be plotted.
        :param _printImage: Whether the extracted images should be plotted.
        """

        self._restore = _restore
        self._width = _width
        self._outputWidth = _outputWidth
        self._outputHeight = _outputHeight
        self._printRoutes = _printRoutes
        self._printImage = _printImage
        self._imagesRouteData = None
        self._date = {
            'idate': '2022-06-02',
            'fdate': '2023-05-11'
        }

        self._location = {
            'SBGR': [1665, 1459],
            'SBCF': [1722, 1376],
            'SBRJ': [1739, 1447],
            'SBPA': [1557, 1614],
            'SBSV': [1853, 1217],
            'SBFL': [1618, 1562],
            'SBRF': [1932, 1106],
            'SBBR': [1632, 1286],
            'SBCT': [1599, 1509],
            'SBSP': [1662, 1469],
            'SBKP': [1648, 1453],
            'SBGL': [1740, 1446]
        }

        # Every ordered pair of distinct airports.
        self._itineraries = [
            f"{aero_from}_{aero_to}"
            for aero_from in self._location
            for aero_to in self._location
            if aero_from != aero_to
        ]

    def fit(self, X, y=None):
        """
        Loads the route image data from _restore, or computes it from the satellite images.

        :param X: Input DataFrame (unused).
        :param y: Ignored.
        :return: Returns the fitted object.
        """

        if self._restore:
            self._imagesRouteData = pd.read_csv(self._restore)
            return self

        records = []
        df_sat_met = getData(route = 'satelite', paramsWithoutToken = self._date)

        output_size = (self._outputWidth, self._outputHeight)
        output_corners = np.array([[0, 0], [self._outputWidth, 0],
                                   [self._outputWidth, self._outputHeight],
                                   [0, self._outputHeight]], dtype=np.float32)

        # The route outlines are drawn only on the first usable image.
        first_cycle = True
        routes_overlay = None

        for _, row in df_sat_met.iterrows():

            print(f"date: {row['data']}")

            image_rgb = self._fetch_image(row)

            if image_rgb is None:
                print(f"could not decode the image for {row['data']}; skipping it")
                continue

            if first_cycle:
                routes_overlay = image_rgb.copy()

            for itinerary in self._itineraries:
                corners = self._route_corners(itinerary)

                if first_cycle:
                    outline = corners.astype(np.int32).reshape((-1, 1, 2))

                    # Draw the route outline in red (RGB) with a 4 px line.
                    cv2.polylines(routes_overlay, [outline], isClosed=True, color=(255, 0, 0), thickness=4)

                # Rectify the strip along the route into a fixed-size image.
                matrix = cv2.getPerspectiveTransform(corners.astype(np.float32), output_corners)
                strip = cv2.warpPerspective(image_rgb, matrix, output_size)

                masked = self._mask_colors(strip)
                mean_r = masked['red'].mean()
                mean_y = masked['yellow'].mean()
                mean_g = masked['green'].mean()
                mean_b = masked['blue'].mean()

                records.append((itinerary, row['data'], mean_r, mean_y, mean_g, mean_b))

                if self._printImage:
                    gap_size = 1

                    # Stack the strip and its colour masks vertically, separated by gaps.
                    concatenated_image = np.vstack((
                        self._add_gap(strip, gap_size),
                        self._add_gap(masked['red'], gap_size),
                        self._add_gap(masked['yellow'], gap_size),
                        self._add_gap(masked['green'], gap_size),
                        masked['blue']
                    ))

                    plt.figure(figsize=(10,50)) # 5, 25
                    plt.imshow(concatenated_image)
                    plt.title(f"{itinerary} - {row['data']} r: {mean_r: .1f} y: {mean_y: .1f} g: {mean_g: .1f} b: {mean_b: .1f}", fontsize=6)
                    plt.axis('off')
                    plt.show()
                    plt.close()

            if first_cycle:
                if self._printRoutes:
                    plt.figure(figsize=(16, 16)) # 8,8
                    plt.imshow(routes_overlay)
                    plt.show()
                    plt.close()

                # Must be cleared regardless of _printRoutes; otherwise the overlay
                # is copied and redrawn for every single image.
                first_cycle = False

        self._imagesRouteData = pd.DataFrame(records, columns=self._OUTPUT_COLUMNS)

        return self

    def transform(self, X, y=None):
        """
        Joins the route image features to the rows of X.

        :param X: DataFrame with 'origem', 'destino', 'hora_ref' and 'path'. It is not modified.
        :param y: Ignored.
        :return: New DataFrame with the image features appended, missing values
                 filled with 0 and 'route', 'path', 'hora_ref' removed.
        """
        keys = ['hora_ref', 'route']

        # assign() works on a copy, so no 'route' column leaks into the caller's frame.
        X = X.assign(route=X['origem'] + '_' + X['destino'])

        # A left merge against duplicated keys would multiply rows and break the
        # alignment with the target; keep one record per (hora_ref, route).
        images = self._imagesRouteData.drop_duplicates(subset=keys)

        X = X.merge(images, on=keys, how='left')
        X = X.fillna(0)

        return X.drop(['route', 'path', 'hora_ref'], axis = 1)

    def _fetch_image(self, row):
        """
        Downloads and decodes the image referenced by row['path'].

        Network failures are retried every 5 seconds until the download succeeds.

        :param row: Record with 'path' (image URL) and 'data' (used in messages).
        :return: RGB image array, or None when the payload is empty or not an image.
        """
        while True:
            try:
                payload = req.get(row['path']).content
                break
            except req.exceptions.RequestException:
                print(f"requisição falhou em {row['data']}. Aguardar 5 segs e retomar deste ponto")
                sleep(5)

        if not payload:
            return None

        encoded = np.asarray(bytearray(payload), dtype=np.uint8)
        image_bgr = cv2.imdecode(encoded, cv2.IMREAD_COLOR)

        if image_bgr is None:
            return None

        # OpenCV decodes as BGR; convert to RGB.
        return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    def _route_corners(self, itinerary):
        """
        Computes the four corners of the strip around a route.

        :param itinerary: "ORIGIN_DESTINATION" string.
        :return: Array of shape (4, 2): the strip extends _width/2 beyond both
                 airports and _width/2 to each side of the route.
        """
        [origin, destiny] = itinerary.split('_')
        p_from = np.array(self._location[origin])
        p_to = np.array(self._location[destiny])

        # Unit vector along the route and its perpendicular.
        v = p_to - p_from
        v = v / np.linalg.norm(v)
        v_perp = np.array([v[1], -v[0]])

        padding = self._width/2
        p1 = p_from - v * padding + v_perp * padding
        p2 = p_to + v * padding + v_perp * padding
        p3 = p_to + v * padding - v_perp * padding
        p4 = p_from - v * padding - v_perp * padding

        return np.array([p1, p2, p3, p4])

    def _mask_colors(self, strip):
        """
        Isolates each colour of _HSV_RANGES in an RGB image.

        :param strip: RGB image array.
        :return: Dict colour name -> copy of the image with non-matching pixels zeroed.
        """
        hsv = cv2.cvtColor(strip, cv2.COLOR_RGB2HSV)
        masked = {}

        for color, ranges in self._HSV_RANGES.items():
            mask = None

            for lower, upper in ranges:
                partial = cv2.inRange(hsv, np.array(lower), np.array(upper))
                mask = partial if mask is None else cv2.bitwise_or(mask, partial)

            masked[color] = cv2.bitwise_and(strip, strip, mask=mask)

        return masked

    def _add_gap(self, image, gap_size, gap_color=(255, 255, 255)):
        """Appends gap_size rows of gap_color below a 3-channel image."""
        return np.vstack((image, np.full((gap_size, image.shape[1], 3), gap_color, dtype=np.uint8)))

### MXGB Remove Constant Attribute

In [None]:
class RemoveConstantAttribute(BaseEstimator, TransformerMixin):
    """Transformer that removes the 'origem' and 'destino' columns."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """
        :param X: DataFrame containing 'origem' and 'destino'. It is not modified.
        :param y: Ignored.
        :return: New DataFrame without the 'origem' and 'destino' columns.
        """
        # Return a new frame instead of dropping in place on the caller's data.
        return X.drop(['origem', 'destino'], axis = 1)

### Plot Boxplot

In [None]:
def plot_duration_boxplot(data, x_col, y_col, figsize=(8, 50)):
    """
    Draws a seaborn boxplot of `x_col` against `y_col` and shows it.

    :param data: DataFrame holding both columns.
    :param x_col: Column plotted on the x axis.
    :param y_col: Column plotted on the y axis.
    :param figsize: Figure size in inches. Default is (8, 50).
    """
    box_style = dict(
        showcaps=True,
        flierprops={"marker": "x"},
        boxprops={"facecolor": (.3, .5, .7, .5)},
        medianprops={"color": "r", "linewidth": 1},
        whis=1.5,
    )

    _, axes = plt.subplots(figsize=figsize)
    sns.boxplot(data=data, x=x_col, y=y_col, **box_style)

    # Grid on both axes, no y label, trimmed spines and no legend box.
    for axis in (axes.xaxis, axes.yaxis):
        axis.grid(True)

    axes.set(ylabel="")
    sns.despine(trim=True, left=True)
    plt.legend([], [], frameon=False)

    plt.show()
    plt.close()

def adjust_timestamp(ts):
    """
    Converts a millisecond epoch timestamp to a naive UTC datetime truncated to the hour.

    :param ts: Timestamp in milliseconds since the Unix epoch.
    :return: Naive datetime (UTC wall clock) with minute, second and microsecond set to 0.
    """
    # Local import: the notebook's import cell only brings in datetime/timedelta.
    from datetime import timezone

    # datetime.utcfromtimestamp is deprecated; build an aware UTC value and drop
    # the tzinfo to keep returning a naive datetime.
    moment = datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
    return moment.replace(tzinfo=None, minute=0, second=0, microsecond=0)

def timestamp_to_string(timestamp):
    """
    Formats an epoch timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS.mmm' in local time.

    :param timestamp: Seconds since the Unix epoch.
    :return: Formatted string with millisecond precision.
    """
    moment = datetime.fromtimestamp(timestamp)
    millis = moment.microsecond // 1000

    return f"{moment:%Y-%m-%d %H:%M:%S}.{millis:03d}"

def string_to_timestamp(date_str):
    """
    Parses a 'YYYY-MM-DD HH:MM:SS.ffffff' string (local time) into an epoch timestamp.

    :param date_str: Date string with a fractional-seconds part.
    :return: Seconds since the Unix epoch as a float.
    """
    return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()

## Pipeline

### Imputer

In [None]:
# Fill missing values: most frequent value for the categorical columns, 0 for
# the numerical ones. The column lists are the ones declared in the
# preprocessing section, so the output order (categorical, then numerical)
# always matches `selected_cols`, which the feature engineering steps rely on.
imputer = ColumnTransformer(
    transformers=[
        (
            'imputer_cat',
            SimpleImputer(strategy='most_frequent'),
            categorical_selected_cols
        ),
        (
            'imputer_num',
            SimpleImputer(strategy='constant', fill_value=0),
            numerical_selected_cols
        )
    ],
    remainder='passthrough'
)

### Feature Engineering

In [None]:
# Feature engineering steps, applied in order: estimated duration, radar
# snapshot linearity, METAR features and satellite image features (the latter
# restored from a CSV file instead of being recomputed).
feature_engineer = Pipeline(steps=[
    ('feat_eng_est_duration', EstimatedDuration(selected_cols)),
    ('feat_eng_snapshot_radar', SnapshotRadarHandler()),
    ('feat_eng_metar', MetarHandler()),
    ('feat_eng_satelliteImage', SatelliteImageHandler(_restore='../data/imagesRouteData.csv'))    
])

### Encode Categorical Variables

In [None]:
# One-hot encode the origin/destination airports; categories unseen during fit
# are ignored at transform time and every other column passes through.
coding_category_var = ColumnTransformer(
    transformers=[
        (
            'coding_cat',
            OneHotEncoder(handle_unknown='ignore'),
            ['origem', 'destino']
        ),
    ],
    remainder='passthrough'
)

### Transformation

In [None]:
# Final numeric transformation: standardization only.
# The normalizer and PCA steps are currently disabled.
transformer = Pipeline(steps=[
    ('trans_stand', StandardScaler(with_mean=True)),
    # ('trans_norm', Normalizer()),
    # ('trans_pca', PCA(n_components=pca_n_components))
])

### Preprocessor

In [None]:
# Full preprocessing pipeline: imputation -> feature engineering ->
# categorical encoding -> scaling.
preprocessor = Pipeline(steps=[
    ('imputer', imputer),
    ('feature_engineer', feature_engineer),
    ('coding_category_var', coding_category_var),
    ('transformer', transformer)
])


# Display the pipeline.
preprocessor

## Data

In [None]:
# Raw training data, indexed by flight id.
data_train_origin = pd.read_csv("../data/idsc_train.csv", index_col='flightid')

data_train_origin.sample(3)

### Training and Validation Set

In [None]:
# Keep, per route, only the flights whose duration lies between the 10th and
# 90th percentiles of that route.
data_train_without_outlier = dropOutlier(
    data_train_origin,
    name_cols = ['duration'],
    lower_percentile=0.1,
    upper_percentile=0.9
)

In [None]:
# Sub-sample the outlier-free data according to `training_sample_size`.
# train_size is passed as an absolute row count, so it must be an int: a float
# >= 1 (the result of `rows * fraction - 1` for a fractional sample size) is
# rejected by train_test_split. One row is always left out because the split
# needs a non-empty complement.
n_train_rows = int(data_train_without_outlier.shape[0] * training_sample_size) - 1

data_train, _ = train_test_split(
    data_train_without_outlier,
    train_size=max(n_train_rows, 1),
    random_state=seed
)

In [None]:
# Separate target from predictors
y = data_train.duration
X = data_train.drop(['duration'], axis=1)

# Divide data into training and validation subsets.
# random_state is fixed (like the sub-sampling split) so that re-running this
# cell alone yields the same partition instead of depending on how much of the
# global NumPy random state has already been consumed.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, random_state=seed
)

# Keep selected columns only
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()

#### For clarity, the pipeline will be applied to the training data in intermediate steps

In [None]:
# Duration distribution per route, on a copy of the raw training data.
aux_origin = data_train_origin.assign(
    route=data_train_origin['origem'] + '_' + data_train_origin['destino']
)

plot_duration_boxplot(aux_origin, "duration", "route")

### Imputation of missing data

In [None]:
# Fit the imputer on the training features and time the step.
start = time()

step_a = preprocessor.named_steps['imputer'].fit_transform(X_train, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

# The imputer returns an array; re-attach the column names for display.
pd.DataFrame(step_a, columns = X_train.columns).sample(3)

In [None]:
# Count the missing values left in each column after imputation.
pd.DataFrame(step_a, columns = X_train.columns).isnull().sum()

### Create an estimate duration feature

In [None]:
# Fit and apply the estimated-duration step on the imputed data, timing it.
start = time()

step_b = preprocessor.named_steps['feature_engineer'].named_steps['feat_eng_est_duration'].fit_transform(step_a, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

step_b.sample(3)

### create the "route_linearity" attribute (CAT-62 Feature Engineer)

In [None]:
# Apply the radar snapshot step (adds 'route_linearity'), timing it.
start = time()

step_c = preprocessor.named_steps['feature_engineer'].named_steps['feat_eng_snapshot_radar'].fit_transform(step_b, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

step_c.sample(3)

### create the "wind_variation", "qnh", "cavok", "gust" and "FEW_presenceCB" attribute (METAR Feature Engineer)

In [None]:
# Apply the METAR step (adds the METAR-derived features), timing it.
start = time()

step_d = preprocessor.named_steps['feature_engineer'].named_steps['feat_eng_metar'].fit_transform(step_c, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

step_d.sample(3)

### create the "imageSatelite_red", "imageSatelite_yellow", "imageSatelite_green" and "imageSatelite_blue" attribute (meteorological satellite Feature Engineer)

In [None]:
# Apply the satellite image step (adds the imageSatelite_* features), timing it.
start = time()

step_e = preprocessor.named_steps['feature_engineer'].named_steps['feat_eng_satelliteImage'].fit_transform(step_d, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

step_e.sample(3)

### coding of categorical variables

In [None]:
# One-hot encode the categorical columns, timing the step.
start = time()

step_f = preprocessor.named_steps['coding_category_var'].fit_transform(step_e, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

pd.DataFrame(step_f).sample(3)

### standardization of variables to the same scale (unit standard deviation and zero mean)

In [None]:
# Standardize the encoded features; the result is the final training matrix.
start = time()

_X_train = preprocessor.named_steps['transformer'].fit_transform(step_f, y_train)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

pd.DataFrame(_X_train).sample(3)

### Validation Set

In [None]:
# Transform the validation set with the pipeline whose steps were fitted above.
start = time()

_X_valid = preprocessor.transform(X_valid)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")



### Test Set

In [None]:
# Load the test set (semicolon-separated, indexed by flight id) and transform
# it with the pipeline whose steps were fitted above.
path_data_test = '../data/idsc_dataset.csv'

data_test = pd.read_csv(path_data_test, index_col = 'flightid', delimiter=';')

X_test = data_test[selected_cols].copy()

start = time()

_X_test = preprocessor.transform(X_test)

end = time()

print(f"Transforming Time: {(end-start)/60:.2f} minutes")

## Models

## Dummy Model

In [None]:
# Table of mean durations held by the estimated-duration step of the pipeline.
dummyModel = preprocessor.named_steps['feature_engineer'].named_steps['feat_eng_est_duration']._dammy_estimator

In [None]:
# Reuse the same step's hour parser (takes a row with a 'hora_ref' field).
gethourFromDatetime = preprocessor.named_steps['feature_engineer'].named_steps['feat_eng_est_duration']._gethourFromDatetime

In [None]:
# Working copy of the test set with 'flightid' as a regular column, plus the
# route key, an empty prediction column and the hour of 'hora_ref'.
aux = X_test.copy().reset_index()

aux['route'] = aux['origem'] + '_' + aux['destino']
aux['estimated'] = None
aux['only_hour'] = aux.apply(lambda row: gethourFromDatetime(row), axis = 1)

### Inference

In [None]:
# Lookup tables built once from the dummy model: mean duration per
# (route, hour) and per route. setdefault keeps the first row of each key.
hour_means = {}
route_means = {}

for model_route, model_hour, model_mean_hour, model_mean_absolute in zip(
    dummyModel['route'], dummyModel['only_hour'],
    dummyModel['mean_hour'], dummyModel['mean_absolute']
):
    hour_means.setdefault((model_route, model_hour), model_mean_hour)
    route_means.setdefault(model_route, model_mean_absolute)

# Prefer the mean of the same route and hour; fall back to the route mean when
# that combination is unknown (or NaN), and to 0 when the route itself is unknown.
# (The previous condition was inverted: it used the hour mean only when it was NaN.)
for index, row in aux.iterrows():
    estimate = hour_means.get((row['route'], row['only_hour']))

    if estimate is None or np.isnan(estimate):
        estimate = route_means.get(row['route'], 0)

    aux.loc[index, 'estimated'] = estimate

### Submission

In [None]:
# Build the submission frame (ID, solution) and write it to a timestamped CSV.
submission_dir = "../data/submission"

# to_csv does not create missing directories.
os.makedirs(submission_dir, exist_ok=True)

submission_dummy = aux[['flightid', 'estimated']].rename(
    columns={'flightid': 'ID', 'estimated': 'solution'}
)
submission_dummy.to_csv(f"{submission_dir}/idsc_submission_dummy{datetime.now().strftime('%d-%B-%Ih%Mmin')}.csv", index=False)

submission_dummy.head(5)


## Artificial Neural Network

In [None]:
def create_model(input_shape = (_X_train.shape[1],),
                 neuron_number = 128,
                 optimizer = 'adam',
                 checkpoint = None,
                 activation ='relu',
                 layers=1,
                 dropout=0
                ):
    
    """
    Build and compile a fully-connected Keras regression network.

    Architecture:
        Input -> `layers` x [Dense(neuron_number) -> Dropout(dropout)]
              -> Dense(neuron_number // 2) -> Dense(1, linear)

    :param input_shape: tuple, shape of one input sample. The default is
        `(_X_train.shape[1],)`, evaluated once when this cell is executed, so it
        reflects the `_X_train` that existed at definition time.
    :param neuron_number: int, units in each hidden block; the layer before the
        output uses `neuron_number // 2` units. Default is 128.
    :param optimizer: optimizer name or instance passed to `model.compile`. Default is 'adam'.
    :param checkpoint: str or None, path to saved weights. Weights are loaded only
        when a path is given and it exists on disk. Default is None.
    :param activation: str, activation of every hidden Dense layer. Default is 'relu'.
    :param layers: int, number of Dense + Dropout blocks placed before the
        half-width layer. Default is 1.
    :param dropout: float, rate of the Dropout layer that follows each hidden block
        (a Dropout layer is added even when the rate is 0). Default is 0.

    :return: Sequential, the model compiled with a mean-squared-error loss.
    """

    model = Sequential()
    model.add(Input(shape=input_shape))
                    
    #################################################################
                    
    for _ in range(layers):
        model.add(Dense(neuron_number, activation=activation ))
        model.add(Dropout(dropout))
        
    #################################################################
                        
    model.add(Dense(neuron_number//2, activation=activation ))
    model.add(Dense(1, activation='linear' ))

    # Restore weights before compiling; silently skipped when the path is missing.
    if checkpoint is not None and os.path.exists(checkpoint):
        model.load_weights(checkpoint)
        
    model.compile(loss= MeanSquaredError(), optimizer=optimizer)

    return model

### Callbacks

In [None]:
# Keras callbacks shared by every ANN fit. `verbose` comes from an earlier cell.

# TensorBoard logging, one time-stamped directory per notebook run, written once per epoch.
tensorboard = TensorBoard(
    log_dir=f"../logs/{datetime.now().strftime('%d-%B-%Ih%Mmin')}",
    histogram_freq=0,
    write_graph=False,
    write_images=False,
    write_steps_per_second=False,
    update_freq="epoch",
    profile_batch=0,
    embeddings_freq=0,
    embeddings_metadata=None
)

# Save the full model (not only weights) whenever `val_loss` reaches a new minimum.
checkpoint = ModelCheckpoint(
    '../model/checkpoints',
    monitor="val_loss",
    verbose=verbose,
    save_best_only=True,
    save_weights_only=False,
    mode="min",
    save_freq="epoch"
)

# Stop after 20 epochs without `val_loss` improvement and roll back to the best weights.
earlystop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=20,
    verbose=verbose,
    restore_best_weights=True
)

# Multiply the learning rate by 0.1 after 10 epochs without a `val_loss`
# improvement of at least 0.0001.
reduceLr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=10,
    mode="min",
    verbose=verbose,
    min_delta=0.0001,
    min_lr=0
)

# Per-epoch training backup in '../tmp' so an interrupted fit can resume;
# the backup is deleted when training finishes.
BackAndRes = BackupAndRestore(
    backup_dir = '../tmp',
    save_freq="epoch",
    delete_checkpoint=True,
    save_before_preemption=False
)


# Order in which the callbacks are handed to Keras; TerminateOnNaN aborts on a NaN loss.
callbacks = [
    checkpoint,
    earlystop,
    tensorboard,
    reduceLr,
    BackAndRes,
    TerminateOnNaN()
]

### Wrapper Keras

In [None]:
# Keyword arguments unpacked into the KerasRegressor constructor in the next cell.
# The `model__`-prefixed entries use scikeras parameter routing to reach
# `create_model(layers=..., dropout=...)`.
# NOTE(review): the unprefixed keys presumably reach `create_model` because they match
# its argument names — confirm against the scikeras documentation.
create_model_params = {
    'checkpoint': None,
    'neuron_number': 128,
    'activation': 'relu',
    'model__layers': 3,
    'model__dropout': 0.05
}

In [None]:
# Wrap the model-building function in scikeras' KerasRegressor so it can be used as a
# scikit-learn estimator (pipeline step, GridSearchCV).
ann_model = KerasRegressor(model=create_model, callbacks=callbacks, **create_model_params)

### Pipeline

In [None]:
# End-to-end ANN estimator: raw predictors -> preprocessing pipeline -> Keras regressor.
# The step name 'model' is the prefix used by the `model__...` parameters below.
ann_regressor = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ann_model)
    ]
)

### Training

In [None]:
# Hyper-parameter grid for the ANN pipeline. Only one candidate is active
# (batch size 32, Adam with learning rate 0.01); the commented entries are
# alternative optimizers that can be re-enabled to widen the search.
ann_param_grid = {
    'model__batch_size': [32],
    'model__optimizer': [
        Adam(learning_rate=0.01),
        # RMSprop(learning_rate=0.01),
        # SGD(learning_rate=0.01),
        # Adamax(learning_rate=0.01),
        # Adadelta(learning_rate=0.01),
        # Adagrad(learning_rate=0.01)
    ]
}

In [None]:
# Extra keyword arguments passed to `ann_gridSearch.fit`; the `model__` prefix routes
# them to the 'model' step of the pipeline. `ann_epochs` and `verbose` come from an earlier cell.
# NOTE(review): `_X_valid` was produced by the globally fitted preprocessor, while the
# pipeline's preprocessor is fitted again inside the grid search — the validation
# matrix may therefore not match each fold's transformation exactly; TODO confirm.
ann_fit_params = {
    'model__epochs': ann_epochs,
    'model__verbose': verbose,
    'model__shuffle': True,
    'model__steps_per_epoch': None,
    'model__validation_data': (_X_valid, y_valid),
    'model__validation_steps': None,
    'model__validation_batch_size': None,
    'model__validation_freq': 1
}

### Gridsearch and Cross-Validation

In [None]:
# Cross-validated grid search over `ann_param_grid`, scored by negative MSE and run in a
# single process; the best candidate is refitted on the full training data (refit=True).
# `fold_cross_validation` is defined in an earlier cell.
ann_gridSearch = GridSearchCV(
    estimator = ann_regressor,
    param_grid = ann_param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    refit=True,
    cv= fold_cross_validation,
    verbose=2  # fixed at 2 here instead of the shared `verbose` setting
)

In [None]:
# Run the ANN grid search on the raw training predictors (the pipeline preprocesses
# them itself) and report the wall-clock time in minutes.
start = time()

ann_gridSearch.fit(X=X_train, y=y_train, **ann_fit_params)

end = time()

print(f"Training Time: {(end-start)/60:.2f} minutes")

# Summary table of the candidates, best rank first.
pd.DataFrame(ann_gridSearch.cv_results_)[[
    'rank_test_score', 'param_model__batch_size', 'param_model__optimizer',
    'mean_test_score', 'std_test_score', 'mean_fit_time'
]].sort_values(by='rank_test_score').set_index('rank_test_score')

### Best Model

In [None]:
# Pipeline refitted by the grid search with the best-scoring parameters.
ann_best_model = ann_gridSearch.best_estimator_

In [None]:
# Persist the fitted ANN pipeline to disk with joblib.
dump(ann_best_model, '../model/ann_best_model.pkl')

## XGBoost

In [None]:
# Hyper-parameter grid for the XGBoost pipeline (step name 'xgb'). Every entry currently
# holds a single value, so the grid search evaluates exactly one configuration.
# `xgb_estimators` is defined in an earlier cell.
xgb_param_grid = {
    'xgb__booster': ['gbtree'], # 'gbtree', 'dart', 'gblinear'
    'xgb__n_estimators': [xgb_estimators], # usually between 50 and 1000
    
    ##### Parameters for Tree Booster (booster = gbtree) ###########
       
    'xgb__learning_rate': [0.1], # from 0 to 1
    'xgb__gamma': [0],
    'xgb__max_depth': [5],
    'xgb__min_child_weight': [5], # 1
    'xgb__max_delta_step': [0],
    'xgb__subsample': [1],
    'xgb__sampling_method': ['uniform'], # 'uniform', 'gradient_based'
    'xgb__colsample_bytree': [1], # from 0 to 1
    'xgb__colsample_bylevel': [1], # from 0 to 1
    'xgb__colsample_bynode': [1], # from 0 to 1
    'xgb__scale_pos_weight': [1],
    'xgb__grow_policy': ['depthwise'], # 'depthwise', 'lossguide'
    'xgb__max_leaves': [0],
    'xgb__max_bin': [256],
    'xgb__num_parallel_tree': [1],
    'xgb__refresh_leaf': [1],
    
    #################################################################
    
    # Parameters for Dart Booster (booster = dart) ##################
    
    'xgb__sample_type': ['uniform'], # 'uniform', 'weighted'
    'xgb__normalize_type': ['tree'], # 'tree', 'forest'
    'xgb__rate_drop': [0], # from 0 to 1
    'xgb__one_drop': [0], 
    'xgb__skip_drop': [0],
    
    #################################################################
    
    # Parameters for Linear Booster (booster = gblinear) ############

    'xgb__feature_selector': ['cyclic'], # 'cyclic', 'shuffle', 'random', 'greedy', 'thrifty'
    'xgb__top_k': [0]
    
    #################################################################   
}

In [None]:
# Fit-time keyword arguments routed to the 'xgb' step: the already-transformed
# validation set is supplied as XGBoost's evaluation set.
xgb_fit_params = {
    'xgb__eval_set': [(_X_valid, y_valid)]
}

### Gridsearch and Cross-Validation

In [None]:
# Cross-validated grid search over `xgb_param_grid`, scored by negative MSE in a single
# process; the best candidate is refitted on the full training data (refit=True).
# `xgb_regressor` and `fold_cross_validation` are defined in earlier cells.
xgb_gridSearch = GridSearchCV(
    estimator = xgb_regressor,
    param_grid = xgb_param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    refit=True,
    cv=fold_cross_validation,
    verbose=verbose
)

In [None]:
# Run the XGBoost grid search on the raw training predictors and report the
# wall-clock time in minutes.
start = time()

xgb_gridSearch.fit(X=X_train, y=y_train, **xgb_fit_params)

end = time()

print(f"Training Time: {(end-start)/60:.2f} minutes")

# Summary table of the candidates, best rank first.
pd.DataFrame(xgb_gridSearch.cv_results_)[[
    'rank_test_score', 'param_xgb__booster', 'mean_test_score', 
    'std_test_score', 'mean_fit_time'
]].sort_values(by='rank_test_score').set_index('rank_test_score')

### Best Model

In [None]:
# Pipeline refitted by the grid search with the best-scoring parameters.
xgb_best_model = xgb_gridSearch.best_estimator_

In [None]:
# Persist the fitted XGBoost pipeline to disk with joblib.
dump(xgb_best_model, '../model/xgb_best_model.pkl')

## Multiple XGBoost

### Training and Validation Set

In [None]:
# Start from an untouched copy of the original training frame and filter it with
# `dropOutlier` on 'duration' using the 0.1 / 0.9 percentile bounds.
data_train_all = dropOutlier(
    data_train_origin.copy(),
    ['duration'],
    lower_percentile=0.1,
    upper_percentile=0.9
)

# Route key ("<origem>_<destino>") used to train one model per route.
data_train_all['route'] = data_train_all['origem'] + '_' + data_train_all['destino']

### Test Set

In [None]:
# Load the raw test set (semicolon-delimited, indexed by 'flightid').
data_test_all = pd.read_csv(path_data_test, index_col = 'flightid', delimiter=';')

# Route key ("<origem>_<destino>"), matching the key built for the training data.
data_test_all['route'] = data_test_all['origem'] + '_' + data_test_all['destino']

# The per-route split / preprocessing loop that used to follow here was a verbatim copy
# of the "Segment data" cell below, which re-initialises and recomputes every result.
# It is intentionally not repeated in this cell, to avoid doing the work twice.

### Segment data

In [None]:
from sklearn.base import clone

# Per-route containers, all keyed by the route string.
_X_train_mxgb = dict()      # transformed training predictors
_X_valid_mxgb = dict()      # transformed validation predictors
_X_test_mxgb = dict()       # transformed test predictors (only routes present in the test set)
y_train_mxgb = dict()       # training targets
y_valid_mxgb = dict()       # validation targets
preprocessor_mxgb = dict()  # fitted preprocessing pipeline of each route

for route in data_train_all['route'].unique():

    # Only row normalisation is active; StandardScaler, SparsePCA and PCA
    # (with `pca_n_components_mxgb`) were alternatives tried earlier.
    transformer_mxgb = Pipeline(steps=[
        ('trans_norm', Normalizer()),
    ])

    # Each route gets its OWN unfitted copies of the shared steps. Reusing the same
    # `imputer` / `feature_engineer` instances would re-fit them in place on every
    # iteration, leaving all stored pipelines with the state of the last route.
    preprocessor_mxgb[route] = Pipeline(steps=[
        ('imputer', clone(imputer)),
        ('feature_engineer', clone(feature_engineer)),
        ('remove_const_attr', RemoveConstantAttribute()),
        ('transformer', transformer_mxgb)
    ])

    ## Training and Validation Set

    data_train_mxgb = data_train_all[data_train_all['route'] == route]

    # Separate target from predictors
    X_mxgb = data_train_mxgb.drop(['duration'], axis=1)
    y_mxgb = data_train_mxgb.duration

    # Divide data into training and validation subsets. A route with a single flight
    # cannot be split, so that flight is used for both training and validation.
    if len(X_mxgb) > 1:
        X_train_full, X_valid_full, y_train_temp, y_valid_temp = train_test_split(
            X_mxgb, y_mxgb, train_size=0.8
        )
    else:
        X_train_full, X_valid_full = X_mxgb, X_mxgb
        y_train_temp, y_valid_temp = y_mxgb, y_mxgb

    # Keep selected columns only
    X_train_mxgb = X_train_full[selected_cols].copy()
    X_valid_mxgb = X_valid_full[selected_cols].copy()
    X_test_mxgb = data_test_all[data_test_all['route'] == route][selected_cols].copy()

    y_train_mxgb[route] = y_train_temp
    y_valid_mxgb[route] = y_valid_temp

    print(f"{route}: train: {len(X_train_mxgb)} - valid: {len(X_valid_mxgb)} - test: {len(X_test_mxgb)}")

    # Fit on the route's training rows only, then reuse the fitted pipeline.
    _X_train_mxgb[route] = preprocessor_mxgb[route].fit_transform(X_train_mxgb, y_train_mxgb[route])
    _X_valid_mxgb[route] = preprocessor_mxgb[route].transform(X_valid_mxgb)

    # Routes absent from the test set get no entry in `_X_test_mxgb`.
    if len(X_test_mxgb) > 0:
        _X_test_mxgb[route] = preprocessor_mxgb[route].transform(X_test_mxgb)

### Training

In [None]:
# Upper bound on the number of cross-validation folds per route.
mxgb_max_cv_folds = 5

# Hyper-parameter grid shared by every route (step name 'mxgb'). It does not depend on
# the route, so it is built once instead of on every loop iteration. Each entry holds a
# single value, i.e. one configuration is evaluated per route.
mxgb_param_grid = {
    'mxgb__booster': ['gbtree'], # 'gbtree', 'dart', 'gblinear'
    'mxgb__n_estimators': [xgb_estimators], # usually between 50 and 1000

    ##### Parameters for Tree Booster (booster = gbtree) ###########

    'mxgb__learning_rate': [0.1], # from 0 to 1
    'mxgb__gamma': [0],
    'mxgb__max_depth': [5],
    'mxgb__min_child_weight': [5], # 1
    'mxgb__max_delta_step': [0],
    'mxgb__subsample': [1],
    'mxgb__sampling_method': ['uniform'], # 'uniform', 'gradient_based'
    'mxgb__colsample_bytree': [1], # from 0 to 1
    'mxgb__colsample_bylevel': [1], # from 0 to 1
    'mxgb__colsample_bynode': [1], # from 0 to 1
    'mxgb__scale_pos_weight': [1],
    'mxgb__grow_policy': ['depthwise'], # 'depthwise', 'lossguide'
    'mxgb__max_leaves': [0],
    'mxgb__max_bin': [256],
    'mxgb__num_parallel_tree': [1],
    'mxgb__refresh_leaf': [1],

    #################################################################

    # Parameters for Dart Booster (booster = dart) ##################

    'mxgb__sample_type': ['uniform'], # 'uniform', 'weighted'
    'mxgb__normalize_type': ['tree'], # 'tree', 'forest'
    'mxgb__rate_drop': [0], # from 0 to 1
    'mxgb__one_drop': [0],
    'mxgb__skip_drop': [0],

    #################################################################

    # Parameters for Linear Booster (booster = gblinear) ############

    'mxgb__feature_selector': ['cyclic'], # 'cyclic', 'shuffle', 'random', 'greedy', 'thrifty'
    'mxgb__top_k': [0]

    #################################################################
}

# One fitted pipeline per route, keyed by the route string.
mxgb_model = dict()

for route in data_train_all['route'].unique():

    print(f"Training route: {route}")

    mxgb = XGBRegressor(
        tree_method= 'hist',
        device="cuda",
        objective='reg:squarederror',
        eval_metric=mean_squared_error,
        early_stopping_rounds=100,
        verbosity = verbose
    )

    mxgb_regressor = Pipeline(
        steps=[
            ('mxgb', mxgb)
        ]
    )

    # The route's own validation split drives early stopping.
    mxgb_fit_params = {
        'mxgb__eval_set': [(_X_valid_mxgb[route], y_valid_mxgb[route])]
    }

    n_train = len(y_train_mxgb[route])

    if n_train < 2:
        # K-fold cross-validation needs at least two samples. Fit the first candidate
        # of every grid entry directly so the route still gets a model.
        mxgb_regressor.set_params(
            **{name: values[0] for name, values in mxgb_param_grid.items()}
        )
        mxgb_regressor.fit(_X_train_mxgb[route], y_train_mxgb[route], **mxgb_fit_params)
        mxgb_model[route] = mxgb_regressor
        continue

    mxgb_gridSearch = GridSearchCV(
        estimator = mxgb_regressor,
        param_grid = mxgb_param_grid,
        scoring='neg_mean_squared_error',
        n_jobs=1,
        refit=True,
        # A fixed cv=5 raises for routes with fewer than 5 training rows.
        cv=min(mxgb_max_cv_folds, n_train),
        verbose=verbose
    )

    mxgb_gridSearch.fit(X=_X_train_mxgb[route], y=y_train_mxgb[route], **mxgb_fit_params)
    mxgb_model[route] = mxgb_gridSearch.best_estimator_

## Test

### ANN

In [None]:
# Predict with the best ANN pipeline; it preprocesses the raw test predictors itself.
y_predict_ann = ann_best_model.predict(X_test, verbose=verbose)

# Pair each prediction with its flight id (the index of X_test) and rename the columns.
submission_ann = pd.DataFrame(y_predict_ann, index=X_test.index).reset_index()
submission_ann.columns = ['ID', 'solution']

# Time-stamped output file so successive runs do not overwrite each other.
ann_csv_path = f"../data/submission/idsc_submission_ann_{datetime.now().strftime('%d-%B-%Ih%Mmin')}.csv"
submission_ann.to_csv(ann_csv_path, index=False)

submission_ann.head(5)


### XGBoost

In [None]:
# Predict with the best XGBoost pipeline on the raw test predictors.
y_predict_xgb = xgb_best_model.predict(X_test)

# Pair each prediction with its flight id (the index of X_test) and rename the columns.
submission_xgb = pd.DataFrame(y_predict_xgb, index=X_test.index).reset_index()
submission_xgb.columns = ['ID', 'solution']

# Time-stamped output file so successive runs do not overwrite each other.
xgb_csv_path = f"../data/submission/idsc_submission_xgb_{datetime.now().strftime('%d-%B-%Ih%Mmin')}.csv"
submission_xgb.to_csv(xgb_csv_path, index=False)

submission_xgb.head(5)

### Multiple XGBoost

In [None]:
# Placeholder column that the loop below fills route by route.
data_test_all.loc[:, 'solution'] = None

for route in data_test_all['route'].unique():
    # A route only has a model and a transformed test matrix when it also appeared in
    # the training data. Check that explicitly instead of using a bare `except`, which
    # would also hide genuine prediction errors behind a silent 0.
    if route in mxgb_model and route in _X_test_mxgb:
        prediction = mxgb_model[route].predict(_X_test_mxgb[route])
    else:
        prediction = 0  # fallback for routes never seen during training

    # Boolean mask selects exactly this route's rows, even if index labels repeat.
    data_test_all.loc[data_test_all['route'] == route, 'solution'] = prediction

In [None]:
# Two-column submission frame: flight identifier and the per-route prediction.
submission_mxgb = data_test_all.reset_index()[['flightid', 'solution']]

# Flat list of names, as in the other submissions. The previous nested list
# ([['ID', 'solution']]) turned the columns into a MultiIndex.
submission_mxgb.columns = ['ID', 'solution']
submission_mxgb.to_csv(f"../data/submission/idsc_submission_mxgb_{datetime.now().strftime('%d-%B-%Ih%Mmin')}.csv", index=False)


submission_mxgb.head(5)