In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import os

pd.set_option('future.no_silent_downcasting', True)

In [2]:
def fix_duration_format(s):
    """Normalise one raw duration value so ``pd.to_timedelta`` can parse it.

    Parameters
    ----------
    s : object
        A single cell of a duration column.

    Returns
    -------
    object
        * ``pd.NaT`` when ``s`` is missing or numeric (Python or NumPy
          scalar), e.g. a numeric fill value;
        * ``"<D> days <rest>"`` when ``s`` is a string whose part before the
          first ``.`` consists only of digits (``"1.02:30:00"`` becomes
          ``"1 days 02:30:00"``);
        * ``s`` unchanged otherwise.
    """
    if pd.isna(s):
        return pd.NaT
    # np.number covers NumPy scalars such as np.int64, which are not
    # instances of the built-in int and would otherwise reach the string
    # test below and raise TypeError.
    if isinstance(s, (int, float, np.number)):
        return pd.NaT
    if not isinstance(s, str):
        # Leave other objects for pd.to_timedelta to accept or reject.
        return s
    days, dot, time = s.partition('.')
    # A digit-only prefix is a day count; "02:30:00.5" keeps its fraction.
    if dot and days.isdigit():
        return f"{int(days)} days {time}"
    return s

In [3]:
# Presumably a sentinel id for unknown categories.
# NOTE(review): not referenced in the cells visible here, where unseen test
# values are encoded as len(<column>_map) instead -- confirm it is still needed.
unk_code = -1

In [4]:
# Load the training split. The data directory defaults to the original local
# path and can be redirected through the FLIGHTRANK_DATA_DIR environment
# variable, so the notebook also runs on machines with a different layout.
train = pd.read_parquet(
    os.environ.get('FLIGHTRANK_DATA_DIR', 'C:/Users/Николай/PycharmProjects/FlightRank_2025/data') + '/train.parquet',
    engine='pyarrow',
)

In [5]:
# Load the test split. The data directory defaults to the original local
# path and can be redirected through the FLIGHTRANK_DATA_DIR environment
# variable, so the notebook also runs on machines with a different layout.
test = pd.read_parquet(
    os.environ.get('FLIGHTRANK_DATA_DIR', 'C:/Users/Николай/PycharmProjects/FlightRank_2025/data') + '/test.parquet',
    engine='pyarrow',
)

In [6]:
# Replace every missing value in both frames with the placeholder -2, in place.
train.fillna(value=-2, inplace=True)
test.fillna(value=-2, inplace=True)

In [7]:
# Drop Id, bySelf and pricingInfo_passengerCount from both frames, in place.
train.drop(columns=['Id', 'bySelf', 'pricingInfo_passengerCount'], inplace=True)
test.drop(columns=['Id', 'bySelf', 'pricingInfo_passengerCount'], inplace=True)

In [8]:
# Label-encode companyID: ids follow first-appearance order in train.
companyID_map = {
    value: code
    for code, value in enumerate(pd.unique(train['companyID']))
}

train['companyID'] = train['companyID'].map(companyID_map).astype('UInt16')
# Test values absent from train all share the spare id len(companyID_map).
test['companyID'] = (
    test['companyID'].map(companyID_map).fillna(len(companyID_map)).astype('UInt16')
)

In [9]:
# Label-encode corporateTariffCode: ids follow first-appearance order in train.
corporateTariffCode_map = {
    value: code
    for code, value in enumerate(pd.unique(train['corporateTariffCode']))
}

train['corporateTariffCode'] = train['corporateTariffCode'].map(corporateTariffCode_map).astype('UInt8')
# Test values absent from train all share the spare id len(corporateTariffCode_map).
test['corporateTariffCode'] = (
    test['corporateTariffCode'].map(corporateTariffCode_map).fillna(len(corporateTariffCode_map)).astype('UInt8')
)

In [10]:
# Label-encode nationality: ids follow first-appearance order in train.
nationality_map = {
    value: code
    for code, value in enumerate(pd.unique(train['nationality']))
}

train['nationality'] = train['nationality'].map(nationality_map).astype('UInt8')
# Test values absent from train all share the spare id len(nationality_map).
test['nationality'] = (
    test['nationality'].map(nationality_map).fillna(len(nationality_map)).astype('UInt8')
)

# CHECK

In [11]:
# Build the frequent-flyer vocabulary from train. A cell holds one or more
# codes separated by '/'. The column is stringified before splitting so that
# non-string cells (e.g. the -2 fill value) contribute their str() form --
# the same key encode_codes looks up -- instead of a NaN entry that no lookup
# can ever hit, which left such rows encoded as "unseen".
values = train['frequentFlyer']
all_codes = values.astype(str).str.split('/').explode()
# Ids follow first-appearance order of the individual codes.
frequentFlyer_mapping = {code: code_id for code_id, code in enumerate(all_codes.unique())}

def encode_codes(s):
    """Translate a '/'-separated code string into '/'-joined integer ids.

    The argument is stringified first; every piece is looked up in
    frequentFlyer_mapping, and pieces that are not present there are
    written as len(frequentFlyer_mapping).
    """
    unseen_id = len(frequentFlyer_mapping)
    encoded = [
        str(frequentFlyer_mapping.get(piece, unseen_id))
        for piece in str(s).split('/')
    ]
    return '/'.join(encoded)

# Apply the encoding to both train and test (with a tqdm progress bar).
train['frequentFlyer'] = train['frequentFlyer'].progress_apply(func=encode_codes)
test['frequentFlyer'] = test['frequentFlyer'].progress_apply(func=encode_codes)

100%|██████████| 18145372/18145372 [00:16<00:00, 1107151.08it/s]
100%|██████████| 6897776/6897776 [00:06<00:00, 996907.58it/s] 


# L_0

In [12]:
# Parse legs0_arrivalAt into pandas datetimes (format left to pandas).
train['legs0_arrivalAt'] = train['legs0_arrivalAt'].pipe(pd.to_datetime)
test['legs0_arrivalAt'] = test['legs0_arrivalAt'].pipe(pd.to_datetime)

In [13]:
# Parse legs0_departureAt into pandas datetimes (format left to pandas).
train['legs0_departureAt'] = train['legs0_departureAt'].pipe(pd.to_datetime)
test['legs0_departureAt'] = test['legs0_departureAt'].pipe(pd.to_datetime)

In [14]:
# Convert legs0_duration to whole minutes (UInt16) via pandas timedeltas;
# fix_duration_format first normalises each raw value.
train['legs0_duration'] = (
    train['legs0_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).astype('UInt16')
)

test['legs0_duration'] = (
    test['legs0_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).astype('UInt16')
)

# L_0_S_0

In [15]:
# Label-encode legs0_segments0_aircraft_code: ids follow first-appearance order in train.
legs0_segments0_aircraft_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_aircraft_code']))
}

train['legs0_segments0_aircraft_code'] = train['legs0_segments0_aircraft_code'].map(legs0_segments0_aircraft_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_aircraft_code'] = (
    test['legs0_segments0_aircraft_code'].map(legs0_segments0_aircraft_code_map).fillna(len(legs0_segments0_aircraft_code_map)).astype('UInt8')
)

In [16]:
# Label-encode legs0_segments0_arrivalTo_airport_city_iata: ids follow first-appearance order in train.
legs0_segments0_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_arrivalTo_airport_city_iata']))
}

train['legs0_segments0_arrivalTo_airport_city_iata'] = train['legs0_segments0_arrivalTo_airport_city_iata'].map(legs0_segments0_arrivalTo_airport_city_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_arrivalTo_airport_city_iata'] = (
    test['legs0_segments0_arrivalTo_airport_city_iata'].map(legs0_segments0_arrivalTo_airport_city_iata_map).fillna(len(legs0_segments0_arrivalTo_airport_city_iata_map)).astype('UInt16')
)

In [17]:
# Label-encode legs0_segments0_arrivalTo_airport_iata: ids follow first-appearance order in train.
legs0_segments0_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_arrivalTo_airport_iata']))
}

train['legs0_segments0_arrivalTo_airport_iata'] = train['legs0_segments0_arrivalTo_airport_iata'].map(legs0_segments0_arrivalTo_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_arrivalTo_airport_iata'] = (
    test['legs0_segments0_arrivalTo_airport_iata'].map(legs0_segments0_arrivalTo_airport_iata_map).fillna(len(legs0_segments0_arrivalTo_airport_iata_map)).astype('UInt16')
)

In [18]:
# Label-encode legs0_segments0_baggageAllowance_quantity: ids follow first-appearance order in train.
legs0_segments0_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_baggageAllowance_quantity']))
}

train['legs0_segments0_baggageAllowance_quantity'] = train['legs0_segments0_baggageAllowance_quantity'].map(legs0_segments0_baggageAllowance_quantity_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_baggageAllowance_quantity'] = (
    test['legs0_segments0_baggageAllowance_quantity'].map(legs0_segments0_baggageAllowance_quantity_map).fillna(len(legs0_segments0_baggageAllowance_quantity_map)).astype('UInt8')
)

In [19]:
# Label-encode legs0_segments0_baggageAllowance_weightMeasurementType: ids follow first-appearance order in train.
legs0_segments0_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_baggageAllowance_weightMeasurementType']))
}

train['legs0_segments0_baggageAllowance_weightMeasurementType'] = train['legs0_segments0_baggageAllowance_weightMeasurementType'].map(legs0_segments0_baggageAllowance_weightMeasurementType_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_baggageAllowance_weightMeasurementType'] = (
    test['legs0_segments0_baggageAllowance_weightMeasurementType'].map(legs0_segments0_baggageAllowance_weightMeasurementType_map).fillna(len(legs0_segments0_baggageAllowance_weightMeasurementType_map)).astype('UInt8')
)

In [20]:
# Label-encode legs0_segments0_cabinClass: ids follow first-appearance order in train.
legs0_segments0_cabinClass_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_cabinClass']))
}

train['legs0_segments0_cabinClass'] = train['legs0_segments0_cabinClass'].map(legs0_segments0_cabinClass_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_cabinClass'] = (
    test['legs0_segments0_cabinClass'].map(legs0_segments0_cabinClass_map).fillna(len(legs0_segments0_cabinClass_map)).astype('UInt8')
)

In [21]:
# Label-encode legs0_segments0_departureFrom_airport_iata: ids follow first-appearance order in train.
legs0_segments0_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_departureFrom_airport_iata']))
}

train['legs0_segments0_departureFrom_airport_iata'] = train['legs0_segments0_departureFrom_airport_iata'].map(legs0_segments0_departureFrom_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_departureFrom_airport_iata'] = (
    test['legs0_segments0_departureFrom_airport_iata'].map(legs0_segments0_departureFrom_airport_iata_map).fillna(len(legs0_segments0_departureFrom_airport_iata_map)).astype('UInt16')
)

In [22]:
# Convert legs0_segments0_duration to whole minutes (UInt16) via pandas
# timedeltas; the raw values are handed to pd.to_timedelta unchanged.
train['legs0_segments0_duration'] = (
    train['legs0_segments0_duration'].pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).astype('UInt16')
)

test['legs0_segments0_duration'] = (
    test['legs0_segments0_duration'].pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).astype('UInt16')
)

In [23]:
# Label-encode legs0_segments0_flightNumber: ids follow first-appearance order in train.
legs0_segments0_flightNumber_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_flightNumber']))
}

train['legs0_segments0_flightNumber'] = train['legs0_segments0_flightNumber'].map(legs0_segments0_flightNumber_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_flightNumber'] = (
    test['legs0_segments0_flightNumber'].map(legs0_segments0_flightNumber_map).fillna(len(legs0_segments0_flightNumber_map)).astype('UInt16')
)

In [24]:
# Label-encode legs0_segments0_marketingCarrier_code: ids follow first-appearance order in train.
legs0_segments0_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_marketingCarrier_code']))
}

train['legs0_segments0_marketingCarrier_code'] = train['legs0_segments0_marketingCarrier_code'].map(legs0_segments0_marketingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_marketingCarrier_code'] = (
    test['legs0_segments0_marketingCarrier_code'].map(legs0_segments0_marketingCarrier_code_map).fillna(len(legs0_segments0_marketingCarrier_code_map)).astype('UInt8')
)

In [25]:
# Label-encode legs0_segments0_operatingCarrier_code: ids follow first-appearance order in train.
legs0_segments0_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_operatingCarrier_code']))
}

train['legs0_segments0_operatingCarrier_code'] = train['legs0_segments0_operatingCarrier_code'].map(legs0_segments0_operatingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_operatingCarrier_code'] = (
    test['legs0_segments0_operatingCarrier_code'].map(legs0_segments0_operatingCarrier_code_map).fillna(len(legs0_segments0_operatingCarrier_code_map)).astype('UInt8')
)

In [26]:
# Label-encode legs0_segments0_seatsAvailable: ids follow first-appearance order in train.
legs0_segments0_seatsAvailable_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments0_seatsAvailable']))
}

train['legs0_segments0_seatsAvailable'] = train['legs0_segments0_seatsAvailable'].map(legs0_segments0_seatsAvailable_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments0_seatsAvailable'] = (
    test['legs0_segments0_seatsAvailable'].map(legs0_segments0_seatsAvailable_map).fillna(len(legs0_segments0_seatsAvailable_map)).astype('UInt8')
)

# L_0_S_1

In [27]:
# Label-encode legs0_segments1_aircraft_code: ids follow first-appearance order in train.
legs0_segments1_aircraft_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_aircraft_code']))
}

train['legs0_segments1_aircraft_code'] = train['legs0_segments1_aircraft_code'].map(legs0_segments1_aircraft_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_aircraft_code'] = (
    test['legs0_segments1_aircraft_code'].map(legs0_segments1_aircraft_code_map).fillna(len(legs0_segments1_aircraft_code_map)).astype('UInt8')
)

In [28]:
# Label-encode legs0_segments1_arrivalTo_airport_city_iata: ids follow first-appearance order in train.
legs0_segments1_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_arrivalTo_airport_city_iata']))
}

train['legs0_segments1_arrivalTo_airport_city_iata'] = train['legs0_segments1_arrivalTo_airport_city_iata'].map(legs0_segments1_arrivalTo_airport_city_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_arrivalTo_airport_city_iata'] = (
    test['legs0_segments1_arrivalTo_airport_city_iata'].map(legs0_segments1_arrivalTo_airport_city_iata_map).fillna(len(legs0_segments1_arrivalTo_airport_city_iata_map)).astype('UInt16')
)

In [29]:
# Label-encode legs0_segments1_arrivalTo_airport_iata: ids follow first-appearance order in train.
legs0_segments1_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_arrivalTo_airport_iata']))
}

train['legs0_segments1_arrivalTo_airport_iata'] = train['legs0_segments1_arrivalTo_airport_iata'].map(legs0_segments1_arrivalTo_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_arrivalTo_airport_iata'] = (
    test['legs0_segments1_arrivalTo_airport_iata'].map(legs0_segments1_arrivalTo_airport_iata_map).fillna(len(legs0_segments1_arrivalTo_airport_iata_map)).astype('UInt16')
)

In [30]:
# Label-encode legs0_segments1_baggageAllowance_quantity: ids follow first-appearance order in train.
legs0_segments1_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_baggageAllowance_quantity']))
}

train['legs0_segments1_baggageAllowance_quantity'] = train['legs0_segments1_baggageAllowance_quantity'].map(legs0_segments1_baggageAllowance_quantity_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_baggageAllowance_quantity'] = (
    test['legs0_segments1_baggageAllowance_quantity'].map(legs0_segments1_baggageAllowance_quantity_map).fillna(len(legs0_segments1_baggageAllowance_quantity_map)).astype('UInt8')
)

In [31]:
# Label-encode legs0_segments1_baggageAllowance_weightMeasurementType: ids follow first-appearance order in train.
legs0_segments1_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_baggageAllowance_weightMeasurementType']))
}

train['legs0_segments1_baggageAllowance_weightMeasurementType'] = train['legs0_segments1_baggageAllowance_weightMeasurementType'].map(legs0_segments1_baggageAllowance_weightMeasurementType_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_baggageAllowance_weightMeasurementType'] = (
    test['legs0_segments1_baggageAllowance_weightMeasurementType'].map(legs0_segments1_baggageAllowance_weightMeasurementType_map).fillna(len(legs0_segments1_baggageAllowance_weightMeasurementType_map)).astype('UInt8')
)

In [32]:
# Label-encode legs0_segments1_cabinClass: ids follow first-appearance order in train.
legs0_segments1_cabinClass_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_cabinClass']))
}

train['legs0_segments1_cabinClass'] = train['legs0_segments1_cabinClass'].map(legs0_segments1_cabinClass_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_cabinClass'] = (
    test['legs0_segments1_cabinClass'].map(legs0_segments1_cabinClass_map).fillna(len(legs0_segments1_cabinClass_map)).astype('UInt8')
)

In [33]:
# Label-encode legs0_segments1_departureFrom_airport_iata: ids follow first-appearance order in train.
legs0_segments1_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_departureFrom_airport_iata']))
}

train['legs0_segments1_departureFrom_airport_iata'] = train['legs0_segments1_departureFrom_airport_iata'].map(legs0_segments1_departureFrom_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_departureFrom_airport_iata'] = (
    test['legs0_segments1_departureFrom_airport_iata'].map(legs0_segments1_departureFrom_airport_iata_map).fillna(len(legs0_segments1_departureFrom_airport_iata_map)).astype('UInt16')
)

In [34]:
# Convert legs0_segments1_duration to whole minutes (UInt16) via pandas
# timedeltas; values that end up missing are stored as 0.
train['legs0_segments1_duration'] = (
    train['legs0_segments1_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).fillna(0).astype('UInt16')
)

test['legs0_segments1_duration'] = (
    test['legs0_segments1_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).fillna(0).astype('UInt16')
)

In [35]:
# Label-encode legs0_segments1_flightNumber: ids follow first-appearance order in train.
legs0_segments1_flightNumber_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_flightNumber']))
}

train['legs0_segments1_flightNumber'] = train['legs0_segments1_flightNumber'].map(legs0_segments1_flightNumber_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_flightNumber'] = (
    test['legs0_segments1_flightNumber'].map(legs0_segments1_flightNumber_map).fillna(len(legs0_segments1_flightNumber_map)).astype('UInt16')
)

In [36]:
# Label-encode legs0_segments1_marketingCarrier_code: ids follow first-appearance order in train.
legs0_segments1_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_marketingCarrier_code']))
}

train['legs0_segments1_marketingCarrier_code'] = train['legs0_segments1_marketingCarrier_code'].map(legs0_segments1_marketingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_marketingCarrier_code'] = (
    test['legs0_segments1_marketingCarrier_code'].map(legs0_segments1_marketingCarrier_code_map).fillna(len(legs0_segments1_marketingCarrier_code_map)).astype('UInt8')
)

In [37]:
# Label-encode legs0_segments1_operatingCarrier_code: ids follow first-appearance order in train.
legs0_segments1_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_operatingCarrier_code']))
}

train['legs0_segments1_operatingCarrier_code'] = train['legs0_segments1_operatingCarrier_code'].map(legs0_segments1_operatingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_operatingCarrier_code'] = (
    test['legs0_segments1_operatingCarrier_code'].map(legs0_segments1_operatingCarrier_code_map).fillna(len(legs0_segments1_operatingCarrier_code_map)).astype('UInt8')
)

In [38]:
# Label-encode legs0_segments1_seatsAvailable: ids follow first-appearance order in train.
legs0_segments1_seatsAvailable_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments1_seatsAvailable']))
}

train['legs0_segments1_seatsAvailable'] = train['legs0_segments1_seatsAvailable'].map(legs0_segments1_seatsAvailable_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments1_seatsAvailable'] = (
    test['legs0_segments1_seatsAvailable'].map(legs0_segments1_seatsAvailable_map).fillna(len(legs0_segments1_seatsAvailable_map)).astype('UInt8')
)

# L_0_S_2

In [39]:
# Label-encode legs0_segments2_aircraft_code: ids follow first-appearance order in train.
legs0_segments2_aircraft_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_aircraft_code']))
}

train['legs0_segments2_aircraft_code'] = train['legs0_segments2_aircraft_code'].map(legs0_segments2_aircraft_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_aircraft_code'] = (
    test['legs0_segments2_aircraft_code'].map(legs0_segments2_aircraft_code_map).fillna(len(legs0_segments2_aircraft_code_map)).astype('UInt8')
)

In [40]:
# Label-encode legs0_segments2_arrivalTo_airport_city_iata: ids follow first-appearance order in train.
legs0_segments2_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_arrivalTo_airport_city_iata']))
}

train['legs0_segments2_arrivalTo_airport_city_iata'] = train['legs0_segments2_arrivalTo_airport_city_iata'].map(legs0_segments2_arrivalTo_airport_city_iata_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_arrivalTo_airport_city_iata'] = (
    test['legs0_segments2_arrivalTo_airport_city_iata'].map(legs0_segments2_arrivalTo_airport_city_iata_map).fillna(len(legs0_segments2_arrivalTo_airport_city_iata_map)).astype('UInt8')
)

In [41]:
# Label-encode legs0_segments2_arrivalTo_airport_iata: ids follow first-appearance order in train.
legs0_segments2_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_arrivalTo_airport_iata']))
}

train['legs0_segments2_arrivalTo_airport_iata'] = train['legs0_segments2_arrivalTo_airport_iata'].map(legs0_segments2_arrivalTo_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_arrivalTo_airport_iata'] = (
    test['legs0_segments2_arrivalTo_airport_iata'].map(legs0_segments2_arrivalTo_airport_iata_map).fillna(len(legs0_segments2_arrivalTo_airport_iata_map)).astype('UInt16')
)

In [42]:
# Label-encode legs0_segments2_baggageAllowance_quantity: ids follow first-appearance order in train.
legs0_segments2_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_baggageAllowance_quantity']))
}

train['legs0_segments2_baggageAllowance_quantity'] = train['legs0_segments2_baggageAllowance_quantity'].map(legs0_segments2_baggageAllowance_quantity_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_baggageAllowance_quantity'] = (
    test['legs0_segments2_baggageAllowance_quantity'].map(legs0_segments2_baggageAllowance_quantity_map).fillna(len(legs0_segments2_baggageAllowance_quantity_map)).astype('UInt8')
)

In [43]:
# Label-encode legs0_segments2_baggageAllowance_weightMeasurementType: ids follow first-appearance order in train.
legs0_segments2_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_baggageAllowance_weightMeasurementType']))
}

train['legs0_segments2_baggageAllowance_weightMeasurementType'] = train['legs0_segments2_baggageAllowance_weightMeasurementType'].map(legs0_segments2_baggageAllowance_weightMeasurementType_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_baggageAllowance_weightMeasurementType'] = (
    test['legs0_segments2_baggageAllowance_weightMeasurementType'].map(legs0_segments2_baggageAllowance_weightMeasurementType_map).fillna(len(legs0_segments2_baggageAllowance_weightMeasurementType_map)).astype('UInt8')
)

In [44]:
# Label-encode legs0_segments2_cabinClass: ids follow first-appearance order in train.
legs0_segments2_cabinClass_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_cabinClass']))
}

train['legs0_segments2_cabinClass'] = train['legs0_segments2_cabinClass'].map(legs0_segments2_cabinClass_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_cabinClass'] = (
    test['legs0_segments2_cabinClass'].map(legs0_segments2_cabinClass_map).fillna(len(legs0_segments2_cabinClass_map)).astype('UInt8')
)

In [45]:
# Label-encode legs0_segments2_departureFrom_airport_iata: ids follow first-appearance order in train.
legs0_segments2_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_departureFrom_airport_iata']))
}

train['legs0_segments2_departureFrom_airport_iata'] = train['legs0_segments2_departureFrom_airport_iata'].map(legs0_segments2_departureFrom_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_departureFrom_airport_iata'] = (
    test['legs0_segments2_departureFrom_airport_iata'].map(legs0_segments2_departureFrom_airport_iata_map).fillna(len(legs0_segments2_departureFrom_airport_iata_map)).astype('UInt16')
)

In [46]:
# Convert legs0_segments2_duration to whole minutes (UInt16) via pandas
# timedeltas; values that end up missing are stored as 0.
train['legs0_segments2_duration'] = (
    train['legs0_segments2_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).fillna(0).astype('UInt16')
)

test['legs0_segments2_duration'] = (
    test['legs0_segments2_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).fillna(0).astype('UInt16')
)

In [47]:
# Label-encode legs0_segments2_flightNumber: ids follow first-appearance order in train.
legs0_segments2_flightNumber_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_flightNumber']))
}

train['legs0_segments2_flightNumber'] = train['legs0_segments2_flightNumber'].map(legs0_segments2_flightNumber_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_flightNumber'] = (
    test['legs0_segments2_flightNumber'].map(legs0_segments2_flightNumber_map).fillna(len(legs0_segments2_flightNumber_map)).astype('UInt16')
)

In [48]:
# Label-encode legs0_segments2_marketingCarrier_code: ids follow first-appearance order in train.
legs0_segments2_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_marketingCarrier_code']))
}

train['legs0_segments2_marketingCarrier_code'] = train['legs0_segments2_marketingCarrier_code'].map(legs0_segments2_marketingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_marketingCarrier_code'] = (
    test['legs0_segments2_marketingCarrier_code'].map(legs0_segments2_marketingCarrier_code_map).fillna(len(legs0_segments2_marketingCarrier_code_map)).astype('UInt8')
)

In [49]:
# Label-encode legs0_segments2_operatingCarrier_code: ids follow first-appearance order in train.
legs0_segments2_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_operatingCarrier_code']))
}

train['legs0_segments2_operatingCarrier_code'] = train['legs0_segments2_operatingCarrier_code'].map(legs0_segments2_operatingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_operatingCarrier_code'] = (
    test['legs0_segments2_operatingCarrier_code'].map(legs0_segments2_operatingCarrier_code_map).fillna(len(legs0_segments2_operatingCarrier_code_map)).astype('UInt8')
)

In [50]:
# Label-encode legs0_segments2_seatsAvailable: ids follow first-appearance order in train.
legs0_segments2_seatsAvailable_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments2_seatsAvailable']))
}

train['legs0_segments2_seatsAvailable'] = train['legs0_segments2_seatsAvailable'].map(legs0_segments2_seatsAvailable_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments2_seatsAvailable'] = (
    test['legs0_segments2_seatsAvailable'].map(legs0_segments2_seatsAvailable_map).fillna(len(legs0_segments2_seatsAvailable_map)).astype('UInt8')
)

# L_0_S_3

In [51]:
# Label-encode legs0_segments3_aircraft_code: ids follow first-appearance order in train.
legs0_segments3_aircraft_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_aircraft_code']))
}

train['legs0_segments3_aircraft_code'] = train['legs0_segments3_aircraft_code'].map(legs0_segments3_aircraft_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_aircraft_code'] = (
    test['legs0_segments3_aircraft_code'].map(legs0_segments3_aircraft_code_map).fillna(len(legs0_segments3_aircraft_code_map)).astype('UInt8')
)

In [52]:
# Label-encode legs0_segments3_arrivalTo_airport_city_iata: ids follow first-appearance order in train.
legs0_segments3_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_arrivalTo_airport_city_iata']))
}

train['legs0_segments3_arrivalTo_airport_city_iata'] = train['legs0_segments3_arrivalTo_airport_city_iata'].map(legs0_segments3_arrivalTo_airport_city_iata_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_arrivalTo_airport_city_iata'] = (
    test['legs0_segments3_arrivalTo_airport_city_iata'].map(legs0_segments3_arrivalTo_airport_city_iata_map).fillna(len(legs0_segments3_arrivalTo_airport_city_iata_map)).astype('UInt8')
)

In [53]:
# Label-encode legs0_segments3_arrivalTo_airport_iata: ids follow first-appearance order in train.
legs0_segments3_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_arrivalTo_airport_iata']))
}

train['legs0_segments3_arrivalTo_airport_iata'] = train['legs0_segments3_arrivalTo_airport_iata'].map(legs0_segments3_arrivalTo_airport_iata_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_arrivalTo_airport_iata'] = (
    test['legs0_segments3_arrivalTo_airport_iata'].map(legs0_segments3_arrivalTo_airport_iata_map).fillna(len(legs0_segments3_arrivalTo_airport_iata_map)).astype('UInt8')
)

In [54]:
# Label-encode legs0_segments3_baggageAllowance_quantity: ids follow first-appearance order in train.
legs0_segments3_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_baggageAllowance_quantity']))
}

train['legs0_segments3_baggageAllowance_quantity'] = train['legs0_segments3_baggageAllowance_quantity'].map(legs0_segments3_baggageAllowance_quantity_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_baggageAllowance_quantity'] = (
    test['legs0_segments3_baggageAllowance_quantity'].map(legs0_segments3_baggageAllowance_quantity_map).fillna(len(legs0_segments3_baggageAllowance_quantity_map)).astype('UInt8')
)

In [55]:
# Label-encode legs0_segments3_baggageAllowance_weightMeasurementType: ids follow first-appearance order in train.
legs0_segments3_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_baggageAllowance_weightMeasurementType']))
}

train['legs0_segments3_baggageAllowance_weightMeasurementType'] = train['legs0_segments3_baggageAllowance_weightMeasurementType'].map(legs0_segments3_baggageAllowance_weightMeasurementType_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_baggageAllowance_weightMeasurementType'] = (
    test['legs0_segments3_baggageAllowance_weightMeasurementType'].map(legs0_segments3_baggageAllowance_weightMeasurementType_map).fillna(len(legs0_segments3_baggageAllowance_weightMeasurementType_map)).astype('UInt8')
)

In [56]:
# Label-encode legs0_segments3_cabinClass: ids follow first-appearance order in train.
legs0_segments3_cabinClass_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_cabinClass']))
}

train['legs0_segments3_cabinClass'] = train['legs0_segments3_cabinClass'].map(legs0_segments3_cabinClass_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_cabinClass'] = (
    test['legs0_segments3_cabinClass'].map(legs0_segments3_cabinClass_map).fillna(len(legs0_segments3_cabinClass_map)).astype('UInt8')
)

In [57]:
# Label-encode legs0_segments3_departureFrom_airport_iata: ids follow first-appearance order in train.
legs0_segments3_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_departureFrom_airport_iata']))
}

train['legs0_segments3_departureFrom_airport_iata'] = train['legs0_segments3_departureFrom_airport_iata'].map(legs0_segments3_departureFrom_airport_iata_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_departureFrom_airport_iata'] = (
    test['legs0_segments3_departureFrom_airport_iata'].map(legs0_segments3_departureFrom_airport_iata_map).fillna(len(legs0_segments3_departureFrom_airport_iata_map)).astype('UInt8')
)

In [58]:
# Convert legs0_segments3_duration to whole minutes (UInt16); rows without a
# usable duration are stored as 0.
train['legs0_segments3_duration'] = pd.to_timedelta(train['legs0_segments3_duration'].apply(fix_duration_format))
train['legs0_segments3_duration'] = (train['legs0_segments3_duration'].dt.total_seconds() // 60).fillna(0).astype('UInt16')

# The test split goes through the same timedelta pipeline as train, so a
# genuine duration string is converted to minutes instead of breaking the
# integer cast. The -2 placeholder is blanked first, whatever its numeric type.
test['legs0_segments3_duration'] = pd.to_timedelta(
    test['legs0_segments3_duration']
    .replace(-2, pd.NA)
    .apply(fix_duration_format)
    .astype(object)  # keep an all-NaT result as plain objects for to_timedelta
)
test['legs0_segments3_duration'] = (test['legs0_segments3_duration'].dt.total_seconds() // 60).fillna(0).astype('UInt16')

In [59]:
# Label-encode legs0_segments3_flightNumber: ids follow first-appearance order in train.
legs0_segments3_flightNumber_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_flightNumber']))
}

train['legs0_segments3_flightNumber'] = train['legs0_segments3_flightNumber'].map(legs0_segments3_flightNumber_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_flightNumber'] = (
    test['legs0_segments3_flightNumber'].map(legs0_segments3_flightNumber_map).fillna(len(legs0_segments3_flightNumber_map)).astype('UInt8')
)

In [60]:
# Label-encode legs0_segments3_marketingCarrier_code: ids follow first-appearance order in train.
legs0_segments3_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_marketingCarrier_code']))
}

train['legs0_segments3_marketingCarrier_code'] = train['legs0_segments3_marketingCarrier_code'].map(legs0_segments3_marketingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_marketingCarrier_code'] = (
    test['legs0_segments3_marketingCarrier_code'].map(legs0_segments3_marketingCarrier_code_map).fillna(len(legs0_segments3_marketingCarrier_code_map)).astype('UInt8')
)

In [61]:
# Label-encode legs0_segments3_operatingCarrier_code: ids follow first-appearance order in train.
legs0_segments3_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_operatingCarrier_code']))
}

train['legs0_segments3_operatingCarrier_code'] = train['legs0_segments3_operatingCarrier_code'].map(legs0_segments3_operatingCarrier_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_operatingCarrier_code'] = (
    test['legs0_segments3_operatingCarrier_code'].map(legs0_segments3_operatingCarrier_code_map).fillna(len(legs0_segments3_operatingCarrier_code_map)).astype('UInt8')
)

In [62]:
# Label-encode legs0_segments3_seatsAvailable: ids follow first-appearance order in train.
legs0_segments3_seatsAvailable_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs0_segments3_seatsAvailable']))
}

train['legs0_segments3_seatsAvailable'] = train['legs0_segments3_seatsAvailable'].map(legs0_segments3_seatsAvailable_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs0_segments3_seatsAvailable'] = (
    test['legs0_segments3_seatsAvailable'].map(legs0_segments3_seatsAvailable_map).fillna(len(legs0_segments3_seatsAvailable_map)).astype('UInt8')
)

# L_1

In [63]:
# Parse legs1_arrivalAt with a fixed format; values that do not match the
# format are coerced to NaT.
train['legs1_arrivalAt'] = train['legs1_arrivalAt'].pipe(
    pd.to_datetime, format='%Y-%m-%dT%H:%M:%S', errors='coerce'
)
test['legs1_arrivalAt'] = test['legs1_arrivalAt'].pipe(
    pd.to_datetime, format='%Y-%m-%dT%H:%M:%S', errors='coerce'
)

In [64]:
# Parse legs1_departureAt with a fixed format; values that do not match the
# format are coerced to NaT.
train['legs1_departureAt'] = train['legs1_departureAt'].pipe(
    pd.to_datetime, format='%Y-%m-%dT%H:%M:%S', errors='coerce'
)
test['legs1_departureAt'] = test['legs1_departureAt'].pipe(
    pd.to_datetime, format='%Y-%m-%dT%H:%M:%S', errors='coerce'
)

In [65]:
# Missing legs1 timestamps are replaced with the placeholder
# 2024-01-01 00:00:00 in both frames.
train['legs1_arrivalAt'] = train['legs1_arrivalAt'].fillna(
    value=pd.Timestamp("2024-01-01 00:00:00")
)
train['legs1_departureAt'] = train['legs1_departureAt'].fillna(
    value=pd.Timestamp("2024-01-01 00:00:00")
)

test['legs1_arrivalAt'] = test['legs1_arrivalAt'].fillna(
    value=pd.Timestamp("2024-01-01 00:00:00")
)
test['legs1_departureAt'] = test['legs1_departureAt'].fillna(
    value=pd.Timestamp("2024-01-01 00:00:00")
)

In [66]:
# Convert legs1_duration to whole minutes (UInt16) via pandas timedeltas;
# values that end up missing are stored as 0.
train['legs1_duration'] = (
    train['legs1_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).fillna(0).astype('UInt16')
)

test['legs1_duration'] = (
    test['legs1_duration'].apply(fix_duration_format).pipe(pd.to_timedelta)
    .dt.total_seconds().floordiv(60).fillna(0).astype('UInt16')
)

# L_1_S_0

In [67]:
# Label-encode legs1_segments0_aircraft_code: ids follow first-appearance order in train.
legs1_segments0_aircraft_code_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs1_segments0_aircraft_code']))
}

train['legs1_segments0_aircraft_code'] = train['legs1_segments0_aircraft_code'].map(legs1_segments0_aircraft_code_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs1_segments0_aircraft_code'] = (
    test['legs1_segments0_aircraft_code'].map(legs1_segments0_aircraft_code_map).fillna(len(legs1_segments0_aircraft_code_map)).astype('UInt8')
)

In [68]:
# Label-encode legs1_segments0_arrivalTo_airport_city_iata: ids follow first-appearance order in train.
legs1_segments0_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs1_segments0_arrivalTo_airport_city_iata']))
}

train['legs1_segments0_arrivalTo_airport_city_iata'] = train['legs1_segments0_arrivalTo_airport_city_iata'].map(legs1_segments0_arrivalTo_airport_city_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs1_segments0_arrivalTo_airport_city_iata'] = (
    test['legs1_segments0_arrivalTo_airport_city_iata'].map(legs1_segments0_arrivalTo_airport_city_iata_map).fillna(len(legs1_segments0_arrivalTo_airport_city_iata_map)).astype('UInt16')
)

In [69]:
# Label-encode legs1_segments0_arrivalTo_airport_iata: ids follow first-appearance order in train.
legs1_segments0_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs1_segments0_arrivalTo_airport_iata']))
}

train['legs1_segments0_arrivalTo_airport_iata'] = train['legs1_segments0_arrivalTo_airport_iata'].map(legs1_segments0_arrivalTo_airport_iata_map).astype('UInt16')
# Test values absent from train all share the spare id len(<map>).
test['legs1_segments0_arrivalTo_airport_iata'] = (
    test['legs1_segments0_arrivalTo_airport_iata'].map(legs1_segments0_arrivalTo_airport_iata_map).fillna(len(legs1_segments0_arrivalTo_airport_iata_map)).astype('UInt16')
)

In [70]:
# Label-encode legs1_segments0_baggageAllowance_quantity: ids follow first-appearance order in train.
legs1_segments0_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs1_segments0_baggageAllowance_quantity']))
}

train['legs1_segments0_baggageAllowance_quantity'] = train['legs1_segments0_baggageAllowance_quantity'].map(legs1_segments0_baggageAllowance_quantity_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs1_segments0_baggageAllowance_quantity'] = (
    test['legs1_segments0_baggageAllowance_quantity'].map(legs1_segments0_baggageAllowance_quantity_map).fillna(len(legs1_segments0_baggageAllowance_quantity_map)).astype('UInt8')
)

In [71]:
# Label-encode legs1_segments0_baggageAllowance_weightMeasurementType: ids follow first-appearance order in train.
legs1_segments0_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(pd.unique(train['legs1_segments0_baggageAllowance_weightMeasurementType']))
}

train['legs1_segments0_baggageAllowance_weightMeasurementType'] = train['legs1_segments0_baggageAllowance_weightMeasurementType'].map(legs1_segments0_baggageAllowance_weightMeasurementType_map).astype('UInt8')
# Test values absent from train all share the spare id len(<map>).
test['legs1_segments0_baggageAllowance_weightMeasurementType'] = (
    test['legs1_segments0_baggageAllowance_weightMeasurementType'].map(legs1_segments0_baggageAllowance_weightMeasurementType_map).fillna(len(legs1_segments0_baggageAllowance_weightMeasurementType_map)).astype('UInt8')
)

In [72]:
# Label-encode legs1_segments0_cabinClass. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments0_cabinClass_map = {
    value: code
    for code, value in enumerate(train['legs1_segments0_cabinClass'].unique())
}

train['legs1_segments0_cabinClass'] = (
    train['legs1_segments0_cabinClass']
    .map(legs1_segments0_cabinClass_map)
    .astype('UInt8')
)
test['legs1_segments0_cabinClass'] = (
    test['legs1_segments0_cabinClass']
    .map(legs1_segments0_cabinClass_map)
    .fillna(len(legs1_segments0_cabinClass_map))
    .astype('UInt8')
)

In [73]:
# Label-encode legs1_segments0_departureFrom_airport_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments0_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments0_departureFrom_airport_iata'].unique())
}

train['legs1_segments0_departureFrom_airport_iata'] = (
    train['legs1_segments0_departureFrom_airport_iata']
    .map(legs1_segments0_departureFrom_airport_iata_map)
    .astype('UInt16')
)
test['legs1_segments0_departureFrom_airport_iata'] = (
    test['legs1_segments0_departureFrom_airport_iata']
    .map(legs1_segments0_departureFrom_airport_iata_map)
    .fillna(len(legs1_segments0_departureFrom_airport_iata_map))
    .astype('UInt16')
)

In [74]:
# Convert legs1_segments0_duration to whole minutes. fix_duration_format
# rewrites a leading "D." day count as "D days ..." and turns missing/numeric
# placeholders into NaT; durations that end up missing are stored as 0.
train['legs1_segments0_duration'] = (
    pd.to_timedelta(train['legs1_segments0_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

test['legs1_segments0_duration'] = (
    pd.to_timedelta(test['legs1_segments0_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

In [75]:
# Label-encode legs1_segments0_flightNumber. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments0_flightNumber_map = {
    value: code
    for code, value in enumerate(train['legs1_segments0_flightNumber'].unique())
}

train['legs1_segments0_flightNumber'] = (
    train['legs1_segments0_flightNumber']
    .map(legs1_segments0_flightNumber_map)
    .astype('UInt16')
)
test['legs1_segments0_flightNumber'] = (
    test['legs1_segments0_flightNumber']
    .map(legs1_segments0_flightNumber_map)
    .fillna(len(legs1_segments0_flightNumber_map))
    .astype('UInt16')
)

In [76]:
# Label-encode legs1_segments0_marketingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments0_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments0_marketingCarrier_code'].unique())
}

train['legs1_segments0_marketingCarrier_code'] = (
    train['legs1_segments0_marketingCarrier_code']
    .map(legs1_segments0_marketingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments0_marketingCarrier_code'] = (
    test['legs1_segments0_marketingCarrier_code']
    .map(legs1_segments0_marketingCarrier_code_map)
    .fillna(len(legs1_segments0_marketingCarrier_code_map))
    .astype('UInt8')
)

In [77]:
# Label-encode legs1_segments0_operatingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments0_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments0_operatingCarrier_code'].unique())
}

train['legs1_segments0_operatingCarrier_code'] = (
    train['legs1_segments0_operatingCarrier_code']
    .map(legs1_segments0_operatingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments0_operatingCarrier_code'] = (
    test['legs1_segments0_operatingCarrier_code']
    .map(legs1_segments0_operatingCarrier_code_map)
    .fillna(len(legs1_segments0_operatingCarrier_code_map))
    .astype('UInt8')
)

In [78]:
# Label-encode legs1_segments0_seatsAvailable. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments0_seatsAvailable_map = {
    value: code
    for code, value in enumerate(train['legs1_segments0_seatsAvailable'].unique())
}

train['legs1_segments0_seatsAvailable'] = (
    train['legs1_segments0_seatsAvailable']
    .map(legs1_segments0_seatsAvailable_map)
    .astype('UInt8')
)
test['legs1_segments0_seatsAvailable'] = (
    test['legs1_segments0_seatsAvailable']
    .map(legs1_segments0_seatsAvailable_map)
    .fillna(len(legs1_segments0_seatsAvailable_map))
    .astype('UInt8')
)

# Leg 1, segment 1 (L_1_S_1)

In [79]:
# Label-encode legs1_segments1_aircraft_code. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments1_aircraft_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_aircraft_code'].unique())
}

train['legs1_segments1_aircraft_code'] = (
    train['legs1_segments1_aircraft_code']
    .map(legs1_segments1_aircraft_code_map)
    .astype('UInt8')
)
test['legs1_segments1_aircraft_code'] = (
    test['legs1_segments1_aircraft_code']
    .map(legs1_segments1_aircraft_code_map)
    .fillna(len(legs1_segments1_aircraft_code_map))
    .astype('UInt8')
)

In [80]:
# Label-encode legs1_segments1_arrivalTo_airport_city_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments1_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_arrivalTo_airport_city_iata'].unique())
}

train['legs1_segments1_arrivalTo_airport_city_iata'] = (
    train['legs1_segments1_arrivalTo_airport_city_iata']
    .map(legs1_segments1_arrivalTo_airport_city_iata_map)
    .astype('UInt8')
)
test['legs1_segments1_arrivalTo_airport_city_iata'] = (
    test['legs1_segments1_arrivalTo_airport_city_iata']
    .map(legs1_segments1_arrivalTo_airport_city_iata_map)
    .fillna(len(legs1_segments1_arrivalTo_airport_city_iata_map))
    .astype('UInt8')
)

In [81]:
# Label-encode legs1_segments1_arrivalTo_airport_iata. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments1_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_arrivalTo_airport_iata'].unique())
}

train['legs1_segments1_arrivalTo_airport_iata'] = (
    train['legs1_segments1_arrivalTo_airport_iata']
    .map(legs1_segments1_arrivalTo_airport_iata_map)
    .astype('UInt8')
)
test['legs1_segments1_arrivalTo_airport_iata'] = (
    test['legs1_segments1_arrivalTo_airport_iata']
    .map(legs1_segments1_arrivalTo_airport_iata_map)
    .fillna(len(legs1_segments1_arrivalTo_airport_iata_map))
    .astype('UInt8')
)

In [82]:
# Label-encode legs1_segments1_baggageAllowance_quantity. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments1_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_baggageAllowance_quantity'].unique())
}

train['legs1_segments1_baggageAllowance_quantity'] = (
    train['legs1_segments1_baggageAllowance_quantity']
    .map(legs1_segments1_baggageAllowance_quantity_map)
    .astype('UInt8')
)
test['legs1_segments1_baggageAllowance_quantity'] = (
    test['legs1_segments1_baggageAllowance_quantity']
    .map(legs1_segments1_baggageAllowance_quantity_map)
    .fillna(len(legs1_segments1_baggageAllowance_quantity_map))
    .astype('UInt8')
)

In [83]:
# Label-encode legs1_segments1_baggageAllowance_weightMeasurementType. Codes
# follow the order in which values first appear in train; test values missing
# from the mapping get the extra code len(map).
legs1_segments1_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_baggageAllowance_weightMeasurementType'].unique())
}

train['legs1_segments1_baggageAllowance_weightMeasurementType'] = (
    train['legs1_segments1_baggageAllowance_weightMeasurementType']
    .map(legs1_segments1_baggageAllowance_weightMeasurementType_map)
    .astype('UInt8')
)
test['legs1_segments1_baggageAllowance_weightMeasurementType'] = (
    test['legs1_segments1_baggageAllowance_weightMeasurementType']
    .map(legs1_segments1_baggageAllowance_weightMeasurementType_map)
    .fillna(len(legs1_segments1_baggageAllowance_weightMeasurementType_map))
    .astype('UInt8')
)

In [84]:
# Label-encode legs1_segments1_cabinClass. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments1_cabinClass_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_cabinClass'].unique())
}

train['legs1_segments1_cabinClass'] = (
    train['legs1_segments1_cabinClass']
    .map(legs1_segments1_cabinClass_map)
    .astype('UInt8')
)
test['legs1_segments1_cabinClass'] = (
    test['legs1_segments1_cabinClass']
    .map(legs1_segments1_cabinClass_map)
    .fillna(len(legs1_segments1_cabinClass_map))
    .astype('UInt8')
)

In [85]:
# Label-encode legs1_segments1_departureFrom_airport_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments1_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_departureFrom_airport_iata'].unique())
}

train['legs1_segments1_departureFrom_airport_iata'] = (
    train['legs1_segments1_departureFrom_airport_iata']
    .map(legs1_segments1_departureFrom_airport_iata_map)
    .astype('UInt16')
)
test['legs1_segments1_departureFrom_airport_iata'] = (
    test['legs1_segments1_departureFrom_airport_iata']
    .map(legs1_segments1_departureFrom_airport_iata_map)
    .fillna(len(legs1_segments1_departureFrom_airport_iata_map))
    .astype('UInt16')
)

In [86]:
# Convert legs1_segments1_duration to whole minutes. fix_duration_format
# rewrites a leading "D." day count as "D days ..." and turns missing/numeric
# placeholders into NaT; durations that end up missing are stored as 0.
train['legs1_segments1_duration'] = (
    pd.to_timedelta(train['legs1_segments1_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

test['legs1_segments1_duration'] = (
    pd.to_timedelta(test['legs1_segments1_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

In [87]:
# Label-encode legs1_segments1_flightNumber. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments1_flightNumber_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_flightNumber'].unique())
}

train['legs1_segments1_flightNumber'] = (
    train['legs1_segments1_flightNumber']
    .map(legs1_segments1_flightNumber_map)
    .astype('UInt16')
)
test['legs1_segments1_flightNumber'] = (
    test['legs1_segments1_flightNumber']
    .map(legs1_segments1_flightNumber_map)
    .fillna(len(legs1_segments1_flightNumber_map))
    .astype('UInt16')
)

In [88]:
# Label-encode legs1_segments1_marketingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments1_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_marketingCarrier_code'].unique())
}

train['legs1_segments1_marketingCarrier_code'] = (
    train['legs1_segments1_marketingCarrier_code']
    .map(legs1_segments1_marketingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments1_marketingCarrier_code'] = (
    test['legs1_segments1_marketingCarrier_code']
    .map(legs1_segments1_marketingCarrier_code_map)
    .fillna(len(legs1_segments1_marketingCarrier_code_map))
    .astype('UInt8')
)

In [89]:
# Label-encode legs1_segments1_operatingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments1_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_operatingCarrier_code'].unique())
}

train['legs1_segments1_operatingCarrier_code'] = (
    train['legs1_segments1_operatingCarrier_code']
    .map(legs1_segments1_operatingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments1_operatingCarrier_code'] = (
    test['legs1_segments1_operatingCarrier_code']
    .map(legs1_segments1_operatingCarrier_code_map)
    .fillna(len(legs1_segments1_operatingCarrier_code_map))
    .astype('UInt8')
)

In [90]:
# Label-encode legs1_segments1_seatsAvailable. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments1_seatsAvailable_map = {
    value: code
    for code, value in enumerate(train['legs1_segments1_seatsAvailable'].unique())
}

train['legs1_segments1_seatsAvailable'] = (
    train['legs1_segments1_seatsAvailable']
    .map(legs1_segments1_seatsAvailable_map)
    .astype('UInt8')
)
test['legs1_segments1_seatsAvailable'] = (
    test['legs1_segments1_seatsAvailable']
    .map(legs1_segments1_seatsAvailable_map)
    .fillna(len(legs1_segments1_seatsAvailable_map))
    .astype('UInt8')
)

# Leg 1, segment 2 (L_1_S_2)

In [91]:
# Label-encode legs1_segments2_aircraft_code. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments2_aircraft_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_aircraft_code'].unique())
}

train['legs1_segments2_aircraft_code'] = (
    train['legs1_segments2_aircraft_code']
    .map(legs1_segments2_aircraft_code_map)
    .astype('UInt8')
)
test['legs1_segments2_aircraft_code'] = (
    test['legs1_segments2_aircraft_code']
    .map(legs1_segments2_aircraft_code_map)
    .fillna(len(legs1_segments2_aircraft_code_map))
    .astype('UInt8')
)

In [92]:
# Label-encode legs1_segments2_arrivalTo_airport_city_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments2_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_arrivalTo_airport_city_iata'].unique())
}

train['legs1_segments2_arrivalTo_airport_city_iata'] = (
    train['legs1_segments2_arrivalTo_airport_city_iata']
    .map(legs1_segments2_arrivalTo_airport_city_iata_map)
    .astype('UInt8')
)
test['legs1_segments2_arrivalTo_airport_city_iata'] = (
    test['legs1_segments2_arrivalTo_airport_city_iata']
    .map(legs1_segments2_arrivalTo_airport_city_iata_map)
    .fillna(len(legs1_segments2_arrivalTo_airport_city_iata_map))
    .astype('UInt8')
)

In [93]:
# Label-encode legs1_segments2_arrivalTo_airport_iata. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments2_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_arrivalTo_airport_iata'].unique())
}

train['legs1_segments2_arrivalTo_airport_iata'] = (
    train['legs1_segments2_arrivalTo_airport_iata']
    .map(legs1_segments2_arrivalTo_airport_iata_map)
    .astype('UInt8')
)
test['legs1_segments2_arrivalTo_airport_iata'] = (
    test['legs1_segments2_arrivalTo_airport_iata']
    .map(legs1_segments2_arrivalTo_airport_iata_map)
    .fillna(len(legs1_segments2_arrivalTo_airport_iata_map))
    .astype('UInt8')
)

In [94]:
# NOTE(review): test-only remap of quantity 45.0 -> 50.0 before encoding;
# presumably 45.0 never occurs in train for this column — confirm.
test['legs1_segments2_baggageAllowance_quantity'] = (
    test['legs1_segments2_baggageAllowance_quantity'].replace(to_replace=45.0, value=50.0)
)

# Label-encode legs1_segments2_baggageAllowance_quantity. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments2_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_baggageAllowance_quantity'].unique())
}

train['legs1_segments2_baggageAllowance_quantity'] = (
    train['legs1_segments2_baggageAllowance_quantity']
    .map(legs1_segments2_baggageAllowance_quantity_map)
    .astype('UInt8')
)
test['legs1_segments2_baggageAllowance_quantity'] = (
    test['legs1_segments2_baggageAllowance_quantity']
    .map(legs1_segments2_baggageAllowance_quantity_map)
    .fillna(len(legs1_segments2_baggageAllowance_quantity_map))
    .astype('UInt8')
)

In [95]:
# Label-encode legs1_segments2_baggageAllowance_weightMeasurementType. Codes
# follow the order in which values first appear in train; test values missing
# from the mapping get the extra code len(map).
legs1_segments2_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_baggageAllowance_weightMeasurementType'].unique())
}

train['legs1_segments2_baggageAllowance_weightMeasurementType'] = (
    train['legs1_segments2_baggageAllowance_weightMeasurementType']
    .map(legs1_segments2_baggageAllowance_weightMeasurementType_map)
    .astype('UInt8')
)
test['legs1_segments2_baggageAllowance_weightMeasurementType'] = (
    test['legs1_segments2_baggageAllowance_weightMeasurementType']
    .map(legs1_segments2_baggageAllowance_weightMeasurementType_map)
    .fillna(len(legs1_segments2_baggageAllowance_weightMeasurementType_map))
    .astype('UInt8')
)

In [96]:
# Label-encode legs1_segments2_cabinClass. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments2_cabinClass_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_cabinClass'].unique())
}

train['legs1_segments2_cabinClass'] = (
    train['legs1_segments2_cabinClass']
    .map(legs1_segments2_cabinClass_map)
    .astype('UInt8')
)
test['legs1_segments2_cabinClass'] = (
    test['legs1_segments2_cabinClass']
    .map(legs1_segments2_cabinClass_map)
    .fillna(len(legs1_segments2_cabinClass_map))
    .astype('UInt8')
)

In [97]:
# Label-encode legs1_segments2_departureFrom_airport_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments2_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_departureFrom_airport_iata'].unique())
}

train['legs1_segments2_departureFrom_airport_iata'] = (
    train['legs1_segments2_departureFrom_airport_iata']
    .map(legs1_segments2_departureFrom_airport_iata_map)
    .astype('UInt8')
)
test['legs1_segments2_departureFrom_airport_iata'] = (
    test['legs1_segments2_departureFrom_airport_iata']
    .map(legs1_segments2_departureFrom_airport_iata_map)
    .fillna(len(legs1_segments2_departureFrom_airport_iata_map))
    .astype('UInt8')
)

In [98]:
# Convert legs1_segments2_duration to whole minutes. fix_duration_format
# rewrites a leading "D." day count as "D days ..." and turns missing/numeric
# placeholders into NaT; durations that end up missing are stored as 0.
train['legs1_segments2_duration'] = (
    pd.to_timedelta(train['legs1_segments2_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

test['legs1_segments2_duration'] = (
    pd.to_timedelta(test['legs1_segments2_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

In [99]:
# Label-encode legs1_segments2_flightNumber. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments2_flightNumber_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_flightNumber'].unique())
}

train['legs1_segments2_flightNumber'] = (
    train['legs1_segments2_flightNumber']
    .map(legs1_segments2_flightNumber_map)
    .astype('UInt16')
)
test['legs1_segments2_flightNumber'] = (
    test['legs1_segments2_flightNumber']
    .map(legs1_segments2_flightNumber_map)
    .fillna(len(legs1_segments2_flightNumber_map))
    .astype('UInt16')
)

In [100]:
# Label-encode legs1_segments2_marketingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments2_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_marketingCarrier_code'].unique())
}

train['legs1_segments2_marketingCarrier_code'] = (
    train['legs1_segments2_marketingCarrier_code']
    .map(legs1_segments2_marketingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments2_marketingCarrier_code'] = (
    test['legs1_segments2_marketingCarrier_code']
    .map(legs1_segments2_marketingCarrier_code_map)
    .fillna(len(legs1_segments2_marketingCarrier_code_map))
    .astype('UInt8')
)

In [101]:
# Label-encode legs1_segments2_operatingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments2_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_operatingCarrier_code'].unique())
}

train['legs1_segments2_operatingCarrier_code'] = (
    train['legs1_segments2_operatingCarrier_code']
    .map(legs1_segments2_operatingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments2_operatingCarrier_code'] = (
    test['legs1_segments2_operatingCarrier_code']
    .map(legs1_segments2_operatingCarrier_code_map)
    .fillna(len(legs1_segments2_operatingCarrier_code_map))
    .astype('UInt8')
)

In [102]:
# Label-encode legs1_segments2_seatsAvailable. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments2_seatsAvailable_map = {
    value: code
    for code, value in enumerate(train['legs1_segments2_seatsAvailable'].unique())
}

train['legs1_segments2_seatsAvailable'] = (
    train['legs1_segments2_seatsAvailable']
    .map(legs1_segments2_seatsAvailable_map)
    .astype('UInt8')
)
test['legs1_segments2_seatsAvailable'] = (
    test['legs1_segments2_seatsAvailable']
    .map(legs1_segments2_seatsAvailable_map)
    .fillna(len(legs1_segments2_seatsAvailable_map))
    .astype('UInt8')
)

# Leg 1, segment 3 (L_1_S_3)

In [103]:
# Label-encode legs1_segments3_aircraft_code. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments3_aircraft_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_aircraft_code'].unique())
}

train['legs1_segments3_aircraft_code'] = (
    train['legs1_segments3_aircraft_code']
    .map(legs1_segments3_aircraft_code_map)
    .astype('UInt8')
)
test['legs1_segments3_aircraft_code'] = (
    test['legs1_segments3_aircraft_code']
    .map(legs1_segments3_aircraft_code_map)
    .fillna(len(legs1_segments3_aircraft_code_map))
    .astype('UInt8')
)

In [104]:
# Label-encode legs1_segments3_arrivalTo_airport_city_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments3_arrivalTo_airport_city_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_arrivalTo_airport_city_iata'].unique())
}

train['legs1_segments3_arrivalTo_airport_city_iata'] = (
    train['legs1_segments3_arrivalTo_airport_city_iata']
    .map(legs1_segments3_arrivalTo_airport_city_iata_map)
    .astype('UInt8')
)
test['legs1_segments3_arrivalTo_airport_city_iata'] = (
    test['legs1_segments3_arrivalTo_airport_city_iata']
    .map(legs1_segments3_arrivalTo_airport_city_iata_map)
    .fillna(len(legs1_segments3_arrivalTo_airport_city_iata_map))
    .astype('UInt8')
)

In [105]:
# Label-encode legs1_segments3_arrivalTo_airport_iata. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments3_arrivalTo_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_arrivalTo_airport_iata'].unique())
}

train['legs1_segments3_arrivalTo_airport_iata'] = (
    train['legs1_segments3_arrivalTo_airport_iata']
    .map(legs1_segments3_arrivalTo_airport_iata_map)
    .astype('UInt8')
)
test['legs1_segments3_arrivalTo_airport_iata'] = (
    test['legs1_segments3_arrivalTo_airport_iata']
    .map(legs1_segments3_arrivalTo_airport_iata_map)
    .fillna(len(legs1_segments3_arrivalTo_airport_iata_map))
    .astype('UInt8')
)

In [106]:
# Label-encode legs1_segments3_baggageAllowance_quantity. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments3_baggageAllowance_quantity_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_baggageAllowance_quantity'].unique())
}

train['legs1_segments3_baggageAllowance_quantity'] = (
    train['legs1_segments3_baggageAllowance_quantity']
    .map(legs1_segments3_baggageAllowance_quantity_map)
    .astype('UInt8')
)
test['legs1_segments3_baggageAllowance_quantity'] = (
    test['legs1_segments3_baggageAllowance_quantity']
    .map(legs1_segments3_baggageAllowance_quantity_map)
    .fillna(len(legs1_segments3_baggageAllowance_quantity_map))
    .astype('UInt8')
)

In [107]:
# Label-encode legs1_segments3_baggageAllowance_weightMeasurementType. Codes
# follow the order in which values first appear in train; test values missing
# from the mapping get the extra code len(map).
legs1_segments3_baggageAllowance_weightMeasurementType_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_baggageAllowance_weightMeasurementType'].unique())
}

train['legs1_segments3_baggageAllowance_weightMeasurementType'] = (
    train['legs1_segments3_baggageAllowance_weightMeasurementType']
    .map(legs1_segments3_baggageAllowance_weightMeasurementType_map)
    .astype('UInt8')
)
test['legs1_segments3_baggageAllowance_weightMeasurementType'] = (
    test['legs1_segments3_baggageAllowance_weightMeasurementType']
    .map(legs1_segments3_baggageAllowance_weightMeasurementType_map)
    .fillna(len(legs1_segments3_baggageAllowance_weightMeasurementType_map))
    .astype('UInt8')
)

In [108]:
# Label-encode legs1_segments3_cabinClass. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments3_cabinClass_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_cabinClass'].unique())
}

train['legs1_segments3_cabinClass'] = (
    train['legs1_segments3_cabinClass']
    .map(legs1_segments3_cabinClass_map)
    .astype('UInt8')
)
test['legs1_segments3_cabinClass'] = (
    test['legs1_segments3_cabinClass']
    .map(legs1_segments3_cabinClass_map)
    .fillna(len(legs1_segments3_cabinClass_map))
    .astype('UInt8')
)

In [109]:
# Label-encode legs1_segments3_departureFrom_airport_iata. Codes follow the
# order in which values first appear in train; test values missing from the
# mapping get the extra code len(map).
legs1_segments3_departureFrom_airport_iata_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_departureFrom_airport_iata'].unique())
}

train['legs1_segments3_departureFrom_airport_iata'] = (
    train['legs1_segments3_departureFrom_airport_iata']
    .map(legs1_segments3_departureFrom_airport_iata_map)
    .astype('UInt8')
)
test['legs1_segments3_departureFrom_airport_iata'] = (
    test['legs1_segments3_departureFrom_airport_iata']
    .map(legs1_segments3_departureFrom_airport_iata_map)
    .fillna(len(legs1_segments3_departureFrom_airport_iata_map))
    .astype('UInt8')
)

In [110]:
# Convert legs1_segments3_duration in train to whole minutes.
# fix_duration_format rewrites a leading "D." day count as "D days ..." and
# turns missing/numeric placeholders into NaT; missing durations become 0.
train['legs1_segments3_duration'] = (
    pd.to_timedelta(train['legs1_segments3_duration'].apply(fix_duration_format))
    .dt.total_seconds()
    .floordiv(60)
    .fillna(0)
    .astype('UInt16')
)

# NOTE(review): unlike train, the test column is not parsed as a duration —
# the -2 placeholder is just turned into 0. Presumably this column holds no
# real durations in test; any duration string here would make astype fail —
# confirm.
test['legs1_segments3_duration'] = (
    test['legs1_segments3_duration']
    .replace(to_replace=-2, value=pd.NA)
    .fillna(0)
    .astype('UInt16')
)

In [111]:
# Label-encode legs1_segments3_flightNumber. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments3_flightNumber_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_flightNumber'].unique())
}

train['legs1_segments3_flightNumber'] = (
    train['legs1_segments3_flightNumber']
    .map(legs1_segments3_flightNumber_map)
    .astype('UInt8')
)
test['legs1_segments3_flightNumber'] = (
    test['legs1_segments3_flightNumber']
    .map(legs1_segments3_flightNumber_map)
    .fillna(len(legs1_segments3_flightNumber_map))
    .astype('UInt8')
)

In [112]:
# Label-encode legs1_segments3_marketingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments3_marketingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_marketingCarrier_code'].unique())
}

train['legs1_segments3_marketingCarrier_code'] = (
    train['legs1_segments3_marketingCarrier_code']
    .map(legs1_segments3_marketingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments3_marketingCarrier_code'] = (
    test['legs1_segments3_marketingCarrier_code']
    .map(legs1_segments3_marketingCarrier_code_map)
    .fillna(len(legs1_segments3_marketingCarrier_code_map))
    .astype('UInt8')
)

In [113]:
# Label-encode legs1_segments3_operatingCarrier_code. Codes follow the order
# in which values first appear in train; test values missing from the mapping
# get the extra code len(map).
legs1_segments3_operatingCarrier_code_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_operatingCarrier_code'].unique())
}

train['legs1_segments3_operatingCarrier_code'] = (
    train['legs1_segments3_operatingCarrier_code']
    .map(legs1_segments3_operatingCarrier_code_map)
    .astype('UInt8')
)
test['legs1_segments3_operatingCarrier_code'] = (
    test['legs1_segments3_operatingCarrier_code']
    .map(legs1_segments3_operatingCarrier_code_map)
    .fillna(len(legs1_segments3_operatingCarrier_code_map))
    .astype('UInt8')
)

In [114]:
# Label-encode legs1_segments3_seatsAvailable. Codes follow the order in which
# values first appear in train; test values missing from the mapping get the
# extra code len(map).
legs1_segments3_seatsAvailable_map = {
    value: code
    for code, value in enumerate(train['legs1_segments3_seatsAvailable'].unique())
}

train['legs1_segments3_seatsAvailable'] = (
    train['legs1_segments3_seatsAvailable']
    .map(legs1_segments3_seatsAvailable_map)
    .astype('UInt8')
)
test['legs1_segments3_seatsAvailable'] = (
    test['legs1_segments3_seatsAvailable']
    .map(legs1_segments3_seatsAvailable_map)
    .fillna(len(legs1_segments3_seatsAvailable_map))
    .astype('UInt8')
)

# Others

In [115]:
# miniRules0_monetaryAmount: turn the -2 missing-value placeholder into 0 and
# store the amounts as unsigned 32-bit integers.
train['miniRules0_monetaryAmount'] = (
    train['miniRules0_monetaryAmount']
    .replace(to_replace=-2, value=pd.NA)
    .fillna(0)
    .astype('UInt32')
)
test['miniRules0_monetaryAmount'] = (
    test['miniRules0_monetaryAmount']
    .replace(to_replace=-2, value=pd.NA)
    .fillna(0)
    .astype('UInt32')
)

In [116]:
# Label-encode miniRules0_percentage. Codes follow the order in which values
# first appear in train.
miniRules0_percentage_map = {
    value: code
    for code, value in enumerate(train['miniRules0_percentage'].unique())
}

train['miniRules0_percentage'] = (
    train['miniRules0_percentage']
    .map(miniRules0_percentage_map)
    .astype('UInt8')
)
# Test values missing from the mapping get the extra code len(map), matching
# the other encoded columns in this notebook, instead of turning into <NA>.
test['miniRules0_percentage'] = (
    test['miniRules0_percentage']
    .map(miniRules0_percentage_map)
    .fillna(len(miniRules0_percentage_map))
    .astype('UInt8')
)

In [117]:
# Label-encode miniRules0_statusInfos. Codes follow the order in which values
# first appear in train.
miniRules0_statusInfos_map = {
    value: code
    for code, value in enumerate(train['miniRules0_statusInfos'].unique())
}

train['miniRules0_statusInfos'] = (
    train['miniRules0_statusInfos']
    .map(miniRules0_statusInfos_map)
    .astype('UInt8')
)
# Test values missing from the mapping get the extra code len(map), matching
# the other encoded columns in this notebook, instead of turning into <NA>.
test['miniRules0_statusInfos'] = (
    test['miniRules0_statusInfos']
    .map(miniRules0_statusInfos_map)
    .fillna(len(miniRules0_statusInfos_map))
    .astype('UInt8')
)

In [118]:
# miniRules1_monetaryAmount: turn the -2 missing-value placeholder into 0 and
# store the amounts as unsigned 32-bit integers.
train['miniRules1_monetaryAmount'] = (
    train['miniRules1_monetaryAmount']
    .replace(to_replace=-2, value=pd.NA)
    .fillna(0)
    .astype('UInt32')
)
test['miniRules1_monetaryAmount'] = (
    test['miniRules1_monetaryAmount']
    .replace(to_replace=-2, value=pd.NA)
    .fillna(0)
    .astype('UInt32')
)

In [119]:
# Label-encode miniRules1_percentage. Codes follow the order in which values
# first appear in train.
miniRules1_percentage_map = {
    value: code
    for code, value in enumerate(train['miniRules1_percentage'].unique())
}

train['miniRules1_percentage'] = (
    train['miniRules1_percentage']
    .map(miniRules1_percentage_map)
    .astype('UInt8')
)
# Test values missing from the mapping get the extra code len(map), matching
# the other encoded columns in this notebook, instead of turning into <NA>.
test['miniRules1_percentage'] = (
    test['miniRules1_percentage']
    .map(miniRules1_percentage_map)
    .fillna(len(miniRules1_percentage_map))
    .astype('UInt8')
)

In [120]:
# Label-encode miniRules1_statusInfos. Codes follow the order in which values
# first appear in train.
miniRules1_statusInfos_map = {
    value: code
    for code, value in enumerate(train['miniRules1_statusInfos'].unique())
}

train['miniRules1_statusInfos'] = (
    train['miniRules1_statusInfos']
    .map(miniRules1_statusInfos_map)
    .astype('UInt8')
)
# Test values missing from the mapping get the extra code len(map), matching
# the other encoded columns in this notebook, instead of turning into <NA>.
test['miniRules1_statusInfos'] = (
    test['miniRules1_statusInfos']
    .map(miniRules1_statusInfos_map)
    .fillna(len(miniRules1_statusInfos_map))
    .astype('UInt8')
)

In [121]:
# Label-encode pricingInfo_isAccessTP. Codes follow the order in which values
# first appear in train; test values missing from the mapping get the extra
# code len(map).
pricingInfo_isAccessTP_map = {
    value: code
    for code, value in enumerate(train['pricingInfo_isAccessTP'].unique())
}

train['pricingInfo_isAccessTP'] = (
    train['pricingInfo_isAccessTP']
    .map(pricingInfo_isAccessTP_map)
    .astype('UInt8')
)
test['pricingInfo_isAccessTP'] = (
    test['pricingInfo_isAccessTP']
    .map(pricingInfo_isAccessTP_map)
    .fillna(len(pricingInfo_isAccessTP_map))
    .astype('UInt8')
)

In [122]:
# Label-encode profileId. Codes follow the order in which values first appear
# in train; test values missing from the mapping get the extra code len(map).
profileId_map = {
    value: code
    for code, value in enumerate(train['profileId'].unique())
}

train['profileId'] = (
    train['profileId']
    .map(profileId_map)
    .astype('UInt16')
)
test['profileId'] = (
    test['profileId']
    .map(profileId_map)
    .fillna(len(profileId_map))
    .astype('UInt16')
)

In [123]:
# Store ranker_id as a pandas categorical in both frames. Each frame is cast
# on its own, so train and test get independent category sets.
train['ranker_id'] = (
    train['ranker_id'].astype('category')
)
test['ranker_id'] = (
    test['ranker_id'].astype('category')
)

In [124]:
# Store searchRoute as a pandas categorical in both frames. Each frame is cast
# on its own, so train and test get independent category sets.
train['searchRoute'] = (
    train['searchRoute'].astype('category')
)
test['searchRoute'] = (
    test['searchRoute'].astype('category')
)

In [125]:
# Derive origin/destination columns from searchRoute for both frames.
# The part before '/' gives the departure pair (characters 0-3 and 3-6); the
# part after '/', when present, gives the return pair.
# NOTE(review): presumably routes look like "AAABBB" or "AAABBB/BBBAAA" with
# 3-letter airport codes — confirm against the data.
for frame in (train, test):
    parts = frame['searchRoute'].str.split('/', expand=True)
    if 1 not in parts.columns:
        # No route in this frame contains '/', so split() produced only
        # column 0. Add an all-missing return leg so the lookups below do not
        # raise KeyError and every return value becomes the -2 sentinel.
        parts[1] = pd.Series(pd.NA, index=parts.index, dtype='object')

    frame['departure_origin'] = parts[0].str.slice(0, 3)
    frame['departure_destination'] = parts[0].str.slice(3, 6)

    # Rows without a return leg get -2, the same placeholder the notebook
    # uses for missing values elsewhere.
    has_return = parts[1].notna()
    frame['return_origin'] = np.where(has_return, parts[1].str.slice(0, 3), -2)
    frame['return_destination'] = np.where(has_return, parts[1].str.slice(3, 6), -2)

In [126]:
# Label-encode departure_origin. Codes follow the order in which values first
# appear in train; test values missing from the mapping get the extra code
# len(map).
departure_origin_map = {
    value: code
    for code, value in enumerate(train['departure_origin'].unique())
}

train['departure_origin'] = (
    train['departure_origin']
    .map(departure_origin_map)
    .astype('UInt16')
)
test['departure_origin'] = (
    test['departure_origin']
    .map(departure_origin_map)
    .fillna(len(departure_origin_map))
    .astype('UInt16')
)

In [127]:
# Label-encode departure_destination. Codes follow the order in which values
# first appear in train; test values missing from the mapping get the extra
# code len(map).
departure_destination_map = {
    value: code
    for code, value in enumerate(train['departure_destination'].unique())
}

train['departure_destination'] = (
    train['departure_destination']
    .map(departure_destination_map)
    .astype('UInt16')
)
test['departure_destination'] = (
    test['departure_destination']
    .map(departure_destination_map)
    .fillna(len(departure_destination_map))
    .astype('UInt16')
)

In [128]:
# Label-encode return_origin. Codes follow the order in which values first
# appear in train; test values missing from the mapping get the extra code
# len(map).
return_origin_map = {
    value: code
    for code, value in enumerate(train['return_origin'].unique())
}

train['return_origin'] = (
    train['return_origin']
    .map(return_origin_map)
    .astype('UInt16')
)
test['return_origin'] = (
    test['return_origin']
    .map(return_origin_map)
    .fillna(len(return_origin_map))
    .astype('UInt16')
)

In [129]:
# Label-encode return_destination. Codes follow the order in which values
# first appear in train; test values missing from the mapping get the extra
# code len(map).
# NOTE(review): stored as UInt8 while return_origin uses UInt16 — confirm the
# narrower dtype is intentional.
return_destination_map = {
    value: code
    for code, value in enumerate(train['return_destination'].unique())
}

train['return_destination'] = (
    train['return_destination']
    .map(return_destination_map)
    .astype('UInt8')
)
test['return_destination'] = (
    test['return_destination']
    .map(return_destination_map)
    .fillna(len(return_destination_map))
    .astype('UInt8')
)

In [130]:
# searchRoute has been split into the origin/destination columns above, so
# drop the raw column from both frames.
del train['searchRoute'], test['searchRoute']

In [131]:
# Round taxes to the nearest whole unit and store as unsigned 32-bit integers.
train['taxes'] = (
    train['taxes'].round(decimals=0).astype('UInt32')
)
test['taxes'] = (
    test['taxes'].round(decimals=0).astype('UInt32')
)

In [132]:
# Round totalPrice to the nearest whole unit and store as unsigned 32-bit
# integers.
train['totalPrice'] = (
    train['totalPrice'].round(decimals=0).astype('UInt32')
)
test['totalPrice'] = (
    test['totalPrice'].round(decimals=0).astype('UInt32')
)

In [133]:
# Cast selected to a plain boolean column; only train is converted here.
train['selected'] = train['selected'].astype(bool)

In [134]:
# Report the in-memory size of train after the dtype reductions.
# deep=True also counts the contents of object columns.
memory_bytes = train.memory_usage(index=True, deep=True).sum()
memory_mb = memory_bytes / 2 ** 20  # bytes -> MiB
# The message is Russian for "DataFrame size in memory: ... MB".
print(f"Размер DataFrame в памяти: {memory_mb:.2f} МБ")

Размер DataFrame в памяти: 6762.19 МБ


In [135]:
# Name the output files after the folder the notebook runs in, e.g. a working
# directory ending in ".../1" gives "1_train.parquet" and "1_test.parquet".
current_dir = os.getcwd()
folder_name = os.path.split(current_dir)[1]

train_path, test_path = f"{folder_name}_train.parquet", f"{folder_name}_test.parquet"

In [136]:
# Write both processed frames to parquet in the current working directory,
# leaving the DataFrame index out of the files.
for frame, path in ((train, train_path), (test, test_path)):
    frame.to_parquet(path, index=False)

In [137]:
# Display the file name the processed train set was written to.
train_path

'1_train.parquet'

In [138]:
# Display the working directory the output file names were derived from.
current_dir

'C:\\Users\\Николай\\PycharmProjects\\FlightRank_2025\\mydata\\1'