In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import os
import json

# Opt in to pandas' future "no silent downcasting" behaviour.
# NOTE(review): presumably enabled because of the replace()/fillna() calls
# further down in this notebook — confirm against the pandas options docs.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Directory holding the raw JSON files, one "<ranker_id>.json" per search.
# The FLIGHTRANK_RAW_DIR environment variable overrides the location so the
# notebook can run on another machine; without it the original local path
# is used unchanged.
RAW_DIR = os.environ.get(
    "FLIGHTRANK_RAW_DIR",
    "C:/Users/Николай/PycharmProjects/FlightRank_2025/data/raw",
)

In [3]:
# Load the training split from its parquet file using the pyarrow engine.
train = pd.read_parquet(
    path="C:/Users/Николай/PycharmProjects/FlightRank_2025/mydata/1/1_train.parquet",
    engine="pyarrow",
)

In [4]:
# Load the test split from its parquet file using the pyarrow engine.
test = pd.read_parquet(
    path="C:/Users/Николай/PycharmProjects/FlightRank_2025/mydata/1/1_test.parquet",
    engine="pyarrow",
)

In [5]:
def load_ranker_info(ranker_ids, raw_dir=RAW_DIR):
    """Read the per-search JSON files and collect selected fields in a DataFrame.

    For every id in ``ranker_ids`` the file ``<raw_dir>/<id>.json`` is parsed
    and five fields are pulled from its ``metadata``, ``personalData`` and
    ``routeData`` sections. Ids without a file on disk are skipped. A field
    that is absent, or whose whole section is absent or ``null``, is stored
    as ``None``.

    Parameters
    ----------
    ranker_ids : iterable
        Identifiers used to build the JSON file names.
    raw_dir : str
        Directory that contains the ``<ranker_id>.json`` files.

    Returns
    -------
    pandas.DataFrame
        One row per JSON file found, indexed by ``ranker_id``, with columns
        ``searchType``, ``yearOfBirth``, ``hasAssistant``, ``isGlobal`` and
        ``requestDepartureDate``. When no file is found the frame is empty
        but still carries these columns and the ``ranker_id`` index.
    """
    columns = [
        "ranker_id",
        "searchType",
        "yearOfBirth",
        "hasAssistant",
        "isGlobal",
        "requestDepartureDate",
    ]
    data_list = []
    for rid in tqdm(ranker_ids, desc="Loading JSON"):
        file_path = os.path.join(raw_dir, f"{rid}.json")
        if not os.path.exists(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # `.get(key, {})` only covers a missing key; a section stored as JSON
        # null would come back as None and break the chained `.get`.
        metadata = data.get("metadata") or {}
        personal = data.get("personalData") or {}
        route = data.get("routeData") or {}

        data_list.append({
            "ranker_id": rid,
            "searchType": metadata.get("searchType"),
            "yearOfBirth": personal.get("yearOfBirth"),
            "hasAssistant": personal.get("hasAssistant"),
            "isGlobal": personal.get("isGlobal"),
            "requestDepartureDate": route.get("requestDepartureDate"),
        })

    # Passing the column list explicitly keeps `set_index` from raising
    # KeyError when no JSON file was found and `data_list` is empty.
    return pd.DataFrame(data_list, columns=columns).set_index("ranker_id")


def add_ranker_info(df, info_df):
    """Attach the per-search attributes to ``df`` via a lookup on ``ranker_id``.

    ``info_df`` is expected to be indexed by ranker id. Every row of ``df``
    is kept; rows whose ``ranker_id`` is not in that index get missing
    values in the added columns. A new DataFrame is returned.
    """
    enriched = df.join(info_df, on="ranker_id", how="left")
    return enriched


# Gather every distinct ranker_id that occurs in either split, read the
# matching JSON files once, then attach the extracted fields to both frames.
all_rankers = pd.concat(
    [train["ranker_id"], test["ranker_id"]]
).unique()
info_df = load_ranker_info(all_rankers, raw_dir=RAW_DIR)

train = add_ranker_info(df=train, info_df=info_df)
test = add_ranker_info(df=test, info_df=info_df)

Loading JSON: 100%|██████████| 150770/150770 [16:34<00:00, 151.63it/s]


In [27]:
# Show the dtype of every column in the training frame.
# NOTE(review): this cell has execution count 27 while the conversion cells
# that follow it in the file have counts 12-26, so the displayed dtypes
# reflect the state after those conversions were applied.
train.dtypes

companyID                       UInt16
corporateTariffCode              UInt8
frequentFlyer                   object
nationality                      UInt8
isAccess3D                        bool
                             ...      
searchType                        bool
yearOfBirth                      UInt8
hasAssistant                      bool
isGlobal                          bool
requestDepartureDate    datetime64[ns]
Length: 131, dtype: object

In [12]:
# Cast the searchType column to a plain bool dtype in both splits.
# NOTE(review): astype('bool') turns NaN into True — confirm that no row is
# missing searchType (e.g. a ranker_id without a JSON file) before relying on it.
for split_df in (train, test):
    split_df['searchType'] = split_df['searchType'].astype('bool')

In [19]:
# Fold two birth years of the test split into the preceding year.
# NOTE(review): presumably 2007 and 2014 do not occur in train and would
# otherwise end up in the "unseen" bucket of the encoding — TODO confirm.
test["yearOfBirth"] = test["yearOfBirth"].replace(
    to_replace={2007.0: 2006.0, 2014.0: 2013.0},
)

In [21]:
# Label-encode yearOfBirth: every distinct value found in train receives an
# integer code, numbered in the order `unique()` returns the values.
train_birth_years = train['yearOfBirth'].unique()
yearOfBirth_map = dict(zip(train_birth_years, range(len(train_birth_years))))

train['yearOfBirth'] = train['yearOfBirth'].map(yearOfBirth_map).astype('UInt8')

# Test values that are not in the map share one extra code equal to the map size.
test['yearOfBirth'] = (
    test['yearOfBirth']
    .map(yearOfBirth_map)
    .fillna(len(yearOfBirth_map))
    .astype('UInt8')
)

In [26]:
# Parse requestDepartureDate into pandas datetimes in both splits.
for split_df in (train, test):
    split_df['requestDepartureDate'] = pd.to_datetime(split_df['requestDepartureDate'])

In [28]:
# Report the in-memory size of the training frame; deep=True also counts
# the contents of object columns.
per_column_bytes = train.memory_usage(deep=True)
memory_bytes = per_column_bytes.sum()
memory_mb = memory_bytes / (1 << 20)  # 1 << 20 == 1024 ** 2 bytes per MB
print(f"Размер DataFrame в памяти: {memory_mb:.2f} МБ")

Размер DataFrame в памяти: 8306.63 МБ


In [29]:
# Name the output files after the folder the notebook is run from:
# "<folder>_train.parquet" and "<folder>_test.parquet".
current_dir = os.getcwd()
folder_name = os.path.split(current_dir)[1]

train_path, test_path = (
    f"{folder_name}_{split_name}.parquet" for split_name in ("train", "test")
)

In [31]:
# Write both enriched splits to parquet, leaving the DataFrame index out of the files.
train.to_parquet(path=train_path, index=False)
test.to_parquet(path=test_path, index=False)