In [1]:
import pandas as pd
from os.path import join
import os
from sklearn.model_selection import train_test_split

In [2]:
# Relative location of the raw data files: <parent dir>/data/raw.
raw_data_path = os.path.join(os.pardir, "data", "raw")

In [3]:
# Names of the columns that the CSV -> Parquet conversion casts to pandas'
# nullable "Int64" dtype. Order is kept exactly as originally listed.
cols_to_int = [
    "YEAR", "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "AIRLINE_ID", "FL_NUM",
    # ORIGIN_* columns
    "ORIGIN_AIRPORT_ID", "ORIGIN_AIRPORT_SEQ_ID", "ORIGIN_CITY_MARKET_ID",
    "ORIGIN_STATE_FIPS", "ORIGIN_WAC",
    # DEST_* columns
    "DEST_AIRPORT_ID", "DEST_AIRPORT_SEQ_ID", "DEST_CITY_MARKET_ID",
    "DEST_STATE_FIPS", "DEST_WAC",
    # DEP_* columns and related
    "CRS_DEP_TIME", "DEP_TIME", "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15",
    "DEP_DELAY_GROUP", "TAXI_OUT", "WHEELS_OFF",
    # ARR_* columns and related
    "WHEELS_ON", "TAXI_IN", "CRS_ARR_TIME", "ARR_TIME", "ARR_DELAY",
    "ARR_DELAY_NEW", "ARR_DEL15", "ARR_DELAY_GROUP",
    # remaining columns
    "CANCELLED", "DIVERTED", "CRS_ELAPSED_TIME", "ACTUAL_ELAPSED_TIME",
    "AIR_TIME", "FLIGHTS", "DISTANCE", "DISTANCE_GROUP",
    # *_DELAY columns
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "FIRST_DEP_TIME", "TOTAL_ADD_GTIME", "LONGEST_ADD_GTIME",
]

In [4]:
# Convert each monthly 2016 CSV file found under `raw_data_path` into a Parquet
# file written next to it, casting the columns named in `cols_to_int` to the
# nullable "Int64" dtype along the way.
for month in range(1, 13):
    data_path = join(raw_data_path, f"2016_{month:02}.csv")
    # Rows with an unexpected number of fields are skipped with a warning
    # instead of aborting the whole month.
    df = pd.read_csv(data_path, on_bad_lines="warn", low_memory=False)

    # Remove the "Unnamed: 64" column when it exists.
    # NOTE(review): presumably an empty column caused by a trailing delimiter
    # in the raw files -- confirm. errors="ignore" keeps the conversion working
    # for a file that does not contain it (previously a KeyError).
    df = df.drop(columns=["Unnamed: 64"], errors="ignore")

    if month == 4:
        # File-specific cleanup for April: drop the row labelled 461814 and all
        # rows whose MONTH equals 3.
        # NOTE(review): the row label is hard-coded and shifts if the set of
        # skipped bad lines changes -- re-check whenever the raw file changes.
        df = df.drop(index=461814)
        df = df[df["MONTH"] != 3]

    for col in cols_to_int:
        if col in df.columns:
            # Values that cannot be parsed as numbers become <NA> rather than
            # raising.
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    # Swap only the file extension; str.replace(".csv", ...) would also rewrite
    # any other ".csv" occurring earlier in the path.
    parquet_path = os.path.splitext(data_path)[0] + ".parquet"
    df.to_parquet(parquet_path)
    print(f"Converti : {data_path} -> {os.path.basename(parquet_path)}")

Converti : ../data/raw/2016_01.csv -> 2016_01.parquet
Converti : ../data/raw/2016_02.csv -> 2016_02.parquet
Converti : ../data/raw/2016_03.csv -> 2016_03.parquet


Skipping line 386249: expected 65 fields, saw 83
Skipping line 388291: expected 65 fields, saw 78
Skipping line 389371: expected 65 fields, saw 72
Skipping line 389548: expected 65 fields, saw 81
Skipping line 453858: expected 65 fields, saw 97

  df = pd.read_csv(data_path, on_bad_lines="warn", low_memory=False)


Converti : ../data/raw/2016_04.csv -> 2016_04.parquet
Converti : ../data/raw/2016_05.csv -> 2016_05.parquet
Converti : ../data/raw/2016_06.csv -> 2016_06.parquet
Converti : ../data/raw/2016_07.csv -> 2016_07.parquet
Converti : ../data/raw/2016_08.csv -> 2016_08.parquet
Converti : ../data/raw/2016_09.csv -> 2016_09.parquet
Converti : ../data/raw/2016_10.csv -> 2016_10.parquet
Converti : ../data/raw/2016_11.csv -> 2016_11.parquet
Converti : ../data/raw/2016_12.csv -> 2016_12.parquet


In [5]:
# Raise the pandas display limits to 65 rows / 65 columns.
pd.set_option("display.max_rows", 65)
pd.set_option("display.max_columns", 65)

# Load the twelve monthly Parquet files into `df_list`, keeping a running
# total of the number of rows read.
df_list = []
total_lines_count = 0

for month in range(1, 13):
    data_path = join(raw_data_path, f"2016_{month:02}.parquet")
    try:
        month_df = pd.read_parquet(data_path)
    except (OSError, ValueError) as err:
        # pd.errors.ParserError comes from the CSV parser, so catching only
        # that left Parquet read failures unhandled. A missing or unreadable
        # file raises OSError (e.g. FileNotFoundError); ValueError is caught
        # for engine errors on invalid files (NOTE(review): pyarrow's
        # ArrowInvalid derives from ValueError -- confirm for the engine in
        # use). ParserError subclasses ValueError, so it is still covered.
        print(f"{type(err).__name__} for {data_path}: {err}")
    else:
        df_list.append(month_df)
        lines_count = len(month_df)
        total_lines_count += lines_count
        print(f"{lines_count} lines loaded from {data_path} ✅")
    print("-" * 50)

print(f"For a total lines count of: {total_lines_count}")


445827 lines loaded from ../data/raw/2016_01.parquet ✅
--------------------------------------------------
423889 lines loaded from ../data/raw/2016_02.parquet ✅
--------------------------------------------------
479122 lines loaded from ../data/raw/2016_03.parquet ✅
--------------------------------------------------
362832 lines loaded from ../data/raw/2016_04.parquet ✅
--------------------------------------------------
479358 lines loaded from ../data/raw/2016_05.parquet ✅
--------------------------------------------------
487637 lines loaded from ../data/raw/2016_06.parquet ✅
--------------------------------------------------
502457 lines loaded from ../data/raw/2016_07.parquet ✅
--------------------------------------------------
498347 lines loaded from ../data/raw/2016_08.parquet ✅
--------------------------------------------------
454878 lines loaded from ../data/raw/2016_09.parquet ✅
--------------------------------------------------
472626 lines loaded from ../data/raw/2016_10.p

In [6]:
# Stack the monthly frames collected in `df_list` row-wise into a single
# DataFrame, discarding the per-file indexes in favour of a fresh 0..n-1 one.
concat_df = pd.concat(objs=df_list, axis=0, ignore_index=True)

In [7]:
# Write the combined frame to "complete.parquet" inside `raw_data_path`.
concat_df.to_parquet(path=join(raw_data_path, "complete.parquet"))