# PRÉDICTION DES RETARDS DE VOL


## 1. IMPORT DES LIBRAIRIES


In [1]:
import pandas as pd
from os.path import join
from sklearn.model_selection import train_test_split

## 2. CHARGEMENT DES DONNÉES


Les données sont réparties en 12 fichiers CSV, un par mois de l'année 2016.  
Chaque fichier comporte en moyenne environ 470 000 lignes, ce qui représente un total de plus de 5,5 millions de lignes. Il nous faut donc trouver une stratégie pour épargner notre mémoire.


In [2]:
# Relative path ("../data/raw") of the directory the raw input files are read from.
raw_data_path = join("..", "data", "raw")

In [3]:
# Let pandas display up to 65 rows and 65 columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 65)

# One dataframe per successfully parsed monthly file, in calendar order.
df_list = []
# Running total of the rows loaded so far.
total_lines_count = 0

for month in range(1, 13):
    data_path = join(raw_data_path, f"2016_{month:02}.csv")
    try:
        # Malformed rows are skipped with a warning rather than aborting the load.
        month_df = pd.read_csv(data_path, on_bad_lines="warn", low_memory=False)
    except pd.errors.ParserError as err:
        print(f"ParserError for {data_path}: {err}")
    else:
        # Only reached when the file was parsed: keep it and account for its rows.
        lines_count = len(month_df)
        df_list.append(month_df)
        total_lines_count += lines_count
        print(f"{lines_count} lines loaded from {data_path} ✅")
    print("-" * 50)

print(f"For a total lines count of: {total_lines_count}")


445827 lines loaded from ../data/raw/2016_01.csv ✅
--------------------------------------------------
423889 lines loaded from ../data/raw/2016_02.csv ✅
--------------------------------------------------
479122 lines loaded from ../data/raw/2016_03.csv ✅
--------------------------------------------------


Skipping line 386249: expected 65 fields, saw 83
Skipping line 388291: expected 65 fields, saw 78
Skipping line 389371: expected 65 fields, saw 72
Skipping line 389548: expected 65 fields, saw 81
Skipping line 453858: expected 65 fields, saw 97

  month_df = pd.read_csv(data_path, on_bad_lines="warn", low_memory=False)


479950 lines loaded from ../data/raw/2016_04.csv ✅
--------------------------------------------------
479358 lines loaded from ../data/raw/2016_05.csv ✅
--------------------------------------------------
487637 lines loaded from ../data/raw/2016_06.csv ✅
--------------------------------------------------
502457 lines loaded from ../data/raw/2016_07.csv ✅
--------------------------------------------------
498347 lines loaded from ../data/raw/2016_08.csv ✅
--------------------------------------------------
454878 lines loaded from ../data/raw/2016_09.csv ✅
--------------------------------------------------
472626 lines loaded from ../data/raw/2016_10.csv ✅
--------------------------------------------------
450938 lines loaded from ../data/raw/2016_11.csv ✅
--------------------------------------------------
460949 lines loaded from ../data/raw/2016_12.csv ✅
--------------------------------------------------
For a total lines count of: 5635978


Nous voilà donc avec une liste Python des dataframes issus des 12 fichiers CSV initiaux, pour un total de 5 635 978 lignes (soit plus de 5,6 millions de lignes, comme anticipé).


In [4]:
# Stack the monthly dataframes into a single one; ignore_index=True discards
# the per-file indexes and renumbers the rows from 0.
concat_df = pd.concat(df_list, ignore_index=True)

Nous avons identifié la colonne `ARR_DEL15` comme étant notre cible (target). Il est indispensable d'exclure toute ligne sans cible.


In [5]:
# Flag the rows whose target is missing, then report how many there are and
# what share of all loaded rows they represent.
missing_target_mask = concat_df["ARR_DEL15"].isna()
empty_target_lines_count = missing_target_mask.sum()

empty_target_ratio = empty_target_lines_count / total_lines_count
empty_target_percentage = empty_target_ratio * 100

count_message = f"{empty_target_lines_count} lines have an empty target."
share_message = f"The empty target lines represent {empty_target_percentage:.2f}% of the original dataset."
print(count_message)
print(share_message)

79781 lines have an empty target.
The empty target lines represent 1.42% of the original dataset.


Nous allons maintenant supprimer les lignes dont la cible est manquante.


In [None]:
# Drop every row without a target, then cast the target to bool.
# The cast is done with DataFrame.astype on the dropna result instead of
# assigning a column on it afterwards: that assignment is the chained-assignment
# pattern that makes pandas emit SettingWithCopyWarning. astype returns a new
# dataframe, so clean_df is an independent object.
clean_df = concat_df.dropna(subset=["ARR_DEL15"]).astype({"ARR_DEL15": "bool"})

In [7]:
# Sanity check: the cleaned dataframe must contain exactly the loaded rows
# minus the rows that had no target.
actual_clean_df_line_count = len(clean_df)
expected_clean_df_line_count = int(total_lines_count - empty_target_lines_count)

if actual_clean_df_line_count != expected_clean_df_line_count:
    print("Something wrong happened while cleaning dataframe")
else:
    print(
        f"{empty_target_lines_count} lines with empty target have been dropped with success."
    )

79781 lines with empty target have been dropped with success.


In [8]:
# Persist the cleaned dataset as Parquet.
# The target file is named *.parquet, so it has to be written with to_parquet:
# writing it with to_csv produced a CSV file that Parquet readers cannot open.
# Note: to_parquet needs a Parquet engine (pyarrow or fastparquet) installed.
clean_data_path = join(raw_data_path, "clean_concat.parquet")
clean_df.to_parquet(clean_data_path)

## 3. EXPLORATION DES DONNÉES


Notre dataset est très volumineux : pour des questions de mémoire, nous allons travailler sur un échantillon représentatif du dataset en termes de cible.


In [None]:
# Draw a 10% sample that preserves the ARR_DEL15 class proportions
# (stratified split, fixed seed for reproducibility).
# Only the first element of the split is kept: unpacking the remaining 90%
# into `_` would leave that large dataframe referenced by a global name
# (and shadow IPython's last-output variable) for no benefit.
df_sample = train_test_split(
    clean_df,
    train_size=0.1,
    stratify=clean_df["ARR_DEL15"],
    random_state=42,
)[0]

- La colonne `FL_DATE` contient à elle seule toutes les informations temporelles. Du point de vue du stockage, il pourrait être intéressant de ne conserver que cette donnée temporelle et de supprimer les colonnes `YEAR`, `QUARTER`, `MONTH`, `DAY_OF_MONTH` et `DAY_OF_WEEK`.


In [None]:
# Work on a copy so that later transformations leave clean_df untouched.
# NOTE(review): this copies the full cleaned dataset, not df_sample, although
# the text above says the exploration should run on the sample to save
# memory — confirm which one is intended.
df = clean_df.copy()

## 4. VISUALISATIONS


## 5. MODÉLISATION
