### Setup

#### Imports

In [75]:
import pandas as pd
import os

#### Paths/Folders

In [76]:
data_folder = "data"

clean_data_folder = "cleaned_data"

if not os.path.exists(clean_data_folder):
    os.mkdir(clean_data_folder)

#### Helper Functions

In [77]:
def load(filename):
    csv = os.path.join(data_folder, filename)
    return pd.read_csv(csv)

def save(dataframe, filename):
    csv = os.path.join(clean_data_folder, filename)
    dataframe.to_csv(csv, index=False)

### Data Cleaning

#### Load Factor Data

In [78]:
load_factor_data = load("load_factor_data.csv")

# Rename organization_code_iata to airline_iata, period_end_date to date, and passenger_load_factor_data_pct to load_factor_data
load_factor_data = load_factor_data.rename(
    columns={
        "organization_code_iata": "airline_iata",
        "period_end_date": "date",
        "passenger_load_factor_data_pct": "load_factor_data",
    }
)

# Drop organization_code_icao and period_start_date
load_factor_data = load_factor_data.drop(
    columns=["organization_code_icao", "period_start_date"]
)

# Keep rows where accounting_period is month {number}
load_factor_data = load_factor_data[
    load_factor_data["accounting_period"].str.contains("Month")
]

# Drop accounting_period
load_factor_data = load_factor_data.drop(columns=["accounting_period"])

# Convert dates to yyyy-mm format
load_factor_data["date"] = pd.to_datetime(load_factor_data["date"]).dt.strftime("%Y-%m")

save(load_factor_data, "load_factor_data.csv")

load_factor_data.head()

Unnamed: 0,airline_iata,date,passenger_load_factor_pct
0,EI,2020-07,24.696155
1,EI,2020-08,29.712449
2,EI,2020-09,25.965699
3,EI,2020-10,17.795881
4,EI,2020-11,15.954516


#### Submission

In [79]:
submission = load("submission.csv")

# Combine load_year and load_month into new date column at the front in yyyy-mm format
submission.insert(
    0,
    "date",
    pd.to_datetime(
        submission["load_year"].astype(str) + "-" + submission["load_month"].astype(str)
    ).dt.strftime("%Y-%m"),
)

# Drop load_year and load_month
submission = submission.drop(columns=["load_year", "load_month"])

# Rename organization_code_iata to airline_iata
submission = submission.rename(
    columns={"organization_code_iata": "airline_iata"}
)

# Rename forecast_load_factor_pct to load_factor
submission = submission.rename(
    columns={"forecast_load_factor_pct": "load_factor"}
)

save(submission, "submission.csv")

submission.head()

Unnamed: 0,date,airline_iata,load_factor
0,2022-01,G4,
1,2022-02,G4,
2,2022-03,G4,
3,2022-04,G4,
4,2022-05,G4,


#### Country

In [80]:
country = load("country.csv")

# Rename country to country_name, iso to country_iso, iata to country_iata, and numeric to country_numeric
country = country.rename(
    columns={
        "country": "country_name",
        "iso": "country_iso",
        "iata": "country_iata",
        "numeric": "country_numeric"
    }
)

save(country, "country.csv")

country.head()

Unnamed: 0,country_name,country_iata,country_iso,country_numeric
0,Afghanistan,AF,AFG,4
1,Åland Islands,AX,ALA,248
2,Albania,AL,ALB,8
3,Algeria,DZ,DZA,12
4,American Samoa,AS,ASM,16


#### GDP

In [81]:
gdp = load("gdp.csv")

# Rename location_iso to country_iso
gdp = gdp.rename(columns={"location_iso": "country_iso"})

# Drop indicator and subject
gdp = gdp.drop(columns=["indicator", "subject"])

# Drop rows where frequency is annual
gdp = gdp[gdp["frequency"] != "A"]

# Drop frequency
gdp = gdp.drop(columns=["frequency"])

# Rename time to date and value to gdp
gdp = gdp.rename(columns={"time": "date", "value": "gdp"})

# Convert dates to yyyy-mm format
gdp = gdp.assign(date=pd.to_datetime(gdp["date"]).dt.strftime("%Y-%m"))

# Drop rows where measure is PC_CHGPY
gdp = gdp[gdp["measure"] != "PC_CHGPY"]

# Drop measure
gdp = gdp.drop(columns=["measure"])

# Create new rows for each month in date column
gdp = gdp.assign(
    date=pd.to_datetime(gdp["date"])
    .dt.to_period("M")
    .apply(lambda r: r.to_timestamp("M"))
    .dt.strftime("%Y-%m")
)

save(gdp, "gdp.csv")

gdp.head()

Unnamed: 0,country_iso,date,gdp
47,KOR,2017-01,0.984785
48,KOR,2017-04,0.714648
49,KOR,2017-07,1.441838
50,KOR,2017-10,-0.319869
51,KOR,2018-01,1.205852


#### Seats

In [87]:
seats = load("seats.csv")

# Rename operating_airline to airline_iata and total_seats to seats
seats = seats.rename(columns={"operating_airline": "airline_iata", "total_seats": "seats"})

# Combine seats_year and seats_month into new date column before total_seats column in yyyy-mm format
seats.insert(
    3,
    "date",
    pd.to_datetime(
        seats["seats_year"].astype(str) + "-" + seats["seats_month"].astype(str)
    ).dt.strftime("%Y-%m"),
)

# Drop seats_year and seats_month
seats = seats.drop(columns=["seats_year", "seats_month"])

save(seats, "seats.csv")

seats.head()

Unnamed: 0,airline_iata,departure_country_iata,arrival_country_iata,date,seats
0,8E,US,US,2017-10,3130
1,AA,US,US,2021-12,17677677
2,BB,VI,VI,2020-03,2295
3,NH,JP,JP,2020-08,3146684
4,NX,CN,MO,2022-02,58849


#### Validation

In [88]:
validation = load("validation.csv")

# Drop load_year, load_month, and organization_code_icao
validation = validation.drop(columns=["load_year", "load_month", "organization_code_icao"])

# Rename time to date, organization_code_iata to airline_iata, and passenger_load_factor_pct to load_factor
validation = validation.rename(
    columns={
        "time": "date",
        "organization_code_iata": "airline_iata",
        "passenger_load_factor_pct": "load_factor",
    }
)

# Convert date to yyyy-mm format
validation["date"] = pd.to_datetime(validation["date"]).dt.strftime("%Y-%m")

save(validation, "validation.csv")

validation.head()

Unnamed: 0,date,airline_iata,load_factor
0,2022-01,5T,58.690042
1,2022-02,5T,63.308469
2,2022-03,5T,64.637411
3,2022-04,5T,66.932459
4,2022-05,5T,67.446506
