In [107]:
# Imports
import pandas as pd
import re

## Load Raw Dataframe

---

In [108]:
# Read the raw sample dataset from disk into a DataFrame
raw_csv_path = '../data/raw/Sample_Data.csv'
aig_df = pd.read_csv(raw_csv_path)

## Data Transformation

---

### **Transformation (EDA-1)**

Impute missing values: `Subsidiary` with 'NS', `Subsidiary Name` with 'No subsidiary', and `Aircraft type` with 'Not specified'.

In [109]:
# Fill the gaps in the subsidiary and aircraft columns with explicit placeholder labels
placeholder_by_column = {
    'Subsidiary': 'NS',
    'Subsidiary Name': 'No subsidiary',
    'Aircraft type': 'Not specified',
}
for column_name, placeholder in placeholder_by_column.items():
    aig_df[column_name] = aig_df[column_name].fillna(placeholder)

# Confirm that no missing values remain
aig_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3460 entries, 0 to 3459
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   OpCo                    3460 non-null   object
 1   OpCo Name               3460 non-null   object
 2   Subsidiary              3460 non-null   object
 3   Subsidiary Name         3460 non-null   object
 4   Departure Airport       3460 non-null   object
 5   Departure Airport Name  3460 non-null   object
 6   Departure Country       3460 non-null   object
 7   Departure Country Name  3460 non-null   object
 8   Departure Region        3460 non-null   object
 9   Arrival Airport         3460 non-null   object
 10  Arrival Airport Name    3460 non-null   object
 11  Arrival Country         3460 non-null   object
 12  Arrival Country Name    3460 non-null   object
 13  Arrival Region          3460 non-null   object
 14  Aircraft type           3460 non-null   object
 15  Date

### **Transformation (EDA-2)**

Replace 'Vueling+' instance with 'Vueling' for consistency in `OpCo Name` column.

In [110]:
# Normalise the 'Vueling+' label to 'Vueling' in the 'OpCo Name' column
opco_name_fixes = {'Vueling+': 'Vueling'}
aig_df['OpCo Name'] = aig_df['OpCo Name'].replace(opco_name_fixes)

# List the distinct operating-company names after the fix
print(aig_df['OpCo Name'].unique())

['Vueling' 'British Airways' 'Level' 'Iberia' 'Aer Lingus'
 'Iberia Express']


### **Transformation (EDA-3)**

Remove special characters from `Arrival Country Name` and `Departure Country Name` to standardize the country names in English.

In [111]:
# Clean and standardize country names
def clean_country_names(name):
    """Trim `name`, drop disallowed characters, and return it in title case.

    Only ASCII letters, digits, whitespace, '.', ',' and '-' are kept;
    every other character is removed before title-casing.
    """
    trimmed = name.strip()
    allowed_only = re.sub(r'[^a-zA-Z0-9\s\.,-]', '', trimmed)
    return allowed_only.title()

# Run the cleaner over both country-name columns, overwriting each in turn
for country_column in ('Arrival Country Name', 'Departure Country Name'):
    aig_df[country_column] = aig_df[country_column].apply(clean_country_names)

In [112]:
# Validation
regex = r'[^a-zA-Z0-9\s\.,-]' # Characters that should no longer appear after cleaning

# Flag every row where either country-name column still matches the pattern
country_name_columns = ['Departure Country Name', 'Arrival Country Name']
has_invalid_chars = aig_df[country_name_columns].apply(lambda col: col.str.contains(regex))
invalid_rows_df = aig_df[has_invalid_chars.any(axis=1)]

# Report how many rows were flagged
print(f"Instances with invalid characters in 'Country Names': {len(invalid_rows_df[['Departure Country Name', 'Arrival Country Name']])}")

Instances with invalid characters in 'Country Names': 0


### **Transformation (EDA-4)**

Convert the `Date` column from date strings to datetime objects, which allows us to perform datetime operations on this column.

In [113]:
# Parse the 'Date' strings into datetime values
aig_df['Date'] = aig_df['Date'].pipe(pd.to_datetime)

# Inspect the dtypes to confirm the conversion
aig_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3460 entries, 0 to 3459
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   OpCo                    3460 non-null   object        
 1   OpCo Name               3460 non-null   object        
 2   Subsidiary              3460 non-null   object        
 3   Subsidiary Name         3460 non-null   object        
 4   Departure Airport       3460 non-null   object        
 5   Departure Airport Name  3460 non-null   object        
 6   Departure Country       3460 non-null   object        
 7   Departure Country Name  3460 non-null   object        
 8   Departure Region        3460 non-null   object        
 9   Arrival Airport         3460 non-null   object        
 10  Arrival Airport Name    3460 non-null   object        
 11  Arrival Country         3460 non-null   object        
 12  Arrival Country Name    3460 non-null   object  

### **Transformation (EDA-6)**

Adjust column names and add necessary fields to optimize for data modeling and integration into data warehouses.

In [114]:
# Display the current column labels of aig_df
aig_df.columns

Index(['OpCo', 'OpCo Name', 'Subsidiary', 'Subsidiary Name',
       'Departure Airport', 'Departure Airport Name', 'Departure Country',
       'Departure Country Name', 'Departure Region', 'Arrival Airport',
       'Arrival Airport Name', 'Arrival Country', 'Arrival Country Name',
       'Arrival Region', 'Aircraft type', 'Date', 'Cabin', 'Service',
       '# Passengers', '# Flights'],
      dtype='object')

In [115]:
# Rename columns to underscore-separated names without spaces or '#' prefixes.
# Columns absent from the mapping ('Date', 'Cabin', 'Service') keep their names.
# The renamed frame is reassigned instead of mutating with inplace=True, so the
# cell yields a new object and stays chainable.
aig_df = aig_df.rename(columns={
    'OpCo': 'OpCo_Code',
    'OpCo Name': 'OpCo_Name',
    'Subsidiary': 'Subsidiary_Code',
    'Subsidiary Name': 'Subsidiary_Name',
    'Departure Airport': 'Departure_Airport_Code',
    'Departure Airport Name': 'Departure_Airport_Name',
    'Departure Country': 'Departure_Country_Code',
    'Departure Country Name': 'Departure_Country_Name',
    'Departure Region': 'Departure_Region',
    'Arrival Airport': 'Arrival_Airport_Code',
    'Arrival Airport Name': 'Arrival_Airport_Name',
    'Arrival Country': 'Arrival_Country_Code',
    'Arrival Country Name': 'Arrival_Country_Name',
    'Arrival Region': 'Arrival_Region',
    'Aircraft type': 'Aircraft_Type',
    '# Passengers': 'Passengers',
    '# Flights': 'Flights'
})

In [116]:
# Derive calendar 'Year' and 'Quarter' columns from the 'Date' column
date_parts = aig_df['Date'].dt
aig_df['Year'] = date_parts.year
aig_df['Quarter'] = date_parts.quarter

### **Data Integrity Check (Transformation Phase)**

Prior to database insertion, rows that are duplicated across all columns are removed as an intermediate step, so that no duplicate entries are loaded and data integrity is maintained.

In [117]:
# Keep only the first occurrence of each row that is identical across all columns
is_repeat = aig_df.duplicated()
aig_df = aig_df[~is_repeat]

**Check processed dataframe**

In [118]:
# Preview the first records of the processed dataframe (head() with its default row count)
aig_df.head()

Unnamed: 0,OpCo_Code,OpCo_Name,Subsidiary_Code,Subsidiary_Name,Departure_Airport_Code,Departure_Airport_Name,Departure_Country_Code,Departure_Country_Name,Departure_Region,Arrival_Airport_Code,...,Arrival_Country_Name,Arrival_Region,Aircraft_Type,Date,Cabin,Service,Passengers,Flights,Year,Quarter
0,VY,Vueling,NS,No subsidiary,BIO,BILBAO,ES,Spain,Europe/Domestic,BCN,...,Spain,Europe/Domestic,A320,2019-07-02,Economy,Non-Premium,220,9,2019,3
1,VY,Vueling,NS,No subsidiary,BCN,BARCELONA,ES,Spain,Europe/Domestic,BIO,...,Spain,Europe/Domestic,A320,2019-07-02,Economy,Non-Premium,503,6,2019,3
2,VY,Vueling,NS,No subsidiary,BIO,BILBAO,ES,Spain,Europe/Domestic,PMI,...,Spain,Europe/Domestic,A320,2019-07-02,Economy,Non-Premium,188,11,2019,3
3,VY,Vueling,NS,No subsidiary,PMI,PALMA,ES,Spain,Europe/Domestic,BIO,...,Spain,Europe/Domestic,A320,2019-07-02,Economy,Non-Premium,405,8,2019,3
4,VY,Vueling,NS,No subsidiary,BIO,BILBAO,ES,Spain,Europe/Domestic,LIS,...,Portugal,Europe,A320,2019-07-02,Economy,Non-Premium,152,20,2019,3


In [119]:
# Print the processed dataframe's summary: index range, columns, non-null counts and dtypes
aig_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2257 entries, 0 to 2256
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   OpCo_Code               2257 non-null   object        
 1   OpCo_Name               2257 non-null   object        
 2   Subsidiary_Code         2257 non-null   object        
 3   Subsidiary_Name         2257 non-null   object        
 4   Departure_Airport_Code  2257 non-null   object        
 5   Departure_Airport_Name  2257 non-null   object        
 6   Departure_Country_Code  2257 non-null   object        
 7   Departure_Country_Name  2257 non-null   object        
 8   Departure_Region        2257 non-null   object        
 9   Arrival_Airport_Code    2257 non-null   object        
 10  Arrival_Airport_Name    2257 non-null   object        
 11  Arrival_Country_Code    2257 non-null   object        
 12  Arrival_Country_Name    2257 non-null   object       

## Saving Processed Dataframe

---

In [120]:
# Write the processed DataFrame to CSV, leaving the index out of the file
processed_csv_path = '../data/processed/aig_data_processed.csv'
aig_df.to_csv(processed_csv_path, index=False)