# Data Preparation of the Airline Crawling Results
This script loads the results of the flight data crawlers (Austrian Airlines, KLM, Lufthansa, and Qatar Airways), strips stray whitespace from the column names, aligns every dataset to the column layout of the Austrian Airlines file, and merges the data into a single DataFrame. It then parses the 'crawling_date' column into datetime values so that dates are easier to work with, sorts the merged data by 'crawling_date', and converts the airport names to their respective IATA codes. Finally, the merged data is saved to a new CSV file.

In [5]:
import pandas as pd

# Read the per-airline crawler exports into one DataFrame each.
# NOTE(review): the 'unicode_escape' codec does not decode the files as UTF-8,
# which would explain the garbled 'München' spellings that the airport mapping
# further down in this script has to handle -- confirm the real encoding of
# each file before changing it.
csv_options = {'encoding': 'unicode_escape'}
aa = pd.read_csv('data/results_AustrianAirlines.csv', **csv_options)
klm = pd.read_csv('data/results_KLM.csv', **csv_options)
lh = pd.read_csv('data/results_Lufthansa.csv', **csv_options)
qa = pd.read_csv('data/results_QatarAirways.csv', **csv_options)

# Function to clean column names
def clean_columns(df):
    """Strip leading and trailing whitespace from the column labels of *df*.

    The labels are rewritten on the DataFrame that was passed in (no copy is
    made), and that same DataFrame is returned so the call can be used in an
    assignment.
    """
    trimmed_labels = df.columns.str.strip()
    df.columns = trimmed_labels
    return df

# Strip whitespace from the headers of every airline frame in one pass;
# the names are rebound in the same order as the frames are listed.
aa, klm, lh, qa = (clean_columns(frame) for frame in (aa, klm, lh, qa))

# Remove the unnecessary 'Unnamed: 0' column from the reference frame *before*
# it is used as the column template. Dropping it afterwards, and only from one
# frame, is either a no-op (the selection below already discarded it) or leaves
# that frame with different columns than the others, so the later concat would
# fill the column with NaN. errors='ignore' makes this a no-op when the column
# is absent.
aa = aa.drop(columns=['Unnamed: 0'], errors='ignore')

# Ensure all DataFrames have the same columns, in the same order, as `aa`.
# Selecting by label also discards any extra columns (including a stray
# 'Unnamed: 0') and raises KeyError if an expected column is missing.
klm = klm[aa.columns]
lh = lh[aa.columns]
qa = qa[aa.columns]

# Concatenate DataFrames vertically, discarding the per-file row indices
crawler_data = pd.concat([aa, klm, lh, qa], ignore_index=True)

# Convert 'crawling_date' to datetime. No explicit format is given, so pandas
# infers it; dayfirst=True makes ambiguous values parse as day/month, and
# errors='coerce' turns anything unparseable into NaT instead of raising.
raw_dates = crawler_data['crawling_date'].copy()
crawler_data['crawling_date'] = pd.to_datetime(raw_dates, dayfirst=True, errors='coerce')

# Coercion is silent, so report values that were present but failed to parse;
# otherwise rows would quietly end up without a date in the merged file.
unparsed = raw_dates.notna() & crawler_data['crawling_date'].isna()
if unparsed.any():
    print("Warning:", int(unparsed.sum()),
          "'crawling_date' value(s) could not be parsed and were set to NaT")

# Sort by 'crawling_date' (NaT rows go last), then renumber the rows from 0
# and drop the old index
crawler_data = crawler_data.sort_values(by='crawling_date').reset_index(drop=True)

# Lookup table: airport/city name as it appears in the crawler output -> code.
# 'München' is listed three times because it shows up in differently garbled
# spellings in the loaded data.
airport_mapping = {
    'Frankfurt': 'FRA',
    'Berlin': 'BER',
    'Hamburg': 'HAM',
    'München': 'MUC',
    'MÃ¼nchen': 'MUC',
    'M\x9fnchen': 'MUC',
    'London': 'LHR',
    'Palma': 'PMI',
    'Palma de Mallorca': 'PMI',
    'Istanbul': 'SAW',
    'Dubai': 'DXB',
    'New York': 'JFK',
    'Shanghai': 'PVG',
}

# Replace full names with their codes in both airport columns; values that are
# not keys of the mapping are left unchanged.
for airport_column in ('departure_airport', 'destination_airport'):
    crawler_data[airport_column] = crawler_data[airport_column].replace(airport_mapping)

# Sanity check: list the distinct values left in each airport column so any
# name that was not converted to a code is easy to spot.
departure_codes = crawler_data['departure_airport'].unique()
destination_codes = crawler_data['destination_airport'].unique()
print("Unique departure_airport values:", departure_codes)
print("Unique destination_airport values:", destination_codes)

# Write the merged data to disk without the row index.
# NOTE(review): the file name is spelled 'cralwer'; it is kept as-is because
# other code may read this exact path -- confirm before renaming.
output_path = 'cralwer_data_merged.csv'
crawler_data.to_csv(output_path, index=False, encoding='utf-8')

Unique departure_airport values: ['FRA']
Unique destination_airport values: ['BER' 'LHR' 'HAM' 'PVG' 'JFK' 'DXB' 'SAW' 'PMI' 'MUC']
