# 02807 Computational Tools for Data Science Group Project

## Importing the data

In [None]:
import pandas as pd
import glob
import os

In [None]:
# Collect the paths of all CSV files in the 'data' directory.
# glob returns matches in arbitrary (OS-dependent) order, so the paths are
# sorted to keep the row order of the combined data reproducible across runs.
csv_files = sorted(glob.glob(os.path.join("data", "*.csv")))

In [None]:
# Load each CSV file into its own pandas DataFrame (one DataFrame per file)
df_all = list(map(pd.read_csv, csv_files))

In [None]:
# Stack the per-file DataFrames row-wise into one DataFrame, renumbering the rows from 0
df = pd.concat(objs=df_all, axis=0, ignore_index=True)

## Cleaning the data

In [None]:
# Inspect data: preview the first rows (head() shows five by default)
df.head()

In [None]:
# Per-column overview: data type, number of missing values and share of missing values (in %)
pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': df.isna().sum(),
    'Percentage Missing Values': df.isna().sum().div(len(df)).mul(100).round(2),
})

### Missing values

In [None]:
# Remove the 'tail_number' column (many missing values and not relevant for the analysis)
df = df.drop(columns=['tail_number'])

In [None]:
# Drop rows with a missing value in any column except the actual departure/arrival
# timestamps, which are allowed to be empty (cancelled flights)
df = df.dropna(
    axis=0,
    how='any',
    subset=df.columns.difference(['actual_departure_dt', 'actual_arrival_dt']),
)

### Data types

In [None]:
# Parse the date/time columns into datetime values; entries that cannot be parsed become NaT
date_columns = [
    'date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt',
    'actual_departure_dt',
    'actual_arrival_dt',
]

df[date_columns] = df[date_columns].apply(pd.to_datetime, errors='coerce')

In [None]:
# Re-check the per-column data types and missing values after the datetime conversion
pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': df.isna().sum(),
    'Percentage Missing Values': df.isna().sum().div(len(df)).mul(100).round(2),
})

### Feature engineering

In [None]:
import numpy as np

In [None]:
# Function to categorise delay times
def categorise_delay(delay):
    """Map a delay to an ordinal delay category.

    Args:
        delay: Delay as a number (the thresholds below are 0, 30 and 90, in
            the same unit as the input). May be missing (NaN/None).

    Returns:
        0 if ``delay <= 0`` (on time or early),
        1 if ``0 < delay <= 30``,
        2 if ``30 < delay <= 90``,
        3 if ``delay > 90``,
        NaN if the delay is missing.
    """
    # A missing delay fails every comparison below and would otherwise fall
    # through to the highest category; keep it missing instead.
    if pd.isna(delay):
        return float('nan')
    if delay <= 0:
        return 0
    if delay <= 30:
        return 1
    if delay <= 90:
        return 2
    return 3

In [None]:
# Derive a '<delay column>_category' feature for both the departure and the arrival delay
for delay_col in ('departure_delay', 'arrival_delay'):
    df[f'{delay_col}_category'] = df[delay_col].apply(categorise_delay)

In [None]:
# Inspect updated data: preview the first rows, including the new delay category columns
df.head()

In [None]:

# Function to categorise weather conditions relative to historical weather conditions AT LOCATION
def categorise_weather(df, weather_col, station_col):
    """Add a ``<weather_col>_category`` column scoring each reading against its station's norm.

    For every station in ``station_col`` the mean and sample standard
    deviation of ``weather_col`` are computed, and each row is assigned:

    * ``-2`` -- more than one standard deviation below the station mean
    * ``-1`` -- below the station mean, but within one standard deviation
    * ``1``  -- at or above the station mean, but within one standard deviation
    * ``2``  -- one standard deviation or more above the station mean
    * ``0``  -- not comparable: the reading or the station is missing, or the
      station's standard deviation is undefined (e.g. a single observation)

    Args:
        df: DataFrame containing ``weather_col`` and ``station_col``.
        weather_col: Name of the numeric weather column to categorise.
        station_col: Name of the column identifying the weather station.

    Returns:
        A new DataFrame with the original columns plus the integer column
        ``<weather_col>_category``. The input DataFrame is not modified; the
        result carries a fresh 0..n-1 row index because of the merge.
    """
    # Private helper column names so that existing 'mean'/'std' data columns
    # can neither be shadowed nor dropped by accident.
    mean_col, std_col = '_station_mean', '_station_std'

    station_stats = (
        df.groupby(station_col)[weather_col]
        .agg(['mean', 'std'])
        .rename(columns={'mean': mean_col, 'std': std_col})
        .reset_index()
    )

    df = df.merge(station_stats, on=station_col, how='left')

    value, mean, std = df[weather_col], df[mean_col], df[std_col]

    # np.select picks the first matching condition, so the order matters.
    conditions = [
        value < mean - std,   # Much lower
        value < mean,         # Lower
        value < mean + std,   # Higher
        value >= mean + std,  # Much higher
    ]
    categories = [-2, -1, 1, 2]

    # The default must be numeric like the categories: mixing in a string
    # default either fails dtype promotion or turns every category into text.
    # It is only reached when a comparison involves NaN.
    df[f'{weather_col}_category'] = np.select(conditions, categories, default=0)

    return df.drop(columns=[mean_col, std_col])

In [None]:
# Add categorised weather condition features to data.
# Each weather column is categorised against the statistics of its OWN station only.
# Running every weather column against both station columns would let the second
# pass overwrite the first, leaving the '_x' readings grouped by STATION_y.
# NOTE(review): assumes the '_x' / '_y' suffixes pair each weather reading with
# the station column carrying the same suffix -- confirm against the merge that
# produced these columns.
station_columns = ['STATION_x', 'STATION_y']
weather_columns = ['HourlyDryBulbTemperature_x', 'HourlyPrecipitation_x', 'HourlyStationPressure_x', 'HourlyVisibility_x', 'HourlyWindSpeed_x', 'HourlyDryBulbTemperature_y', 'HourlyPrecipitation_y', 'HourlyStationPressure_y', 'HourlyVisibility_y', 'HourlyWindSpeed_y']

for station_col in station_columns:
    suffix = station_col[-2:]  # '_x' or '_y'
    for weather_col in weather_columns:
        if weather_col.endswith(suffix):
            df = categorise_weather(df, weather_col, station_col)

In [None]:
# Inspect updated data: preview the first rows, including the new weather category columns
df.head()