In [None]:
import os
import zipfile
import pandas as pd
from datetime import datetime

In [None]:
def merge_csv_from_zips(dataset_folder, output_folder, output_file):
    """
    Merges CSV files from multiple ZIP archives into a single CSV file.

    Every ``.zip`` archive directly inside ``dataset_folder`` is opened and each
    ``.csv`` member is read straight from the archive (nothing is extracted to
    disk). The resulting frames are concatenated row-wise and written to
    ``output_folder/output_file``.

    Parameters:
    - dataset_folder: Path where ZIP files are stored.
    - output_folder: Path where the merged CSV file will be saved
      (created if it does not exist).
    - output_file: Name of the merged CSV file.

    Returns:
    - The merged DataFrame, or None when no CSV member was found in any archive
      (in which case nothing is written).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    df_list = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the merged file reproducible between runs and machines.
    for filename in sorted(os.listdir(dataset_folder)):
        if not filename.lower().endswith('.zip'):
            continue

        zip_path = os.path.join(dataset_folder, filename)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            csv_members = [
                member for member in zip_ref.namelist()
                if member.lower().endswith('.csv')
            ]

            # Report a missing CSV once per archive, and only when the archive
            # really has none (not once per non-CSV member).
            if not csv_members:
                print(f"No CSV file found in {filename}")
                continue

            for member in csv_members:
                with zip_ref.open(member) as csv_file:
                    # low_memory=False infers each column's dtype from the whole
                    # column instead of chunk by chunk, which avoids pandas'
                    # mixed-type DtypeWarning on large files.
                    df_list.append(pd.read_csv(csv_file, low_memory=False))
                print(f"CSV file {member} from {filename} added to the DataFrame list.")

    if not df_list:
        print("No CSV files were found to merge.")
        return None

    merged_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged DataFrame shape: {merged_df.shape}")

    output_path = os.path.join(output_folder, output_file)
    merged_df.to_csv(output_path, index=False)

    print(f"All CSV files have been merged and saved to {output_path}")
    return merged_df  # Returned so callers can continue processing in memory



In [None]:
def clean_flight_data(df, delay_threshold=15, states=None):
    """
    Filter flight records to a set of origin states and add a delay indicator.

    The input frame is not modified; a new DataFrame is returned.

    Parameters:
    - df: DataFrame with at least the columns 'DEP_DELAY' and 'ORIGIN_STATE_NM'.
    - delay_threshold: Departure delay in minutes above which a flight is
      flagged (strictly greater than). Defaults to 15.
    - states: Iterable of 'ORIGIN_STATE_NM' values to keep. Defaults to the
      ten states listed below.

    Returns:
    - DataFrame restricted to the selected origin states, with an added
      integer column 'WEATHER_DELAY_IND' (1 when DEP_DELAY > delay_threshold,
      else 0; missing delays yield 0) and exact duplicate rows removed.

    NOTE(review): despite its name, 'WEATHER_DELAY_IND' is derived from the
    overall departure delay, not from a weather-specific delay column —
    confirm this is the intended target.
    """
    if states is None:
        states = [
            'New York', 'Colorado', 'California', 'Florida', 'Texas',
            'Illinois', 'Georgia', 'New Jersey', 'Maryland', 'Nevada',
        ]

    selected = df[df['ORIGIN_STATE_NM'].isin(list(states))]

    # assign() returns a new frame, so neither the caller's frame nor the
    # filtered slice is written to in place. NaN > threshold is False, which
    # matches the row-wise "1 if x > threshold else 0" rule.
    selected = selected.assign(
        WEATHER_DELAY_IND=(selected['DEP_DELAY'] > delay_threshold).astype('int64')
    )

    # No columns are dropped here: outcome columns such as DEP_DELAY remain in
    # the returned frame and must be excluded later if it is used for modelling.
    return selected.drop_duplicates()

In [None]:
def convert_hhmm_to_time(hhmm):
    """Converts an HHMM-encoded clock reading to a ``datetime.time`` object.

    Parameters:
    - hhmm: Integer-like value (int, float, or numeric string) whose last two
      digits are minutes and whose leading digits are hours, e.g. 930 -> 09:30.

    Returns:
    - ``datetime.time`` for readings from 0000 to 2359, or None when the value
      is missing (null or empty string) or out of range (negative, hour >= 24,
      or minute >= 60 — so 2400 yields None).
    """
    if pd.isnull(hhmm) or hhmm == '':
        return None

    hours, minutes = divmod(int(hhmm), 100)

    # divmod with a positive divisor keeps minutes in 0..99 and pushes the sign
    # into hours, so a negative reading is rejected here instead of reaching
    # the time constructor and raising.
    if not (0 <= hours < 24 and minutes < 60):
        return None

    # Only the datetime class is imported in this file; build a datetime on an
    # arbitrary date and keep just its time-of-day part.
    return datetime(1900, 1, 1, hours, minutes).time()

def get_departure_datetime(row):
    """Build the departure timestamp for a single flight record.

    Parameters:
    - row: Mapping-like record (e.g. a DataFrame row) with an 'FL_DATE' entry
      and, optionally, 'CRS_DEP_TIME' and 'DEP_TIME' entries in HHMM form.

    Returns:
    - A one-entry ``pd.Series`` keyed 'departure_datetime'. The value is the
      flight date combined with the departure time, or ``pd.NaT`` when the
      date is missing or no valid departure time is available.

    Raises:
    - KeyError if the record has no 'FL_DATE' entry.
    """
    flight_timestamp = pd.to_datetime(row['FL_DATE'])

    # Prefer the scheduled departure time; fall back to the actual departure
    # time only when the scheduled one is missing.
    dep_hhmm = row.get('CRS_DEP_TIME')
    if pd.isnull(dep_hhmm) or dep_hhmm == '':
        dep_hhmm = row.get('DEP_TIME')

    dep_time = convert_hhmm_to_time(dep_hhmm)

    # convert_hhmm_to_time returns None for missing or invalid readings;
    # datetime.combine would raise on None (or on a missing date), so emit NaT.
    if dep_time is None or pd.isnull(flight_timestamp):
        departure_datetime = pd.NaT
    else:
        departure_datetime = datetime.combine(flight_timestamp.date(), dep_time)

    return pd.Series({'departure_datetime': departure_datetime})

def convert_time_to_minutes(time):
    """Convert an HHMM-encoded clock reading to minutes since midnight.

    Parameters:
    - time: Integer-like value (int, float, or numeric string) in HHMM form,
      e.g. 930 -> 570. Values shorter than four digits are left-padded with
      zeros, so 5 is read as 00:05.

    Returns:
    - ``hours * 60 + minutes`` as an int, or NaN when the value is missing
      (null or empty string). The range of hours and minutes is not validated.
    """
    # A missing reading cannot be passed to int(); return NaN so a column built
    # with Series.apply stays numeric instead of raising.
    if pd.isnull(time) or time == '':
        return float('nan')

    padded = str(int(time)).rjust(4, '0')
    hours, minutes = int(padded[:2]), int(padded[2:])
    return hours * 60 + minutes


def add_departure_datetime_column(df):
    """Add departure-time features to a flight DataFrame.

    The input frame is not modified; a new DataFrame is returned.

    Parameters:
    - df: DataFrame with the columns 'FL_DATE' and 'CRS_DEP_TIME' (and,
      optionally, 'DEP_TIME', which get_departure_datetime uses as a fallback).

    Returns:
    - Copy of ``df`` with exact duplicate rows removed and these columns added:
      * 'departure_datetime': timestamp built per row by get_departure_datetime
      * 'departure_resample15': 'departure_datetime' rounded to 15 minutes
      * 'CRS_DEP_HOUR': hour component of 'departure_datetime'
      * 'CRS_DEP_TIME_MINUTES': 'CRS_DEP_TIME' converted by
        convert_time_to_minutes
    """
    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()

    if result.empty:
        # Row-wise apply on an empty frame does not produce a
        # 'departure_datetime' column, so build an empty datetime column.
        departure = pd.Series(pd.NaT, index=result.index, dtype='datetime64[ns]')
    else:
        # get_departure_datetime returns a one-entry Series per row, so apply
        # yields a one-column DataFrame; select that column explicitly rather
        # than assigning a whole DataFrame to a single column.
        departure = result.apply(get_departure_datetime, axis=1)['departure_datetime']

    result['departure_datetime'] = pd.to_datetime(departure)
    result['departure_resample15'] = result['departure_datetime'].dt.round('15min')

    # Time features derived from the departure timestamp
    result['CRS_DEP_HOUR'] = result['departure_datetime'].dt.hour

    result['CRS_DEP_TIME_MINUTES'] = result['CRS_DEP_TIME'].apply(convert_time_to_minutes)

    return result.drop_duplicates()


In [None]:
def main():
    """
    Run the full pipeline: merge the CSV files found in the ZIP archives,
    clean the merged data, add the departure-datetime features, and save the
    result as a second CSV file in the output folder.

    Returns None. If no CSV data could be merged, the cleaning and export
    steps are skipped.
    """
    # Define the dataset folder containing the ZIP files
    dataset_folder = '/content/Dataset'  # Change this path to your dataset folder

    # Define the output folder and file name for the merged CSV
    output_folder = 'output'             # Output folder where merged CSV will be stored
    output_file = 'merged_Jan_2024.csv'  # Output CSV file name

    # Step 1: Merge CSV files from ZIP archives
    merged_df = merge_csv_from_zips(dataset_folder, output_folder, output_file)

    # merge_csv_from_zips returns None when nothing was merged; stop here
    # rather than passing None into the cleaning step.
    if merged_df is None:
        print("Skipping cleaning and export because no data was merged.")
        return

    # Step 2: Clean the data and add the departure datetime features
    merged_df = clean_flight_data(merged_df)
    merged_df = add_departure_datetime_column(merged_df)

    # Step 3: Save the final DataFrame with the new departure_datetime column
    final_output_file = 'merged_Jan_2024_with_departure_datetime.csv'
    final_output_path = os.path.join(output_folder, final_output_file)
    print('Shape of Final merged file : ', merged_df.shape)
    merged_df.to_csv(final_output_path, index=False)

    print(f"Final CSV with departure datetime saved to {final_output_path}")

In [None]:
# Entry point: run the whole pipeline by calling main(). The guard is true
# when this code runs as the top-level module (including when the cell is
# executed in a notebook) and false when the file is imported.
if __name__ == "__main__":
    main()

CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (5).zip added to the DataFrame list.


  df = pd.read_csv(csv_file)


CSV file T_ONTIME_REPORTING.csv from DL_SelectFields.zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (7).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (2).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (3).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (8).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (1).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (6).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (9).zip added to the DataFrame list.
CSV file T_ONTIME_REPORTING.csv from DL_SelectFields (4).zip added to the DataFrame list.
Merged DataFrame shape: (596071, 38)
All CSV files have been merged and saved to output/merged_Jan_2024.csv
Shape of Final merged file :  (317795, 43)
Final CSV with departure datetime saved to 