In [1]:
import pandas as pd
import numpy as np

## Code to Read CSV Headers

In [2]:
file_path = "../data/itineraries.csv"

# Read only the first rows to get column names
try:
    df_preview = pd.read_csv(file_path, nrows=5)
    column_names = df_preview.columns.tolist()
    
    print("Columns in the dataset:")
    print(column_names)
    
except Exception as e:
    print(f"Error reading file: {e}")


Columns in the dataset:
['legId', 'searchDate', 'flightDate', 'startingAirport', 'destinationAirport', 'fareBasisCode', 'travelDuration', 'elapsedDays', 'isBasicEconomy', 'isRefundable', 'isNonStop', 'baseFare', 'totalFare', 'seatsRemaining', 'totalTravelDistance', 'segmentsDepartureTimeEpochSeconds', 'segmentsDepartureTimeRaw', 'segmentsArrivalTimeEpochSeconds', 'segmentsArrivalTimeRaw', 'segmentsArrivalAirportCode', 'segmentsDepartureAirportCode', 'segmentsAirlineName', 'segmentsAirlineCode', 'segmentsEquipmentDescription', 'segmentsDurationInSeconds', 'segmentsDistance', 'segmentsCabinCode']


## Data Dictionary

In [3]:
# Convert columns to appropriate data types
conversion_dict = {
    "searchDate": "datetime64[ns]",
    "flightDate": "datetime64[ns]",
    "segmentsDepartureTimeRaw": "datetime64[ns]",
    "segmentsArrivalTimeRaw": "datetime64[ns]",
    "elapsedDays": "Int64",
    "isBasicEconomy": "boolean",
    "isRefundable": "boolean",
    "isNonStop": "boolean",
    "baseFare": "float64",
    "totalFare": "float64",
    "seatsRemaining": "Int64",
    "totalTravelDistance": "float64",
    "segmentsDepartureTimeEpochSeconds": "Int64",
    "segmentsArrivalTimeEpochSeconds": "Int64",
    "segmentsDurationInSeconds": "Int64",
    "segmentsDistance": "float64"
}

# Convert categorical columns to category type
categorical_columns = [
    "startingAirport", "destinationAirport", "fareBasisCode", 
    "segmentsArrivalAirportCode", "segmentsDepartureAirportCode", 
    "segmentsAirlineName", "segmentsAirlineCode", "segmentsEquipmentDescription",
    "segmentsCabinCode"
]

# Apply type conversions
for col, dtype in conversion_dict.items():
    if col in df_preview.columns:
        try:
            df_preview[col] = pd.to_datetime(df_preview[col]) if "datetime" in dtype else df_preview[col].astype(dtype)
        except Exception as e:
            print(f"Warning: Could not convert column '{col}' to {dtype}. Error: {e}")

# Convert categorical columns
for col in categorical_columns:
    if col in df_preview.columns:
        df_preview[col] = df_preview[col].astype("category")

# Function to get example values (first non-null value)
def get_example_value(df, column_name):
    return df[column_name].dropna().iloc[0] if column_name in df.columns else "N/A"

# Descriptions from Kaggle
column_descriptions = {
    "legId": "An identifier for the flight.",
    "searchDate": "Date when this entry was recorded from Expedia.",
    "flightDate": "Date of the flight.",
    "startingAirport": "Three-character IATA code for the departure airport.",
    "destinationAirport": "Three-character IATA code for the arrival airport.",
    "fareBasisCode": "The fare basis code.",
    "travelDuration": "Total travel duration in hours and minutes.",
    "elapsedDays": "Number of elapsed days (usually 0).",
    "isBasicEconomy": "Indicates whether the ticket is for basic economy.",
    "isRefundable": "Indicates whether the ticket is refundable.",
    "isNonStop": "Indicates whether the flight is non-stop.",
    "baseFare": "Base price of the ticket (in USD).",
    "totalFare": "Total price of the ticket including taxes and fees.",
    "seatsRemaining": "Number of seats remaining.",
    "totalTravelDistance": "Total travel distance. This data is sometimes missing.",
    "segmentsDepartureTimeEpochSeconds": "Unix time for departure of each segment. Entries are separated by '||'.",
    "segmentsDepartureTimeRaw": "ISO 8601 formatted departure time for each segment. Entries are separated by '||'.",
    "segmentsArrivalTimeEpochSeconds": "Unix time for arrival of each segment. Entries are separated by '||'.",
    "segmentsArrivalTimeRaw": "ISO 8601 formatted arrival time for each segment. Entries are separated by '||'.",
    "segmentsArrivalAirportCode": "IATA code for arrival airport of each segment. Entries are separated by '||'.",
    "segmentsDepartureAirportCode": "IATA code for departure airport of each segment. Entries are separated by '||'.",
    "segmentsAirlineName": "Name of the airline for each segment. Entries are separated by '||'.",
    "segmentsAirlineCode": "Two-letter airline code for each segment. Entries are separated by '||'.",
    "segmentsEquipmentDescription": "Type of airplane used for each segment. Entries are separated by '||'.",
    "segmentsDurationInSeconds": "Duration of the flight (in seconds) for each segment. Entries are separated by '||'.",
    "segmentsDistance": "Distance traveled (in miles) for each segment. Entries are separated by '||'.",
    "segmentsCabinCode": "Cabin code for each segment (e.g., coach). Entries are separated by '||'."
}

# Create the Data Dictionary dynamically
data_dict = [
    {
        "Column Name": col,
        "Data Type": str(df_preview[col].dtype),  # Now retrieves ACTUAL type from DataFrame
        "Description": column_descriptions.get(col, "N/A"),  # Use predefined description
        "Example Value": get_example_value(df_preview, col)
    }
    for col in df_preview.columns
]

# Convert to DataFrame
df_dict = pd.DataFrame(data_dict)

# Display the Data Dictionary
display(df_dict)

Unnamed: 0,Column Name,Data Type,Description,Example Value
0,legId,object,An identifier for the flight.,9ca0e81111c683bec1012473feefd28f
1,searchDate,datetime64[ns],Date when this entry was recorded from Expedia.,2022-04-16 00:00:00
2,flightDate,datetime64[ns],Date of the flight.,2022-04-17 00:00:00
3,startingAirport,category,Three-character IATA code for the departure ai...,ATL
4,destinationAirport,category,Three-character IATA code for the arrival airp...,BOS
5,fareBasisCode,category,The fare basis code.,LA0NX0MC
6,travelDuration,object,Total travel duration in hours and minutes.,PT2H29M
7,elapsedDays,Int64,Number of elapsed days (usually 0).,0
8,isBasicEconomy,boolean,Indicates whether the ticket is for basic econ...,False
9,isRefundable,boolean,Indicates whether the ticket is refundable.,False


## Data Ingestion

In [None]:
# Define column data types
data_types = {
    "legId": "string",
    "searchDate": "datetime64[ns]",
    "flightDate": "datetime64[ns]",
    "startingAirport": "category",
    "destinationAirport": "category",
    "fareBasisCode": "category",
    "travelDuration": "string",
    "elapsedDays": "Int64",
    "isBasicEconomy": "boolean",
    "isRefundable": "boolean",
    "isNonStop": "boolean",
    "baseFare": "float64",
    "totalFare": "float64",
    "seatsRemaining": "Int64",
    "totalTravelDistance": "float64",
    "segmentsDepartureTimeRaw": "datetime64[ns, UTC]",
    "segmentsArrivalTimeRaw": "datetime64[ns, UTC]",
    "segmentsArrivalAirportCode": "category",
    "segmentsDepartureAirportCode": "category",
    "segmentsAirlineName": "category",
    "segmentsAirlineCode": "category",
    "segmentsEquipmentDescription": "category",
    "segmentsCabinCode": "category"
}

# Columns that contain multi-value data (need splitting)
multi_value_columns = [
    "segmentsDepartureTimeEpochSeconds",
    "segmentsArrivalTimeEpochSeconds",
    "segmentsDurationInSeconds",
    "segmentsDistance"
]

# Initialize storage for statistics
total_rows = 0
missing_values = {}
duplicate_rows = 0

# Read the CSV file in chunks
chunksize = 500000
chunk_count = 0

print("Starting Data Ingestion...")

for chunk in pd.read_csv(file_path, chunksize=chunksize, dtype=str):  # Read everything as string first
    chunk_count += 1
    print(f"Processing Chunk {chunk_count}")

    # Convert datetime columns with explicit format handling
    datetime_columns = ["searchDate", "flightDate", "segmentsDepartureTimeRaw", "segmentsArrivalTimeRaw"]
    datetime_format = "%Y-%m-%d"  # Standard YYYY-MM-DD format
    
    for col in datetime_columns:
        if col in chunk.columns:
            try:
                chunk[col] = chunk[col].replace("", np.nan)  # Replace empty strings with NaN
                chunk[col] = pd.to_datetime(chunk[col], format=datetime_format, errors="coerce", utc=True)
            except Exception as e:
                print(f"Warning: Could not convert column '{col}' to datetime. Error: {e}")

    # Convert other data types
    for col, dtype in data_types.items():
        if col in chunk.columns and col not in datetime_columns:  # Skip datetime since it's already processed
            try:
                if dtype == "boolean":
                    chunk[col] = chunk[col].map({"True": True, "False": False})
                else:
                    chunk[col] = chunk[col].astype(dtype)
            except Exception as e:
                print(f"Warning: Could not convert column '{col}' to {dtype}. Error: {e}")

    # Process multi-value columns
    for col in multi_value_columns:
        if col in chunk.columns:
            try:
                # Replace empty strings with NaN before conversion
                chunk[col] = chunk[col].replace("", np.nan)

                # Extract first value only (ignoring multi-leg data for now)
                chunk[col] = chunk[col].str.split("||").str[0]

                # Convert to float, safely handling NaNs
                chunk[col] = pd.to_numeric(chunk[col], errors="coerce")
            except Exception as e:
                print(f"Warning: Could not process multi-value column '{col}'. Error: {e}")

    # Count missing values
    missing_chunk = chunk.isnull().sum()
    for col, missing_count in missing_chunk.items():
        if missing_count > 0:
            missing_values[col] = missing_values.get(col, 0) + missing_count

    # Count duplicate rows
    duplicate_rows += chunk.duplicated().sum()

    # Update total rows count
    total_rows += len(chunk)

print("\nData Ingestion Completed!")
print(f"Total Rows Processed: {total_rows}")
print(f"Duplicate Rows Found: {duplicate_rows}")
print("Missing Values Summary:")
for col, count in missing_values.items():
    print(f"   - {col}: {count} missing values")


Total Rows Processed: 82138753
Duplicate Rows Found: 0
Missing Values Summary:
   - totalTravelDistance: 6094532 missing values
   - segmentsDepartureTimeEpochSeconds: 82138753 missing values
   - segmentsDepartureTimeRaw: 82138753 missing values
   - segmentsArrivalTimeEpochSeconds: 82138753 missing values
   - segmentsArrivalTimeRaw: 82138753 missing values
   - segmentsEquipmentDescription: 1557592 missing values
   - segmentsDurationInSeconds: 82138753 missing values
   - segmentsDistance: 82138753 missing values

Observations from Data Ingestion Output
Total Rows Processed: 82,138,753
Duplicate Rows: 0 (No need for deduplication)
Key Missing Values:
segmentsDepartureTimeEpochSeconds, segmentsDepartureTimeRaw, segmentsArrivalTimeEpochSeconds, segmentsArrivalTimeRaw, segmentsDurationInSeconds, segmentsDistance → 100% missing
totalTravelDistance: 6,094,532 missing values (~7.4% of total rows)
segmentsEquipmentDescription: 1,557,592 missing values (~1.9% of total rows)

## Handling Missing Values

| Column Name | Missing % | Proposed Action |
|-------------|------------|----------------|
| `totalTravelDistance` | 7.4% | Fill with **mean travel distance** per route (group by `startingAirport` & `destinationAirport`). |
| `segmentsDepartureTimeEpochSeconds`<br>`segmentsDepartureTimeRaw`<br>`segmentsArrivalTimeEpochSeconds`<br>`segmentsArrivalTimeRaw` | 100% | Drop from dataset (completely missing). |
| `segmentsEquipmentDescription` | 1.9% | Fill with `"Unknown"` for missing values. |
| `segmentsDurationInSeconds` | 100% | Drop from dataset (completely missing). |
| `segmentsDistance` | 100% | Drop from dataset (completely missing). |


In [2]:
import pandas as pd

# Define file paths
input_file = "../data/itineraries.csv"
output_file = "../data/itineraries_cleaned.csv"

# Define chunk size
chunksize = 500000  

# Columns to drop (completely missing)
columns_to_drop = [
    "segmentsDepartureTimeEpochSeconds", "segmentsDepartureTimeRaw",
    "segmentsArrivalTimeEpochSeconds", "segmentsArrivalTimeRaw",
    "segmentsDurationInSeconds", "segmentsDistance"
]

# Initialize empty dictionary to store mean totalTravelDistance per route
route_means = {}

print("Step 1: Processing large CSV file in chunks...")

# Process the file in chunks
for chunk_id, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize, dtype=str)):
    print(f"Processing Chunk {chunk_id + 1}")

    # Drop completely missing columns
    chunk.drop(columns=columns_to_drop, inplace=True)

    # Convert totalTravelDistance to float for calculations
    chunk["totalTravelDistance"] = pd.to_numeric(chunk["totalTravelDistance"], errors="coerce")

    # Update route means dictionary
    for (origin, destination), group in chunk.groupby(["startingAirport", "destinationAirport"]):
        mean_distance = group["totalTravelDistance"].mean()
        if not pd.isna(mean_distance):
            route_means[(origin, destination)] = mean_distance

    # Fill missing totalTravelDistance using precomputed means
    for (origin, destination), mean_distance in route_means.items():
        chunk.loc[
            (chunk["startingAirport"] == origin) & (chunk["destinationAirport"] == destination) & chunk["totalTravelDistance"].isnull(),
            "totalTravelDistance"
        ] = mean_distance

    # Fill missing segmentsEquipmentDescription with "Unknown"
    chunk["segmentsEquipmentDescription"].fillna("Unknown", inplace=True)

    # Append cleaned chunk to new file
    chunk.to_csv(output_file, mode="a", header=(chunk_id == 0), index=False)

    print(f"Chunk {chunk_id + 1} processed and saved.")

print("\nData Transformation Completed!")
print(f"Cleaned dataset saved as '{output_file}'.")


Step 1: Processing large CSV file in chunks...


FileNotFoundError: [Errno 2] No such file or directory: 'data/itineraries.csv'