## Phase 01 - Data Collection

In [17]:
# We'll use a separate folder for keys configuration
import os
from dotenv import load_dotenv 

# Pull the variables declared in the local .env file into the process environment
load_dotenv()

# Read both Amadeus credentials in one pass; either is None when its variable is unset
AMADEUS_CLIENT_ID, AMADEUS_CLIENT_SECRET = (
    os.getenv(variable_name)
    for variable_name in ('AMADEUS_CLIENT_ID', 'AMADEUS_CLIENT_SECRET')
)

# Confirmation message only -- the key values themselves are deliberately never printed
print("Reading Complete !!!")

Reading Complete !!!


In [18]:
# Fail fast when either credential is missing or empty
if AMADEUS_CLIENT_ID and AMADEUS_CLIENT_SECRET:
    print("Amadeus API credentials loaded successfully.")
else:
    raise ValueError("Amadeus API credentials not found.")

Amadeus API credentials loaded successfully.


In [19]:
# Initialize Amadeus Client
from amadeus import Client # Make sure Client is imported, usually done at the top

# Build the API client from the credentials read from the environment
amadeus = Client(client_secret=AMADEUS_CLIENT_SECRET, client_id=AMADEUS_CLIENT_ID)
print("Amadeus client initialized.")

Amadeus client initialized.


In [20]:
# Overview of the first-phase features, emitted one line per entry
for overview_line in (
    "System First Phase: ",
    "There are about 17 featuress: ",
    "Feautre 01: Flight Code",
    "Feature 02: Flight Route ",
    "Feature 03: Price ",
    "bucket fare pricing: ",
    "Available seats per bucker fare",
):
    print(overview_line)

System First Phase: 
There are about 17 featuress: 
Feautre 01: Flight Code
Feature 02: Flight Route 
Feature 03: Price 
bucket fare pricing: 
Available seats per bucker fare


In [21]:
# we'll focus on the guaranteed data we can get from Amadeus API
#
# --- Configuration for Data Collection ---
import datetime
import json
import time
from datetime import date

import pandas as pd
from amadeus import ResponseError

# IATA carrier codes of the airlines being tracked
TRACKED_AIRLINES = ['AA', 'DL', 'UA']

# Origin/destination airport pairs to search; append more (origin, destination) tuples as needed
TRACKED_ROUTES = [
    {'origin': origin_code, 'destination': destination_code}
    for origin_code, destination_code in (
        ('BOS', 'LAX'),
        ('JFK', 'SFO'),
    )
]

# Two candidate departure dates, roughly three months apart (both to be modified later)
DEPARTURE_DATE = date(year=2025, month=9, day=15)
DEPARTURE_DATE_02 = date(year=2025, month=12, day=16)


In [22]:
def process_amadeus_response(response_data, current_collection_timestamp, observation_id_start,
                             amadeus_dictionaries=None):
    """
    Parse an Amadeus Flight Offers Search response into dataset rows.

    Only the first itinerary of each offer is processed. Offers without
    itineraries/segments, or without a departure time, are skipped. An offer
    that raises while being processed is reported and skipped so that one bad
    record does not abort the whole batch.

    Args:
        response_data (list): The 'data' field from the Amadeus Flight Offers Search API
                              response, i.e. a list of flight-offer dictionaries.
        current_collection_timestamp (datetime): Timestamp at which this data was collected.
                              May be naive or timezone-aware.
        observation_id_start (int): The starting ID for observations in this batch.
        amadeus_dictionaries (dict, optional): The 'dictionaries' section of the API
                              response. Accepted so callers may pass it; it is not
                              used by the current implementation.

    Returns:
        tuple: A tuple containing:
            - list: A list of dictionaries, where each dictionary is a row for the DataFrame.
            - int: The next available observation ID.
    """
    # Import the class under a private alias: at notebook level the name
    # `datetime` is bound to the *module* (`import datetime`), so calling
    # `datetime.fromisoformat(...)` raised AttributeError for every offer.
    from datetime import datetime as _datetime, timezone as _timezone

    def _naive_utc(moment):
        """Return `moment` as a naive datetime; aware values are converted to UTC first."""
        if moment.tzinfo is None:
            return moment
        return moment.astimezone(_timezone.utc).replace(tzinfo=None)

    observations = []
    current_obs_id = observation_id_start

    # Basic check: if no data, return empty list and current ID
    if not response_data:
        return [], current_obs_id

    # Iterate through each 'offer' returned by Amadeus
    for offer in response_data:
        # For simplicity only the first itinerary of each offer is processed.
        # Itineraries contain 'segments' (individual flights that make up the journey).
        if not offer.get('itineraries') or not offer['itineraries'][0].get('segments'):
            continue  # Skip invalid offers that don't have itineraries or segments

        itinerary = offer['itineraries'][0]
        segments = itinerary['segments']
        first_segment = segments[0]    # The first flight in the itinerary
        last_segment = segments[-1]    # The last flight in the itinerary (gives the destination)

        try:
            # --- Fields read directly from the offer ---

            # 1. Airline_Code: carrierCode of the first segment
            carrier_code = first_segment.get('carrierCode', 'UNK')

            # 2. Flight_Number: number of the first segment
            flight_number_str = first_segment.get('number', 'UNKNOWN')

            # 3. Origin_Airport_Code: departure airport of the first segment
            origin_iata = first_segment.get('departure', {}).get('iataCode', 'UNK')

            # 4. Destination_Airport_Code: arrival airport of the *last* segment
            destination_iata = last_segment.get('arrival', {}).get('iataCode', 'UNK')

            # 5. Departure_DateTime: departure time of the first segment (ISO 8601 string)
            departure_datetime_str = first_segment.get('departure', {}).get('at')
            if not departure_datetime_str:  # Essential check for a critical field
                print(f"Warning: Skipping offer due to missing departure_datetime: {offer}")
                continue
            # A trailing 'Z' is rewritten as an explicit UTC offset, which
            # fromisoformat accepts on every supported Python version.
            departure_datetime_obj = _datetime.fromisoformat(
                departure_datetime_str.replace('Z', '+00:00')
            )

            # 6. Number_of_Stops: a non-stop itinerary has 1 segment, so stops = 1 - 1 = 0.
            number_of_stops = len(segments) - 1

            # 7. PE_Current: total price of the offer (price.grandTotal), 0.0 when absent
            pe_current = float(offer.get('price', {}).get('grandTotal', 0.0))

            # Flight_Unique_ID: identifies one scheduled flight across collection
            # timestamps (carrier + number, route, departure time to the minute).
            flight_unique_id = (
                f"{carrier_code}{flight_number_str}_"
                f"{origin_iata}{destination_iata}_"
                f"{departure_datetime_obj.strftime('%Y%m%d%H%M')}"
            )

            # Aircraft_Type_Code: aircraft.code of the first segment, when present
            aircraft_type_code = first_segment.get('aircraft', {}).get('code', 'UNKNOWN')

            # Subtracting an aware datetime from a naive one raises TypeError,
            # so both sides are normalised before taking the difference. A naive
            # value is used as-is (no offset information is available for it).
            hours_to_departure = (
                _naive_utc(departure_datetime_obj) - _naive_utc(current_collection_timestamp)
            ).total_seconds() / 3600

            # --- Construct Data Point Dictionary (one dataset row) ---
            # Every column is present on every row, even when it is only a placeholder.
            data_point = {
                'Observation_ID': current_obs_id,
                'Flight_Unique_ID': flight_unique_id,
                'Collection_Timestamp': current_collection_timestamp.isoformat(),
                'Departure_DateTime': departure_datetime_obj.isoformat(),
                'Airline_Code': carrier_code,
                'Route': f"{origin_iata}-{destination_iata}",
                'Origin_Airport_Code': origin_iata,
                'Destination_Airport_Code': destination_iata,
                'Flight_Number': flight_number_str,
                'Aircraft_Type_Code': aircraft_type_code,
                'Number_of_Stops': number_of_stops,

                'PE_Current': pe_current,
                'PF_Current': None,  # Placeholder: requires a separate search
                'Price_Bucket_1': pe_current * 0.8,  # Synthetic placeholder for now
                'Price_Bucket_2': pe_current * 0.9,  # Synthetic placeholder for now

                # Competitive pricing placeholders (filled in by the main loop)
                'CP_Cheapest_Flight_Price': None,
                'CP_Avg_Bucket_Fare': None,

                # Time-based features derived from the two timestamps
                'DT_Days_to_Departure': (departure_datetime_obj.date() - current_collection_timestamp.date()).days,
                'DT_Hours_to_Departure': hours_to_departure,
                'Departure_Hour_UTC': departure_datetime_obj.hour,
                'Departure_Day_of_Week': departure_datetime_obj.weekday(),
                'Collection_Hour_of_Day_UTC': current_collection_timestamp.hour,
                'Weekday_Morning': 1 if current_collection_timestamp.weekday() < 5 and 6 <= current_collection_timestamp.hour <= 10 else 0,
                'Weekend_Night': 1 if current_collection_timestamp.weekday() >= 5 and 20 <= current_collection_timestamp.hour <= 23 else 0,

                # Placeholders for seat availability and external data
                'Total_Aircraft_Seats': None, 'AS_Total': None, 'AS_Economy': None, 'AS_First': None,
                'Booked_Seats_Economy': None, 'Booked_Seats_First': None,
                'BR_Economy': None, 'BR_First': None,
                'CPV_Lowest_Price_Volatility': None, 'CPV_Avg_Price_Volatility': None,
                'RC_Num_Competitors': None, 'RC_Weighted_Index': None,
                'ER_Deviation_Ratio': None, 'ET_Daily_Target_Ratio': None,
                'Total_Booked_Seats_for_Day': None,
                'SFPI_Avg_Load_Factor': None, 'SFPI_Avg_Price_Deviation': None, 'SFPI_Avg_Booking_Rate': None,
                'Temp_Dest_C': None, 'Oil_Price_USD': None, 'Fuel_Cost_Index': None,
                'Interest_Limit_Score': None, 'UE_Score': None, 'SE_Score': None, 'CE_Score': None,
                'ANIS_Score': None, 'Social_Media_Sentiment_Score': None,
            }
            observations.append(data_point)
            current_obs_id += 1

        except Exception as e:
            # Deliberate best-effort: report the failing offer and keep going.
            # default=str keeps the report itself from failing on values that
            # json cannot serialise.
            print(f"Error processing offer: {e}. Skipping offer. Raw data: {json.dumps(offer, indent=2, default=str)}")
            continue

    return observations, current_obs_id

In [None]:
# --- Main Data Collection Loop ---
# For every collection timestamp and tracked route: query Amadeus Flight Offers
# Search, convert the offers to observation rows, then back-fill per-route
# competitive pricing metrics, and finally write everything to CSV.
#
# NOTE(review): COLLECTION_TIMES_UTC, TRAVEL_CLASS, MAX_OFFERS_PER_SEARCH and
# OUTPUT_CSV_FILE are not defined in the cells shown here -- they must be
# defined before this cell runs.
# NOTE(review): the loop does not wait until each collection timestamp; the
# API is queried immediately and the result is labelled with `collection_ts`.
# Confirm that this is the intended behavior.
all_flight_observations = []  # All structured data points (dictionaries) collected so far
observation_id_counter = 1    # Next Observation_ID to assign
MIN_DELAY_BETWEEN_CALLS_SECONDS = 0.5  # Short pause between API calls to stay under rate limits

print(f"Starting data collection for flights departing on {DEPARTURE_DATE}...")

# 1. Outer loop: one pass per collection timestamp
for collection_ts in COLLECTION_TIMES_UTC:
    collection_ts_iso = collection_ts.isoformat()
    print(f"\n--- Collecting data at: {collection_ts_iso} ---")

    # Raw offers gathered for this timestamp, grouped by route key (e.g. "BOS-LAX").
    # Needed to compute competitive pricing once all routes have been searched.
    offers_at_this_time = {}

    # 2. Inner loop: one search per tracked route
    for route in TRACKED_ROUTES:
        origin = route['origin']
        destination = route['destination']
        route_key = f"{origin}-{destination}"

        print(f"  Searching for {origin}-{destination} on {DEPARTURE_DATE}...")

        try:
            # 3. Amadeus API call (Flight Offers Search)
            response = amadeus.shopping.flight_offers_search.get(
                originLocationCode=origin,
                destinationLocationCode=destination,
                departureDate=DEPARTURE_DATE.isoformat(),
                adults=1,
                travelClass=TRAVEL_CLASS,
                max=MAX_OFFERS_PER_SEARCH
            ) if hasattr(amadeus.shopping, 'flight_offers_search') else amadeus.shopping.flight_offers.search.get(
                originLocationCode=origin,
                destinationLocationCode=destination,
                departureDate=DEPARTURE_DATE.isoformat(),
                adults=1,
                travelClass=TRAVEL_CLASS,
                max=MAX_OFFERS_PER_SEARCH
            )

            # 4. Process the response
            if response.data:
                # Keep the raw offers so competitive metrics can be computed below.
                offers_at_this_time.setdefault(route_key, []).extend(response.data)

                # process_amadeus_response takes exactly (offers, timestamp, start_id);
                # passing any extra keyword raised TypeError, which the generic
                # handler below swallowed, so no observation was ever collected.
                new_observations, observation_id_counter = process_amadeus_response(
                    response.data,
                    collection_ts,
                    observation_id_counter,
                )
                all_flight_observations.extend(new_observations)
                print(f"    Collected {len(new_observations)} individual flight observations.")
            else:
                print(f"    No flight offers found for {route_key} on {DEPARTURE_DATE} at {collection_ts}.")

        # 5. Error handling for API calls
        except ResponseError as e:
            # The error may carry no response object, so read the status defensively.
            status_code = getattr(getattr(e, 'response', None), 'status_code', None)
            print(f"  Amadeus API Error for {route_key}: {e} (HTTP {status_code})")
            if status_code == 429:  # "Too Many Requests"
                print("    Rate limit hit. Waiting for 60 seconds before retrying/continuing...")
                time.sleep(60)
            # The failed search is not retried; the loop moves on to the next route.
        except Exception as e:
            # Any other error during the API call or response handling.
            print(f"  An unexpected error occurred for {route_key}: {e}")

        # 6. Delay between individual route searches
        time.sleep(MIN_DELAY_BETWEEN_CALLS_SECONDS)

    # --- 7. Competitive pricing, computed once all routes for this timestamp are collected ---
    for route_k, offers_list in offers_at_this_time.items():
        all_prices_for_route = []
        unique_airlines_for_route = set()

        for offer_data in offers_list:
            # Collect the total price of every offer for this route and timestamp
            if offer_data.get('price', {}).get('grandTotal'):
                all_prices_for_route.append(float(offer_data['price']['grandTotal']))
            # Collect the carrier of the first segment to count distinct airlines.
            # .get() avoids a KeyError (outside any try block) on a segment
            # that lacks 'carrierCode'.
            if offer_data.get('itineraries') and offer_data['itineraries'][0].get('segments'):
                first_carrier = offer_data['itineraries'][0]['segments'][0].get('carrierCode')
                if first_carrier:
                    unique_airlines_for_route.add(first_carrier)

        # Calculate competitive metrics
        if all_prices_for_route:
            cheapest_flight_price = min(all_prices_for_route)
            avg_bucket_fare = sum(all_prices_for_route) / len(all_prices_for_route)
            num_competitors = len(unique_airlines_for_route)
        else:
            cheapest_flight_price = None
            avg_bucket_fare = None
            num_competitors = 0

        # Back-fill the observations already created for this route and timestamp
        for obs in all_flight_observations:
            if obs['Route'] == route_k and obs['Collection_Timestamp'] == collection_ts_iso:
                obs['CP_Cheapest_Flight_Price'] = cheapest_flight_price
                obs['CP_Avg_Bucket_Fare'] = avg_bucket_fare
                obs['RC_Num_Competitors'] = num_competitors
        print(f"  Updated competitive pricing for {route_k}.")


print("\nData collection complete for specified period.")

# --- 8. Final data conversion and saving ---
if all_flight_observations:
    df = pd.DataFrame(all_flight_observations)
    df.to_csv(OUTPUT_CSV_FILE, index=False)
    print(f"Data saved to {OUTPUT_CSV_FILE}")
    print("\n--- Sample Data (first 5 rows) ---")
    print(df.head().to_markdown(index=False))
    print(f"\nTotal observations collected: {len(df)}")
else:
    print("No observations collected.")