Introduction

In [None]:
'''
goal: to analyze flight booking data from the database to get the past booking status for the previous day and the previous 7, 15 and 30 days

process flow:
1. get input from blade to python script
2. extract data from database
3. transform data
4. filter data
5. process data
6. send data as json to depict using charts

'''

Process input from blade to python script

In [None]:
import sys

# Debug: Print arguments to verify
print("Arguments passed to script:", sys.argv)

# Exactly three positional arguments are required after the script name:
# the deviation factor and the start/end of the date window.
if len(sys.argv) != 4:
    print("Usage: python script.py <deviation> <start_date> <end_date>")
    sys.exit(1)

# Only the float conversion can raise ValueError, so it is the only statement guarded.
try:
    deviation = float(sys.argv[1])
except ValueError:
    print("The deviation argument must be a float.")
    sys.exit(1)

# The two dates are kept as raw strings here; they are parsed later with pd.to_datetime.
startDate = sys.argv[2]
endDate = sys.argv[3]

Extract data from database

In [None]:
# print("---------START: Reading Data---------")

# pandas is imported here because this cell builds the DataFrame; without it a
# fresh top-to-bottom run fails with NameError on `pd`.
import pandas as pd
import pymysql

# NOTE(review): connection settings are hard-coded in the source. Consider loading
# them from environment variables or a config file instead of committing them.
mydb = pymysql.connect(
    host     = "host_name",
    user     = "user_name",
    database = "db_name",
    password = "password")

# Column names, in the same order as the SELECT list below.
column_header = ['route','flight_no','flight_date', 'total_seat','total_booked','plf','generated_at']

# Fetch everything, then release the cursor and the connection even if the query fails.
try:
    mycursor = mydb.cursor()
    try:
        mycursor.execute('select route,flight_no,flight_date, total_seat, total_booked,plf,generated_at from table_name')
        rows = mycursor.fetchall()
    finally:
        mycursor.close()
finally:
    mydb.close()

# Passing `columns=` to the constructor keeps the seven named columns even when
# the query returns no rows; assigning `df.columns` afterwards would raise a
# length-mismatch ValueError on an empty result.
df = pd.DataFrame(list(rows), columns=column_header)

# print("---------END: Reading Data---------")

Data Transformation

In [None]:
import pandas as pd

# print("---------START: Transforming Data---------")

# One converter per column, applied in declaration order:
#   - the two date columns become pandas datetimes,
#   - the seat counts become integers,
#   - PLF becomes a float rounded to two decimals (Python's built-in round).
column_converters = {
    'flight_date':  pd.to_datetime,
    'generated_at': pd.to_datetime,
    'total_seat':   lambda column: column.astype(int),
    'total_booked': lambda column: column.astype(int),
    'plf':          lambda column: column.apply(lambda raw_plf: round(float(raw_plf), 2)),
}

for column_name, convert in column_converters.items():
    df[column_name] = convert(df[column_name])

# print("---------END: Transforming Data---------")

Data Filter

In [None]:

# Parse the requested window. `today_date` is not used by the filter in this cell.
today_date = pd.to_datetime('today')
start_date = pd.to_datetime(startDate)
end_date = pd.to_datetime(endDate)

# Keep rows whose flight_date lies inside [start_date, end_date], both ends inclusive.
on_or_before_end = df['flight_date'] <= end_date
on_or_after_start = df['flight_date'] >= start_date
past_flights = df[on_or_before_end & on_or_after_start]

# For each (flight_no, flight_date, route) keep only the row with the largest generated_at.
latest_row_labels = past_flights.groupby(['flight_no', 'flight_date', 'route'])['generated_at'].idxmax()
past_flights_grouped = past_flights.loc[latest_row_labels]



Data Process

In [None]:
# print("---------START: Analyzing Data---------")

import numpy as np
import datetime


def identify_flights(group, deviation):
    """Tell whether a PLF history reached 100 and later fell below the deviation threshold.

    Args:
        group: Rows of one groupby group. Must contain a 'plf' column and be
            ordered oldest snapshot first, because cumulative operations are used.
        deviation: Multiplier applied to the running maximum PLF to obtain the
            threshold a later PLF value has to fall under.

    Returns:
        True if some row has PLF below 100 and below (running max PLF * deviation)
        after a row where PLF was >= 100; False otherwise.

    NOTE(review): the threshold is `running max * deviation`, not
    `running max * (1 - deviation)`. With deviation = 0.1 this flags a fall to
    under 10% of the peak rather than a 10% drop - confirm the expected scale
    of `deviation` with the caller.
    """
    plf = group['plf']
    reached_full = plf >= 100

    # A history that never hit 100 can never qualify.
    if not reached_full.any():
        return False

    running_peak = plf.cummax()
    threshold = running_peak * deviation
    dropped_below_threshold = (plf < 100) & (plf < threshold)

    # True from the first row at which PLF hit 100 onwards.
    full_so_far = reached_full.cummax()

    if (dropped_below_threshold & full_so_far).any():
        return True
    return False


if not past_flights_grouped.empty:

    flight_keys = ['route', 'flight_no', 'flight_date']

    # Narrow the latest snapshots to flights dated within the 30 days up to now.
    current_date = pd.to_datetime('today')
    end_date_30 = current_date - datetime.timedelta(days=30)
    latest_flight_dates = past_flights_grouped['flight_date']
    past_30_days_data = past_flights_grouped[
        (latest_flight_dates <= current_date) & (latest_flight_dates >= end_date_30)
    ]

    # Distinct (route, flight_no, flight_date) triples inside that 30-day window.
    unique_combinations_30 = past_30_days_data[flight_keys].drop_duplicates()

    # Pull the full snapshot history of those flights, oldest snapshot first.
    result_df_30 = (
        past_flights
        .merge(unique_combinations_30, on=flight_keys, how='inner')
        .sort_values(by='generated_at', ascending=True)
    )

    # Keep only the groups whose PLF history satisfies identify_flights.
    # NOTE(review): groups are formed on flight_no and flight_date only, while the
    # surrounding steps also key on route - confirm rows of different routes are
    # meant to be evaluated together.
    result = result_df_30.groupby(['flight_no', 'flight_date']).filter(
        lambda flight_rows: identify_flights(flight_rows, deviation)
    )

    unique_combinations_100_30 = result[flight_keys].drop_duplicates()

    result_df_100_30 = (
        past_flights
        .merge(unique_combinations_100_30, on=flight_keys, how='inner')
        .sort_values(by='generated_at', ascending=True)
    )

    # One DataFrame per flagged flight, keyed "<route>_<flight_no>_<flight_date>".
    dataframes_dict_30 = {}
    for route, flight_no, flight_date in unique_combinations_100_30.itertuples(index=False, name=None):
        key = f"{route}_{flight_no}_{flight_date}"
        same_flight = (
            (result_df_100_30['route'] == route)
            & (result_df_100_30['flight_no'] == flight_no)
            & (result_df_100_30['flight_date'] == flight_date)
        )
        dataframes_dict_30[key] = result_df_100_30[same_flight]

    # Serialise each per-flight frame to a JSON records string (None for an empty frame).
    json_dict_30 = {}
    for key, flight_frame in dataframes_dict_30.items():
        json_dict_30[key] = None if flight_frame.empty else flight_frame.to_json(orient='records')

    # Return the summary statistics.
    summary_statistics = {
        'status'          : 1,
        'json_dict_30'    : json_dict_30
    }
else:

    # Nothing matched the requested window: report status 0 with an empty payload.
    summary_statistics = {
        'status'          : 0,
        'json_dict_30'    : {}
    }

# print("---------END: Analyzing Data---------")

Data Dump

In [None]:
import json

# Serialise the result envelope ('status' plus the 'json_dict_30' mapping) to a JSON string.
# NOTE(review): every non-empty value in json_dict_30 is already a JSON string produced by
# DataFrame.to_json, so those payloads are JSON-encoded a second time here; the consumer
# presumably has to decode them twice - confirm.
json_data       = json.dumps(summary_statistics)