Introduction

In [None]:
'''
goal: to insert data from a txt file into the database along with a few new columns, plus a new table holding the day-wise summary

process flow:
1. read data from txt file
2. extract desired data from read data
3. clean data
4. transform data
5. filter data
6. process data
7. load data into database
8. create a processed data file and relocate it

'''

Process Flow

In [None]:
import os

dataFolder = r"data_folder"
insertedFolder = r"processed_data_folder"

# Airport coordinates do not depend on the input file, so they are queried
# at most once (on the first matching file) and reused for the remaining files.
all_location = None

for file in os.listdir(dataFolder):
    # only the raw report files are processed
    if not file.endswith('.TXT'):
        continue

    # os.path.join keeps the path valid on every platform
    # (a hard-coded "\\" separator only works on Windows)
    file_path = os.path.join(dataFolder, file)

    # function to extract data from file
    extracted_df = read_file(file_path)

    # function to clean data
    cleaned_df = clean_file(extracted_df.copy())

    # function to get latitude and longitude of all airports
    if all_location is None:
        all_location = get_all_geo_location()

    # function to transform data
    transformed_df = transform_file(cleaned_df.copy(), all_location.copy())

    # function to group the rows of one journey together (adds journey_date)
    grouped_df = group_data(transformed_df.copy())

    # function to process data to calculate PLF (pax load factor)
    processed_df = get_PLF(grouped_df.copy())

    # functions to load data into database
    loadedFile = insert_data_extraction_table(transformed_df.copy())
    inserted_summary = insert_data_summary_table(processed_df.copy())

    # function to create processed data file and relocate it
    relocate_file(file, file_path, insertedFolder, transformed_df.copy())

Extracting data from txt file

In [None]:
import pandas as pd
# Text scraped from the "End of Page" line of the most recently read file.
# read_file() overwrites it; other cells read it as a module-level global.
generating_date = ''

def read_file(file_path):
    """Read a raw report file and return its flight rows split into columns.

    The file is loaded with one text value per line (first CSV column).
    Lines starting with "XX" are kept and split on whitespace into the ten
    columns named below. As a side effect the module-level ``generating_date``
    is overwritten with characters 30-39 of the printed representation of the
    "End of Page" rows.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per "XX" line, all values as strings, original line index kept.
    """
    global generating_date

    print("---------Reading & Filtering starting----------")

    column_header = [
        'airline_code', 'route', 'flight_no', 'flight_date', 'flight_time',
        'c_tot', 'j_bkd', 'w_bkd', 'y_bkd', 'tot_bkd',
    ]

    raw_lines = pd.read_csv(file_path, header=None)
    first_column_text = raw_lines.iloc[:, 0].astype(str)

    # rows carrying flight data vs. the page-footer rows
    flight_rows = raw_lines[first_column_text.str.startswith("XX")]
    footer_rows = raw_lines[first_column_text.str.startswith("End of Page")]

    # NOTE(review): this slices the *printed* Series (index label, padding and
    # text), so the result depends on how pandas renders it - presumably the
    # slice lands on a 10-character date for the real input; confirm.
    generating_date = str(footer_rows[0])[30:40]

    extracted = flight_rows.iloc[:, 0].str.split(expand=True)
    extracted.columns = column_header

    print("---------Reading & Filtering ending----------")

    return extracted

Cleaning data

In [None]:
import numpy as np

def clean_file(filtered_df):
    """Repair fused route values, fix column types and fill missing cells.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Frame produced by read_file(); it is modified in place by the
        row-repair loop, so callers pass a copy.

    Returns
    -------
    pandas.DataFrame
        Frame with integer columns as Int64, an added 'flight_timestamp'
        column, 'flight_date' rewritten by transform_date(), missing values
        replaced ('N/A' for non-numeric/non-datetime columns, 0 elsewhere)
        and string cells stripped of surrounding whitespace.
    """
    print("---------Cleaning starting----------")
    
    # Rows whose second column (route) is longer than six characters:
    # keep the first six characters as the route, move every value from the
    # third column onward one position to the right (the last value falls off)
    # and put the remaining characters into the third column.
    # presumably the route and the flight number were printed without a space
    # between them (e.g. custom flights like CRCHDQ) - confirm against real data
    for i in range(len(filtered_df)):        
        if len(filtered_df.iloc[i,1]) > 6:
            tobesplited = filtered_df.iloc[i,1]
            # the whole right-hand side is evaluated first, then the targets are
            # assigned left to right, so the last target overwrites the gap that
            # shift(1) left in the third column
            filtered_df.iloc[i,1], filtered_df.iloc[i,2:], filtered_df.iloc[i,2] = tobesplited[:6], filtered_df.iloc[i,2:].shift(1), tobesplited[6:]
    
    # let pandas pick the best nullable dtypes, then treat empty strings as missing
    d3= filtered_df.convert_dtypes()
    d3.replace('', np.nan, inplace=True)

    # Identify the columns intended to be integers
    int_columns = ['flight_no', 'c_tot', 'j_bkd', 'w_bkd', 'y_bkd', 'tot_bkd']
    
    # Values that cannot be parsed as numbers become missing (errors='coerce');
    # the nullable Int64 dtype can hold those missing values
    d3[int_columns] = d3[int_columns].apply(lambda x: pd.to_numeric(x, errors='coerce')).astype('Int64')
    
    # the timestamp is built from the raw date text before flight_date is rewritten
    d3['flight_timestamp'] = pd.to_datetime(d3['flight_date'] + ' ' + d3['flight_time'])
    d3['flight_date'] = d3['flight_date'].apply(lambda x: transform_date(x))

    # df_datatype_corrected is the same object as d3 (no copy is made)
    df_datatype_corrected = d3
    # missing values in columns that are neither numeric nor datetime become 'N/A'
    non_numeric_columns = df_datatype_corrected.select_dtypes(exclude=['number', 'datetime']).columns
    df_datatype_corrected[non_numeric_columns] = df_datatype_corrected[non_numeric_columns].fillna('N/A')
    
    # every remaining missing value becomes 0
    df_nullCellTransformed = df_datatype_corrected.fillna(0)
    df_allCellTrimmed = trim_all_columns(df_nullCellTransformed)
    
    print("---------Cleaning ending----------")
    return df_allCellTrimmed

def trim_all_columns(df):
    """
    Trim whitespace from ends of each value across all series in dataframe

    Non-string values are returned unchanged. The input frame is not modified;
    a new DataFrame is returned.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; the check is made on the class so that a column that
    # happens to be named "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip_if_str)
    return df.applymap(_strip_if_str)

Getting all Station's Geo-Location

In [None]:
import pymysql

def get_all_geo_location():
    """Load the IATA code, latitude and longitude of every station from the database.

    Rows whose iata_code is the literal text ``\\N`` are dropped and the two
    coordinate columns are converted to float. The result is also stored in
    the module-level ``df_geo_location``.

    Returns
    -------
    pandas.DataFrame
        Columns 'iata_code', 'latitude', 'longitude'. An empty query result
        yields an empty frame with those columns.
    """
    print("---------START: Reading All_Geo_Location Data---------")

    global df_geo_location

    column_header = ['iata_code','latitude','longitude']

    # NOTE(review): connection settings are hard-coded placeholders; real
    # credentials should come from the environment or a secrets store.
    mydb_connection = pymysql.connect(
        host = "host_name",
        user = "user_name",
        database = "db_name",
        password = "password")

    # try/finally guarantees the cursor and the connection are released even
    # when the query fails
    try:
        mycursor = mydb_connection.cursor()
        try:
            # retrieve all latitude and longitude values from the database
            origin_query = "SELECT iata_code, latitude, longitude FROM table_name"
            mycursor.execute(origin_query)
            rows = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        mydb_connection.close()

    # naming the columns at construction time also works for an empty result
    df_geo_location = pd.DataFrame(list(rows), columns=column_header)

    # .copy() makes the filtered frame independent, so the column assignments
    # below do not operate on a slice of the unfiltered frame
    df_geo_location = df_geo_location[df_geo_location['iata_code'] != r'\N'].copy()

    df_geo_location['latitude'] = df_geo_location['latitude'].astype(float)
    df_geo_location['longitude'] = df_geo_location['longitude'].astype(float)

    print("---------END: Reading All_Geo_location Data---------")
    return df_geo_location

Data Transformation

In [None]:
from datetime import datetime

def transform_file(cleaned_df, all_location):
    """Add route, calendar, distance and RPK columns to the cleaned flight rows.

    Parameters
    ----------
    cleaned_df : pandas.DataFrame
        Cleaned flight rows; the new columns are added to this frame in place.
    all_location : pandas.DataFrame
        Frame with 'iata_code', 'latitude' and 'longitude' columns.

    Returns
    -------
    pandas.DataFrame
        The merged frame with 'origin', 'destination', 'day_of_week',
        'generated_at', 'distance_km' and 'rpk' added and the helper
        coordinate/IATA columns removed. The un-trimmed merge result stays
        available in the module-level ``df_merged``.
    """
    global df_merged
    print("---------Transforming starting for----------")

    # first three characters of the route are the origin, the rest the destination
    route_codes = cleaned_df['route']
    cleaned_df['origin'] = route_codes.apply(lambda code: code[:3])
    cleaned_df['destination'] = route_codes.apply(lambda code: code[3:])
    cleaned_df['day_of_week'] = cleaned_df['flight_date'].apply(get_day)
    # same value for every row: the module-level generating_date, normalised
    cleaned_df['generated_at'] = transform_date(generating_date)

    # look up coordinates twice: once for the origin, once for the destination;
    # the second merge renames the clashing columns with the two suffixes
    df_merged = pd.merge(
        cleaned_df, all_location,
        how='left', left_on='origin', right_on='iata_code',
    )
    df_merged = pd.merge(
        df_merged, all_location,
        how='left', left_on='destination', right_on='iata_code',
        suffixes=('_origin', '_destination'),
    )

    distances = calculate_distance_vec(
        df_merged['latitude_origin'],
        df_merged['longitude_origin'],
        df_merged['latitude_destination'],
        df_merged['longitude_destination'],
    )
    df_merged['distance_km'] = pd.to_numeric(distances, errors='coerce')
    df_merged['rpk'] = df_merged['tot_bkd'] * df_merged['distance_km']

    # lookup columns are only needed to compute the distance
    helper_columns = [
        'iata_code_origin', 'latitude_origin', 'longitude_origin',
        'iata_code_destination', 'latitude_destination', 'longitude_destination',
    ]
    trimmed = df_merged.drop(columns=helper_columns)

    print("---------Transforming ending----------")
    return trimmed

def transform_date(input_date):
    """Convert a date string to 'YYYY-MM-DD'.

    The formats are tried in the listed order and the first one that parses
    wins, so month-first readings take precedence over day-first ones.

    Raises
    ------
    ValueError
        If none of the formats matches.
    """
    candidate_formats = ('%m/%d/%Y', '%m/%d/%y', '%d/%m/%Y', '%d/%m/%y', "%b'%y")
    for fmt in candidate_formats:
        try:
            return datetime.strptime(input_date, fmt).strftime('%Y-%m-%d')
        except ValueError:
            # this format does not fit; try the next one
            pass
    raise ValueError('Invalid date format: {}'.format(input_date))

def get_day(input_date):
    """Return the weekday number (Monday = 0 ... Sunday = 6) of a date string.

    The formats are tried in the listed order; the first one that parses wins.

    Raises
    ------
    ValueError
        If none of the formats matches.
    """
    candidate_formats = ('%m/%d/%Y', '%m/%d/%y', '%d/%m/%Y', '%d/%m/%y', "%b'%y", '%Y-%m-%d')
    for fmt in candidate_formats:
        try:
            return datetime.strptime(input_date, fmt).weekday()
        except ValueError:
            # this format does not fit; try the next one
            pass
    raise ValueError('Invalid date format: {}'.format(input_date))

def calculate_distance_vec(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points (haversine formula).

    Inputs are latitudes/longitudes in degrees; anything numpy can operate on
    element-wise (scalars, arrays, pandas Series) is accepted, and the result
    has the matching shape.
    """
    earth_radius_km = 6.371E3

    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = 0.5 * np.radians(lat1 - lat2)
    half_dlon = 0.5 * np.radians(lon1 - lon2)

    # haversine of the central angle
    hav = np.sin(half_dlat)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle

Data Processing

In [None]:
def group_data(df):
    """Sort the flight rows and tag each one with the journey it belongs to.

    Rows whose route is in the exclusion list are removed from ``df`` in
    place. The remaining rows are sorted and passed one by one, in sorted
    order, through accumulate_segment(), which carries state from row to row;
    the first element of its result becomes the 'journey_date' column.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with the added 'journey_date' column.
    """
    excluded_routes = ['CRCHDQ', 'AAABBB']
    rows_to_remove = df[df['route'].isin(excluded_routes)].index
    df.drop(rows_to_remove, inplace = True)

    sort_keys = ['flight_no', 'flight_timestamp', 'route', 'generated_at']
    ordered_flights = df.sort_values(by=sort_keys)

    # state shared by all accumulate_segment() calls of this run
    segment_state = {
        'origin1': '', 'destination1': '', 'flight_no': '',
        'origin2': '', 'destination2': '', 'flight_date': '',
        'journey_date': '', 'flight_timestamp': '',
    }

    def _track(row):
        return accumulate_segment(row, segment_state)

    def _journey_date(state_snapshot):
        return state_snapshot[0]

    per_row_state = ordered_flights.apply(_track, axis=1)
    ordered_flights['journey_date'] = per_row_state.apply(_journey_date)

    return ordered_flights


The algorithm below identifies the segments of a flight based on flight_no, flight_date, flight_time, origin and destination. For example, if flight_no 111 with flight_date 2222-02-22 has segments such as AAA-BBB, AAA-CCC and BBB-CCC within a one-day interval, the function below treats them as segments of the same flight and reports the flight route as AAA-BBB-CCC.

![alt text](<Copy of algo_segment_accumulator.drawio.png>)

In [None]:

def accumulate_segment(row, temp_storage):
    """Update the running segment state with one flight row and report it.

    The function is meant to be called once per row, in sorted order, with
    the same ``temp_storage`` dict each time: the dict remembers the previous
    row(s) and is modified in place.

    Parameters
    ----------
    row : mapping
        Must provide 'flight_no', 'flight_date', 'flight_timestamp',
        'origin' and 'destination'.
    temp_storage : dict
        Keys 'origin1', 'destination1', 'origin2', 'destination2',
        'flight_no', 'flight_date', 'journey_date', 'flight_timestamp'.
        'origin1'/'destination1' and 'origin2'/'destination2' are the two
        segment slots; an empty string means the slot is free.

    Returns
    -------
    tuple
        (journey_date, flight_no, flight_date, flight_timestamp) as stored in
        ``temp_storage`` after processing this row.
    """
    flight_no        = row['flight_no']
    flight_date      = row['flight_date']
    flight_timestamp = row['flight_timestamp']
    origin           = row['origin']
    destination      = row['destination']
    
    # current and previous rows have the same flight number
    if flight_no == temp_storage['flight_no']:
        # ... and the same flight date
        if flight_date == temp_storage['flight_date']:
            # the second slot is already occupied: clear BOTH slots and keep
            # only flight_no / dates / timestamp of the current row
            if temp_storage['origin2'] and temp_storage['destination2']:
                temp_storage['journey_date'] = flight_date
                temp_storage['flight_no'] = flight_no
                temp_storage['flight_date'] = flight_date
                temp_storage['origin1'] = ''
                temp_storage['destination1'] = ''
                temp_storage['origin2'] = ''
                temp_storage['destination2'] = ''
                temp_storage['flight_timestamp'] = flight_timestamp
            # the second slot is free: store the current row there
            else:
                temp_storage['journey_date'] = flight_date
                temp_storage['origin2'] = origin
                temp_storage['destination2'] = destination
                temp_storage['flight_no'] = flight_no
                temp_storage['flight_date'] = flight_date
                temp_storage['flight_timestamp'] = flight_timestamp
        # same flight number but a different flight date
        else:
            # the current flight date is exactly one day after the stored journey date
            if ((pd.to_datetime(flight_date) - pd.to_datetime(temp_storage['journey_date'])) == pd.Timedelta(days=1)):
                # the first slot is occupied
                if temp_storage['origin1'] and temp_storage['destination1']:
                    # the second slot is occupied as well
                    if temp_storage['origin2'] and temp_storage['destination2']:
                        # the current row connects the two stored destinations (in either
                        # direction): restart the slots with this row but leave
                        # journey_date untouched, so the row keeps the previous journey date
                        if ((origin == temp_storage['destination1'] and destination == temp_storage['destination2']) or (destination == temp_storage['destination1'] and origin == temp_storage['destination2'])):
                            temp_storage['flight_no'] = flight_no
                            temp_storage['flight_date'] = flight_date
                            temp_storage['origin1'] = origin
                            temp_storage['destination1'] = destination
                            temp_storage['origin2'] = ''
                            temp_storage['destination2'] = ''
                            temp_storage['flight_timestamp'] = flight_timestamp
                        # otherwise: same restart, but the journey date moves to the current date
                        else:
                            temp_storage['journey_date'] = flight_date
                            temp_storage['flight_no'] = flight_no
                            temp_storage['flight_date'] = flight_date
                            temp_storage['origin1'] = origin
                            temp_storage['destination1'] = destination
                            temp_storage['origin2'] = ''
                            temp_storage['destination2'] = ''
                            temp_storage['flight_timestamp'] = flight_timestamp
                    # only the first slot is occupied; if none of the conditions in
                    # the chain below matches, temp_storage is left unchanged
                    else:
                        # same number and same route on the next day: treated as a new flight
                        if origin == temp_storage['origin1'] and destination == temp_storage['destination1']:
                            temp_storage['journey_date'] = flight_date
                            temp_storage['origin1'] = origin
                            temp_storage['destination1'] = destination
                            temp_storage['origin2'] = ''
                            temp_storage['destination2'] = ''
                            temp_storage['flight_no'] = flight_no
                            temp_storage['flight_date'] = flight_date
                            temp_storage['flight_timestamp'] = flight_timestamp
                        # same number and origin, different destination, on the next day
                        elif origin == temp_storage['origin1'] and destination != temp_storage['destination1']: 
                            # different timestamp: start over with this row as the first slot
                            if flight_timestamp != temp_storage['flight_timestamp']:
                                temp_storage['journey_date'] = flight_date
                                temp_storage['origin1'] = origin
                                temp_storage['destination1'] = destination
                                temp_storage['origin2'] = ''
                                temp_storage['destination2'] = ''
                                temp_storage['flight_no'] = flight_no
                                temp_storage['flight_date'] = flight_date
                                temp_storage['flight_timestamp'] = flight_timestamp
                            # identical timestamp: keep the stored journey/flight date
                            # (the self-assignments are no-ops) and use the second slot
                            else:
                                temp_storage['journey_date'] = temp_storage['journey_date']
                                temp_storage['flight_no'] = flight_no
                                temp_storage['flight_date'] = temp_storage['flight_date']
                                temp_storage['origin2'] = origin
                                temp_storage['destination2'] = destination
                                temp_storage['flight_timestamp'] = flight_timestamp
                        # the current row departs from the stored destination and does not
                        # return to the stored origin: keep the stored dates, use the second slot
                        elif origin == temp_storage['destination1'] and destination != temp_storage['origin1']: 
                            temp_storage['journey_date'] = temp_storage['journey_date']
                            temp_storage['flight_no'] = flight_no
                            temp_storage['flight_date'] = temp_storage['flight_date']
                            temp_storage['origin2'] = origin
                            temp_storage['destination2'] = destination
                            temp_storage['flight_timestamp'] = flight_timestamp
                        # the current row ends at the stored destination but starts elsewhere:
                        # keep the stored dates, use the second slot
                        elif destination == temp_storage['destination1'] and origin != temp_storage['origin1']: 
                            temp_storage['journey_date'] = temp_storage['journey_date']
                            temp_storage['flight_no'] = flight_no
                            temp_storage['flight_date'] = temp_storage['flight_date']
                            temp_storage['origin2'] = origin
                            temp_storage['destination2'] = destination
                            temp_storage['flight_timestamp'] = flight_timestamp
                # the first slot is free: start a new journey with this row
                else:
                    temp_storage['journey_date'] = flight_date
                    temp_storage['origin1'] = origin
                    temp_storage['destination1'] = destination
                    temp_storage['origin2'] = ''
                    temp_storage['destination2'] = ''
                    temp_storage['flight_no'] = flight_no
                    temp_storage['flight_date'] = flight_date
                    temp_storage['flight_timestamp'] = flight_timestamp
                    
            
            # same flight number, but the date difference is anything other than
            # exactly one day: start a new journey with this row
            else:
                temp_storage['journey_date'] = flight_date
                temp_storage['origin1'] = origin
                temp_storage['destination1'] = destination
                temp_storage['origin2'] = ''
                temp_storage['destination2'] = ''
                temp_storage['flight_no'] = flight_no
                temp_storage['flight_date'] = flight_date
                temp_storage['flight_timestamp'] = flight_timestamp
                
    # different flight number (this is also the path taken by the very first
    # row when temp_storage starts with empty strings): start a new journey
    else:
        temp_storage['origin1'] = origin
        temp_storage['destination1'] = destination
        temp_storage['origin2'] = ''
        temp_storage['destination2'] = ''
        temp_storage['flight_no'] = flight_no
        temp_storage['flight_date'] = flight_date
        temp_storage['journey_date'] = flight_date
        temp_storage['flight_timestamp'] = flight_timestamp
    
    return (temp_storage['journey_date'], temp_storage['flight_no'],temp_storage['flight_date'], temp_storage['flight_timestamp'])
    
    

Calculating PLF (Pax Load Factor) by determining ASK and RPK of the airports

In [None]:
def get_PLF(future_flights_grouped):
    """Summarise the flight rows per (flight_no, journey_date) and compute the load factor.

    Parameters
    ----------
    future_flights_grouped : pandas.DataFrame
        Must provide 'flight_no', 'journey_date', 'generated_at', 'origin',
        'destination', 'distance_km', 'rpk', 'tot_bkd' and 'c_tot'.

    Returns
    -------
    pandas.DataFrame
        One row per (flight_no, journey_date) group with the columns
        'flight_no', 'route', 'flight_date' (holding the journey date),
        'total_seat', 'total_booked', 'total_distance', 'total_RPK',
        'total_ASK', 'plf' and 'generated_at'.
    """

    def calculate_ask_distance(row, temp_storage):
        """Fold one row of a group into the running distance/route state.

        ``temp_storage`` holds up to two remembered segments
        (origin1/destination1/distance1 and origin2/destination2/distance2)
        plus the running 'total_distance' and 'route'; it is modified in
        place. Returns (total_distance, route) after this row.
        """
        origin      = row['origin']
        destination = row['destination']
        distance    = row['distance_km']
    
        # a first segment has already been stored
        if temp_storage['origin1'] and temp_storage['destination1']:
            # ... and a second one too: only total_distance and route can still change;
            # if no condition below matches, the state is left unchanged
            if temp_storage['origin2'] and temp_storage['destination2']:
                # the row continues from the first segment's destination:
                # total is REPLACED by this distance plus the first segment's distance
                if origin == temp_storage['destination1']:
                    temp_storage['total_distance'] = distance + temp_storage['distance1']
                    temp_storage['route'] = temp_storage['origin1'] + '-' + temp_storage['destination1'] + '-' + destination
                # the row continues from the second segment's destination:
                # total is REPLACED by this distance plus the second segment's distance
                elif origin == temp_storage['destination2']:
                    temp_storage['total_distance'] = distance + temp_storage['distance2']
                    temp_storage['route'] = temp_storage['origin2'] + '-' + temp_storage['destination2'] + '-' + destination
                # the row links the first segment's origin to the second segment's origin:
                # its distance is added to the running total
                elif origin == temp_storage['origin1'] and destination == temp_storage['origin2']:
                    temp_storage['total_distance'] += distance 
                    temp_storage['route'] = temp_storage['origin1'] + '-' + temp_storage['origin2'] + '-' + temp_storage['destination2']
                # the row links the second segment's origin to the first segment's origin:
                # its distance is added to the running total
                elif origin == temp_storage['origin2'] and destination == temp_storage['origin1']:
                    temp_storage['total_distance'] += distance 
                    temp_storage['route'] = temp_storage['origin2'] + '-' + temp_storage['origin1'] + '-' + temp_storage['destination1']
                    
            # only the first segment is stored so far; if no condition below
            # matches, the state is left unchanged
            else:
                # the row shares the origin or the destination with the first segment:
                # remember it as the second segment without touching total_distance or route
                if origin == temp_storage['origin1'] or destination == temp_storage['destination1']:
                    temp_storage['origin2'] = origin
                    temp_storage['destination2'] = destination
                    temp_storage['distance2'] = distance  
                # the row departs from the first segment's destination:
                # remember it, add its distance and append its destination to the route
                elif origin == temp_storage['destination1']:
                    temp_storage['origin2'] = origin
                    temp_storage['destination2'] = destination
                    temp_storage['distance2'] = distance
                    temp_storage['total_distance'] += distance
                    temp_storage['route'] = temp_storage['origin1'] + '-' + temp_storage['destination1'] + '-' + destination
                # the row arrives at the first segment's origin:
                # remember it, add its distance and put it in front of the route
                elif destination == temp_storage['origin1']:
                    temp_storage['origin2'] = origin
                    temp_storage['destination2'] = destination
                    temp_storage['distance2'] = distance
                    temp_storage['total_distance'] += distance
                    temp_storage['route'] = origin +  '-' + destination + '-' + temp_storage['destination1'] 
        # nothing stored yet: this row becomes the first segment
        else:
            temp_storage['origin1'] = origin
            temp_storage['destination1'] = destination
            temp_storage['distance1'] = distance
            temp_storage['total_distance'] = distance
            temp_storage['route'] = origin +  '-' + destination
    
    
        return (temp_storage['total_distance'],temp_storage['route'])

    def calculate_plf(group):
        """Aggregate one (flight_no, journey_date) group into a single summary row.

        The rows are folded through calculate_ask_distance() with a fresh
        state dict; the values after the LAST row give the group's total
        distance and route. Seats come from the first row's 'c_tot'.
        """
        temp_storage = {'origin1': '', 'destination1': '', 'distance1': 0,'origin2': '', 'destination2': '', 'distance2': 0, 'total_distance': 0, 'route': ''}
    
        # running (total_distance, route) after each row of the group
        results = group.apply(lambda row: calculate_ask_distance(row, temp_storage), axis=1)
        group['total_distance'] = results.apply(lambda x: x[0])
        group['route'] = results.apply(lambda x: x[1])
    
        flight_no = group['flight_no'].iloc[0]
        journey_date = group['journey_date'].iloc[0]
        generated_at = group['generated_at'].iloc[0]
        total_RPK = group['rpk'].sum()
        total_booked = group['tot_bkd'].sum()
        total_seat = group['c_tot'].iloc[0]
        
        total_distance = group['total_distance'].iloc[-1]  # Get the last accumulated distance
        total_route = group['route'].iloc[-1]  # Get the last route
        # load factor in percent: RPK divided by ASK (seats * distance); 0 when ASK is 0
        plf = (total_RPK / (total_seat * total_distance)) * 100 if total_seat * total_distance != 0 else 0
    
        return pd.Series({
            'flight_no': flight_no, 
            'route': total_route,
            'flight_date': journey_date,
            'total_seat': total_seat, 
            'total_booked': total_booked, 
            'total_distance': total_distance,
            'total_RPK': total_RPK, 
            'total_ASK': total_seat * total_distance, 
            'plf': plf,
            'generated_at': generated_at
        })
    
    # Group by 'flight_no' and 'journey_date' and build one summary row per group
    future_flights_grouped = future_flights_grouped.groupby(['flight_no', 'journey_date']).apply(calculate_plf)

    # Drop the group-key index; the keys are already present as columns of the summary
    future_flights_grouped = future_flights_grouped.reset_index(drop=True)

    return future_flights_grouped

Data Load

In [None]:
from sqlalchemy import create_engine
import pymysql

def insert_data_extraction_table(transformed_df):
    """Create the extraction table if needed and append the transformed rows to it.

    Every DataFrame column becomes a TEXT column of the table, next to an
    auto-increment ``id`` and the ``created_at`` / ``updated_at`` timestamps.
    A pymysql error is printed rather than raised. The cursor, the connection
    and the SQLAlchemy engine are always released, including on failure.

    Parameters
    ----------
    transformed_df : pandas.DataFrame
        Rows to load; column names are used verbatim as SQL column names.

    Returns
    -------
    None
    """
    print("---------Loading starting----------")
    df_data_reset = transformed_df.reset_index(drop=True)

    # one "<name> TEXT, " definition per DataFrame column
    column_definitions = "".join("{} TEXT, ".format(column) for column in df_data_reset.columns)
    SQL_CREATE_TBL = (
        "CREATE TABLE IF NOT EXISTS table_name(id INT NOT NULL AUTO_INCREMENT,"
        + column_definitions
        + "created_at timestamp NOT NULL DEFAULT current_timestamp(), updated_at timestamp NOT NULL DEFAULT current_timestamp(), PRIMARY KEY (id));"
    )

    # NOTE(review): connection settings are hard-coded placeholders; real
    # credentials should come from the environment or a secrets store.
    # create_engine() does not open a connection by itself, so it is safe to
    # build it before the resources that need cleanup.
    engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
                           .format(user = "user_name",
                                   host = "host_name",
                                   pw   = "password",
                                   db   = "db_name"))

    mydb = pymysql.connect(
            host     = "host_name",
            user     = "user_name",
            database = "db_name",
            password = "password")

    try:
        mycursor = mydb.cursor()
        try:
            print("Creating table {}: ".format("table_name"), end='')
            mycursor.execute(SQL_CREATE_TBL)
            df_data_reset.to_sql('table_name', con= engine, if_exists= 'append', chunksize=1000, index=False)
        except pymysql.Error as err:
            # NOTE(review): failures inside to_sql() surface as SQLAlchemy
            # exceptions and are not caught here; they propagate to the caller.
            print(err)
        else:
            print("OK")
        finally:
            mycursor.close()
    finally:
        # release the raw connection and the engine's pooled connections
        mydb.close()
        engine.dispose()

    print("---------Loading ended----------")
    
def insert_data_summary_table (df_data_final):
    """Create the summary table if needed and append the rows with INSERT IGNORE.

    Column SQL types and defaults are derived from the DataFrame dtypes;
    dtypes missing from the mapping fall back to TEXT. Rows are written with
    ``INSERT IGNORE`` so rows rejected by the database (e.g. duplicates of a
    unique key) are skipped instead of aborting the load. A pymysql error is
    printed rather than raised. The cursor, the connection and the SQLAlchemy
    engine are always released, including on failure.

    Parameters
    ----------
    df_data_final : pandas.DataFrame
        Summary rows to load; column names are used verbatim as SQL column names.

    Returns
    -------
    None
    """
    df_data_reset = df_data_final.reset_index(drop=True)

    # Mapping from pandas dtype names to SQL column types and default values
    pandas_to_sql_type_mapping = {
        'int32': {'type': 'INT', 'default': '0'},
        'int64': {'type': 'INT', 'default': '0'},
        'float64': {'type': 'FLOAT', 'default': '0.0'},
        'object': {'type': 'TEXT', 'default': "''"},  # Default empty string for text
        'datetime64[ns]': {'type': 'TIMESTAMP', 'default': 'CURRENT_TIMESTAMP'},  # Example of using a function as a default
    }
    fallback_type_info = {'type': 'TEXT', 'default': "''"}

    SQL_CREATE_TBL = "CREATE TABLE IF NOT EXISTS table_name(id INT NOT NULL AUTO_INCREMENT,"

    for column_name, data_type in zip(df_data_reset.columns, df_data_reset.dtypes):
        sql_type_info = pandas_to_sql_type_mapping.get(str(data_type), fallback_type_info)
        # Add column definition to the SQL statement with default values
        SQL_CREATE_TBL += "{} {} DEFAULT {}, ".format(column_name, sql_type_info['type'], sql_type_info['default'])

    SQL_CREATE_TBL += "created_at TIMESTAMP NOT NULL DEFAULT current_timestamp(), updated_at TIMESTAMP NOT NULL DEFAULT current_timestamp(), PRIMARY KEY (id));"

    def insert_ignore(table, conn, keys, data_iter):
        """pandas ``to_sql`` insertion hook that emits INSERT IGNORE.

        prefix_with() changes only this one statement. Registering a
        ``@compiles(Insert)`` hook instead would rewrite every INSERT issued
        through SQLAlchemy in this process, including unrelated loads.
        """
        data = [dict(zip(keys, row)) for row in data_iter]
        conn.execute(table.table.insert().prefix_with("IGNORE"), data)

    # NOTE(review): connection settings are hard-coded placeholders; real
    # credentials should come from the environment or a secrets store.
    # create_engine() does not open a connection by itself, so it is safe to
    # build it before the resources that need cleanup.
    engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
                           .format(user="user_name",
                                   host="host_name",
                                   pw="password",
                                   db="db_name"))

    mydb = pymysql.connect(
        host = "host_name",
        user = "user_name",
        database = "db_name",
        password = "password")

    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute(SQL_CREATE_TBL)
            df_data_reset.to_sql('table_name', con=engine, if_exists='append', index=False, method=insert_ignore)
        except pymysql.Error as err:
            # still best-effort (the load continues), but the failure is
            # reported instead of being silently discarded
            print(err)
        finally:
            mycursor.close()
    finally:
        # release the raw connection and the engine's pooled connections
        mydb.close()
        engine.dispose()

    print("---------Loading ended ----------")

Creating processed data file and relocating it

In [None]:
import time
import pathlib

def relocate_file(file, file_path, insertedFolder, transformedFile):
    """Export the transformed rows to Excel and move the raw file next to the export.

    Both files are placed in ``insertedFolder`` and prefixed with the current
    local time (``YYYYmmdd_HHMMSS_``). The Excel name also carries the
    'generated_at' value of the first row and the 'flight_date' values of the
    first and last rows. The raw file keeps its name, with its extension
    normalised to '.TXT'.

    Parameters
    ----------
    file : str
        Name of the raw file (without directory).
    file_path : str
        Current path of the raw file.
    insertedFolder : str
        Destination directory; it must already exist.
    transformedFile : pandas.DataFrame
        Transformed rows; must be non-empty and provide string
        'generated_at' and 'flight_date' columns.

    Returns
    -------
    None
    """
    print("---------Renaming and Relocating starting----------")

    timestr = time.strftime("%Y%m%d_%H%M%S_")
    transformedDataFile =  timestr + transformedFile.iloc[0].generated_at + "_"+transformedFile['flight_date'].iloc[0]+"_"+transformedFile['flight_date'].iloc[-1]+".xlsx"
    # os.path.join keeps the paths valid on every platform
    # (a hard-coded "\\" separator only works on Windows)
    uploaded_file_path = os.path.join(insertedFolder, transformedDataFile)
    transformedFile.to_excel(uploaded_file_path, index= False)

    # Strip the extension from the file NAME only (not from the whole path,
    # where a dot in the folder name would be mistaken for the extension)
    # and re-attach '.TXT'.
    backup_name = os.path.splitext(timestr + file)[0] + '.TXT'
    backup_file_path = os.path.join(insertedFolder, backup_name)

    os.rename(file_path, backup_file_path)

    print("---------Renaming and Relocating ended----------")