# Lab 3 Data Management

## Libraries

In [19]:
# Pandas tools
import pandas as pd
import pandas as pd
import pandas.io.sql as sqlio
import io
from pandas import DataFrame

# Database-related tools
import duckdb
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import create_engine, text, inspect
from sqlalchemy.orm import sessionmaker

# Other utilities
import os

## Connection to the databases (including the data warehouse) and the .csv

Connect to, or extract data from, the provided data sources.

In [20]:
# (Re)open the connections to the AIMS and AMOS source databases.
# NOTE(review): the credentials are hard-coded in this cell — consider reading them from the environment.
_SOURCE_DB_ACCESS = dict(user='bse_airlines', host='dtim.essi.upc.edu', password='BSE2024!')

# When the cell is re-run, the connection left over from the previous run is closed first
if 'aims' in globals():
    aims.close()
aims = psycopg2.connect(database='aims', options='-c search_path=bda-aims', **_SOURCE_DB_ACCESS)

if 'amos' in globals():
    amos.close()
amos = psycopg2.connect(database='amos', options='-c search_path=bda-amos', **_SOURCE_DB_ACCESS)

# Aircraft/manufacturer lookup: the .csv is read by relative path, so it must sit in the
# same working directory as this notebook
dfAircrafts = pd.read_csv(filepath_or_buffer='aircraft-manufaturerinfo-lookup.csv')

Creation of the data warehouse (ROLAP) using the SQL scripts in the `tables_dw` folder. Note that the code below first drops any existing database with the configured name and then creates a new, empty one for the specified user.

In [21]:
# Database connection parameters.
# The user, password, host and port can be overridden through environment variables,
# so they can be adjusted without editing credentials into the notebook; when a
# variable is not set, the value used so far is kept as the default.
DB_NAME = "lab3_dw_delgado_fernandez"
DB_USER = os.environ.get("DW_DB_USER", "postgres")    # Adjust as necessary
DB_PASSWORD = os.environ.get("DW_DB_PASSWORD", "datamanagement")    # Adjust as necessary
DB_HOST = os.environ.get("DW_DB_HOST", "localhost")    # Adjust as necessary
DB_PORT = os.environ.get("DW_DB_PORT", "5432")    # Adjust as necessary

# PostgreSQL default connection (to 'postgres' database)
DEFAULT_DB_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/postgres"

# Function to drop and recreate the database
def recreate_database():
    """Drop the database named ``DB_NAME`` (if it exists) and create it again, empty.

    Connects to the default ``postgres`` database using the module-level
    ``DB_USER``, ``DB_PASSWORD``, ``DB_HOST`` and ``DB_PORT`` settings, terminates
    the sessions attached to ``DB_NAME``, then runs ``DROP DATABASE IF EXISTS``
    followed by ``CREATE DATABASE``. Progress messages are printed.

    Any exception is reported with ``print`` and is not re-raised. The cursor and
    the connection are released on the error path as well as on success.
    """
    connection = None
    try:
        # Connect to the default 'postgres' database
        connection = psycopg2.connect(
            dbname="postgres",
            user=DB_USER,
            password=DB_PASSWORD,
            host=DB_HOST,
            port=DB_PORT
        )
        # DROP/CREATE DATABASE cannot run inside a transaction block, hence autocommit
        connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

        # The context manager closes the cursor even when a statement fails
        with connection.cursor() as cursor:
            # Disconnect the sessions still using the database, then drop it if it exists
            cursor.execute(f"SELECT pg_terminate_backend(pg_stat_activity.pid) "
                           f"FROM pg_stat_activity WHERE pg_stat_activity.datname = '{DB_NAME}';")
            cursor.execute(f"DROP DATABASE IF EXISTS {DB_NAME};")
            print(f"Database '{DB_NAME}' dropped successfully (if it existed).")

            # Create the new database
            cursor.execute(f"CREATE DATABASE {DB_NAME};")
            print(f"Database '{DB_NAME}' created successfully.")
    except Exception as e:
        print(f"Error recreating database: {e}")
    finally:
        # In psycopg2, `with connection:` only ends the transaction; the connection
        # itself has to be closed explicitly.
        if connection is not None:
            connection.close()

# Function to execute an SQL file
def execute_sql_file(file_path, engine):
    """Run the full content of one SQL script through `engine` and commit it.

    Args:
        file_path: path of the SQL file to read.
        engine: object whose ``connect()`` returns a connection usable as a
            context manager and offering ``execute`` and ``commit``.

    A failure while executing or committing is printed, not raised. Errors
    raised while reading the file or opening the connection are not caught.
    """
    # Load the script text up front
    with open(file_path, 'r') as script_file:
        script = script_file.read()

    with engine.connect() as db:
        try:
            db.execute(text(script))
            db.commit()  # Make the script's changes permanent
            print(f"Executed SQL file: {file_path}")
        except Exception as err:
            print(f"Error executing {file_path}: {err}")

# Function to drop all tables in the database
def drop_all_tables(engine):
    """Drop every table reported by the inspector for `engine`, in one transaction.

    Args:
        engine: SQLAlchemy engine connected to the database to clean.

    Each table is removed with ``DROP TABLE IF EXISTS ... CASCADE``. If any
    statement fails, the transaction is rolled back and the error is printed
    rather than raised.
    """
    inspector = inspect(engine)
    # Quote names through the dialect: an unquoted mixed-case, reserved-word or
    # space-containing name would not match the real table and, because of
    # IF EXISTS, would be skipped silently.
    quote = engine.dialect.identifier_preparer.quote
    with engine.connect() as connection:
        transaction = connection.begin()
        try:
            for table_name in inspector.get_table_names():
                connection.execute(text(f"DROP TABLE IF EXISTS {quote(table_name)} CASCADE"))
            transaction.commit()
            print("Dropped all existing tables.")
        except Exception as e:
            transaction.rollback()
            print(f"Error dropping tables: {e}")

# Function to create tables
def create_tables(engine, directory='tables_dw'):
    """Execute every ``.sql`` script found in `directory`, in sorted path order.

    Args:
        engine: passed through unchanged to ``execute_sql_file`` for each script.
        directory: folder containing the SQL files. Defaults to ``'tables_dw'``,
            the folder used so far, so existing calls behave as before.

    Files that do not end in ``.sql`` are ignored. Because the paths are sorted,
    numeric prefixes in the file names determine the execution order.
    """
    sql_paths = sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith('.sql')
    )

    for file_path in sql_paths:
        execute_sql_file(file_path, engine)

# Driver for the warehouse set-up: recreate the database, connect to it, then build its tables.
# The saved output of this cell shows that the guard below is true when the cell is run.
if __name__ == "__main__":
    # Step 1: Recreate the database (i.e., drop preexisting DB with the same name 
    # and create a new one)
    recreate_database()

    # Step 2: Connect to the newly created database
    # DATABASE_URL and engine are assigned at module level, so they stay available afterwards.
    DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
    engine = create_engine(DATABASE_URL)

    # Step 3: Drop all tables and create new ones
    # recreate_database() prints its errors instead of raising, so this step runs
    # even when step 1 failed.
    drop_all_tables(engine)
    create_tables(engine)

Database 'lab3_dw_delgado_fernandez' dropped successfully (if it existed).
Database 'lab3_dw_delgado_fernandez' created successfully.
Dropped all existing tables.
Executed SQL file: tables_dw\01_Route.sql
Executed SQL file: tables_dw\02_Aircraft.sql
Executed SQL file: tables_dw\03_Flights.sql
Executed SQL file: tables_dw\04_Scheduled_routes.sql
Executed SQL file: tables_dw\05_Flight_Issues.sql
Executed SQL file: tables_dw\06_Maintenance_time.sql
Executed SQL file: tables_dw\07_ADOS.sql


Once the database has been created, you can connect to it from DBeaver: create a new PostgreSQL connection and fill in the *name*, *user*, *password*, *host* and *port* fields with the values configured above.

In [22]:
# In-memory DuckDB connection used as the output database for the load step.
# NOTE(review): this is a separate DuckDB database, not the PostgreSQL warehouse created
# above; with ':memory:' nothing is persisted to disk — confirm this is intended.
outputDB = duckdb.connect(database=':memory:')

## Extracting the AIMS data

In [23]:
# SQL query: every row of the AIMS Flights table
query_flights_aims = "SELECT * FROM Flights;"

# Run the query inside a context manager so the cursor is closed even if
# executing the query or fetching the rows raises
with aims.cursor() as cursor:
    cursor.execute(query_flights_aims)

    # Fetch all rows from the result
    flights_aims = cursor.fetchall()

    # Column names are the first item of each cursor.description entry
    column_names = [desc[0] for desc in cursor.description]

# Convert the data to a DataFrame
df_flights = pd.DataFrame(flights_aims, columns=column_names)

In [24]:
# Display the extracted flights (the rendered output abbreviates the middle rows)
df_flights

Unnamed: 0,id,aircraftregistration,scheduleddeparture,scheduledarrival,kind,flightid,departureairport,arrivalairport,actualdeparture,actualarrival,cancelled,delaycode,passengers,cabincrew,flightcrew
0,1,XY-RJL,2023-08-03 11:03:03.875940,2023-08-03 14:03:03.875940,Flight,230803-NRN-JMK-9129-XY-RJL,NRN,JMK,NaT,NaT,True,,98,3,2
1,2,XY-OZE,2023-07-26 14:50:15.569812,2023-07-26 15:50:15.569812,Flight,230726-HAU-SAW-9867-XY-OZE,HAU,SAW,NaT,NaT,True,,107,4,3
2,3,XY-SJZ,2023-11-20 09:01:04.308766,2023-11-20 13:01:04.308766,Flight,231120-HER-VAA-6975-XY-SJZ,HER,VAA,NaT,NaT,True,,137,3,2
3,4,XY-OXK,2023-06-13 09:55:02.480695,2023-06-13 11:55:02.480695,Flight,230613-VNO-EGC-9468-XY-OXK,VNO,EGC,NaT,NaT,True,,102,4,2
4,5,XY-DGU,2023-03-12 00:41:45.198279,2023-03-12 03:41:45.198279,Flight,230312-BLL-BVA-9815-XY-DGU,BLL,BVA,NaT,NaT,True,,174,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151828,151829,XY-ZZE,2023-12-26 15:17:52.027524,2023-12-26 16:17:52.027524,Flight,231226-NUE-MAH-9264-XY-ZZE,NUE,MAH,2023-12-26 15:26:52.027524,2023-12-26 16:26:52.027524,False,,96,4,3
151829,151830,XY-ZZE,2023-12-27 00:01:09.300599,2023-12-27 05:01:09.300599,Flight,231227-PAD-BIQ-9447-XY-ZZE,PAD,BIQ,2023-12-27 00:05:09.300599,2023-12-27 05:05:09.300599,False,,112,4,3
151830,151831,XY-ZZE,2023-12-27 18:55:33.495694,2023-12-27 19:55:33.495694,Flight,231227-SUF-MAD-1652-XY-ZZE,SUF,MAD,2023-12-27 19:02:33.495694,2023-12-27 20:02:33.495694,False,,175,4,2
151831,151832,XY-ZZE,2023-12-28 00:09:38.859140,2023-12-28 04:09:38.859140,Flight,231228-AAL-INN-7485-XY-ZZE,AAL,INN,2023-12-28 00:22:38.859140,2023-12-28 04:22:38.859140,False,,128,3,3


In [35]:
# SQL query: every row of the AIMS Slots table
query_slots_aims = "SELECT * FROM Slots;"

# Run the query inside a context manager so the cursor is closed even if
# executing the query or fetching the rows raises
with aims.cursor() as cursor:
    cursor.execute(query_slots_aims)

    # Fetch all rows from the result
    slots_aims = cursor.fetchall()

    # Column names are the first item of each cursor.description entry
    column_names = [desc[0] for desc in cursor.description]

# Convert the data to a DataFrame
df_slots_aims = pd.DataFrame(slots_aims, columns=column_names)

In [36]:
# Save the raw Slots extract as CSV, without the DataFrame index.
# NOTE(review): the file name contains a double underscore ("df__slots_aims.csv"),
# unlike the other exports in this notebook — confirm it is intended.
df_slots_aims.to_csv("df__slots_aims.csv",index=False)

## Transforming the AIMS data

### For Flights (F) table

In [25]:
#R1 requirements calculation

# Copy of the raw extract with both actual timestamps parsed as datetimes;
# values that cannot be parsed become NaT
df = df_flights.assign(
    actualdeparture=pd.to_datetime(df_flights['actualdeparture'], errors='coerce'),
    actualarrival=pd.to_datetime(df_flights['actualarrival'], errors='coerce'),
)

# Keep the flights that were not cancelled and have both actual timestamps, then add:
#   - flight_duration_hours: actual arrival minus actual departure, in hours
#   - scheduleddeparture_date: calendar date of the scheduled departure
df_valid_flights = (
    df.loc[df['cancelled'] == False]
    .dropna(subset=['actualdeparture', 'actualarrival'])
    .assign(
        flight_duration_hours=lambda flights: (
            (flights['actualarrival'] - flights['actualdeparture']).dt.total_seconds() / 3600
        ),
        scheduleddeparture_date=lambda flights: pd.to_datetime(
            flights['scheduleddeparture'], errors='coerce'
        ).dt.date,
    )
)

# One row per aircraft and scheduled-departure date: summed flight hours and
# number of valid flights (cycles); the date column is exposed as 'date'
total_flight_summary = (
    df_valid_flights
    .groupby(['aircraftregistration', 'scheduleddeparture_date'])
    .agg(
        total_flight_hours=pd.NamedAgg(column='flight_duration_hours', aggfunc='sum'),
        total_flight_cycles=pd.NamedAgg(column='flight_duration_hours', aggfunc='count'),
    )
    .reset_index()
    .rename(columns={'scheduleddeparture_date': 'date'})
)

# Display the result
print(total_flight_summary)


      aircraftregistration        date  total_flight_hours  \
0                   XY-AAB  2023-01-01                 8.0   
1                   XY-AAB  2023-01-03                 0.0   
2                   XY-AAB  2023-01-04                 1.0   
3                   XY-AAB  2023-01-06                 9.0   
4                   XY-AAB  2023-01-08                 4.0   
...                    ...         ...                 ...   
82355               XY-ZZE  2023-12-24                 4.0   
82356               XY-ZZE  2023-12-25                 6.0   
82357               XY-ZZE  2023-12-26                 2.0   
82358               XY-ZZE  2023-12-27                 6.0   
82359               XY-ZZE  2023-12-28                 5.0   

       total_flight_cycles  
0                        2  
1                        1  
2                        1  
3                        2  
4                        1  
...                    ...  
82355                    2  
82356                  

In [26]:
# Flights (F) data: export the per-aircraft, per-date flight hours and cycles
# computed above as CSV, without the DataFrame index
total_flight_summary.to_csv('total_flight_summary.csv',index=False)

### For Flight Issues (F)

In [27]:
# Flight Issues: monthly delay rate and cancellation rate per aircraft
df = df_flights.copy()

# Parse 'scheduleddeparture' and derive its year-month period.
# No explicit format is given: the values are full timestamps (date and time), and a
# time-only format combined with errors='coerce' would turn string input into NaT,
# so every row would be dropped by the dropna below.
df['scheduleddeparture'] = pd.to_datetime(df['scheduleddeparture'], errors='coerce')
df['year_month'] = df['scheduleddeparture'].dt.to_period('M')

# Handle NaN or invalid 'scheduleddeparture' by removing rows without valid dates
df = df.dropna(subset=['year_month'])

# Grouping the data by 'year_month' and 'aircraftregistration'
summary = df.groupby(['year_month', 'aircraftregistration']).agg(
    total_flights=('id', 'count'),
    cancelled_flights=('cancelled', lambda x: (x == True).sum()),
    non_cancelled_flights=('cancelled', lambda x: (x == False).sum()),
    # A flight counts as delayed when it carries a delay code
    delayed_flights=('delaycode', lambda x: x.notna().sum())
).reset_index()

# Calculating delay_rate (delayed / non-cancelled) and cancellation_rate (cancelled / total)
summary['delay_rate'] = summary['delayed_flights'] / summary['non_cancelled_flights']
summary['cancellation_rate'] = summary['cancelled_flights'] / summary['total_flights']

# Final DataFrame: flight_issues, with the period column exposed as 'date'
flight_issues = summary[['year_month', 'aircraftregistration', 'delay_rate', 'cancellation_rate']]
flight_issues = flight_issues.rename(columns={'year_month': 'date'})

# Display the result
print(flight_issues)

         date aircraftregistration  delay_rate  cancellation_rate
0     2023-01               XY-AAB    0.045455           0.022222
1     2023-01               XY-ACY    0.071429           0.142857
2     2023-01               XY-AFD    0.027027           0.026316
3     2023-01               XY-AGF    0.073171           0.046512
4     2023-01               XY-ALX    0.127660           0.078431
...       ...                  ...         ...                ...
3391  2023-12               XY-ZKT    0.160714           0.081967
3392  2023-12               XY-ZPA    0.142857           0.023256
3393  2023-12               XY-ZUS    0.085714           0.027778
3394  2023-12               XY-ZWY    0.142857           0.092593
3395  2023-12               XY-ZZE    0.048780           0.023810

[3396 rows x 4 columns]


In [28]:
# Export the monthly per-aircraft delay and cancellation rates as CSV, without the DataFrame index
flight_issues.to_csv("flight_issues.csv",index=False)

## Extracting the AMOS data

In [29]:
# Maintenance events: load the whole AMOS MaintenanceEvents table into a DataFrame.
# NOTE(review): the saved output echoes the read_sql_query line, which looks like a pandas
# warning about being given a non-SQLAlchemy connection — confirm.
Q2 = "SELECT * FROM MaintenanceEvents"
df_maintenance_amos = sqlio.read_sql_query(sql=Q2, con=amos)

  df_maintenance_amos = sqlio.read_sql_query(Q2, amos)


In [30]:
# Save the raw MaintenanceEvents extract as CSV, without the DataFrame index
df_maintenance_amos.to_csv("df_maintenance_amos.csv",index=False)

In [31]:
# Operational interruptions: load the whole AMOS OperationInterruption table into a DataFrame.
# NOTE(review): the saved output echoes the read_sql_query line, which looks like a pandas
# warning about being given a non-SQLAlchemy connection — confirm.
Q3 = "SELECT * FROM OperationInterruption"
df_OI_amos = sqlio.read_sql_query(sql=Q3, con=amos)

  df_OI_amos = sqlio.read_sql_query(Q3, amos)


In [34]:
# Save the raw OperationInterruption extract as CSV. index=False matches the other
# exports in this notebook, so the file gets no extra unnamed index column.
df_OI_amos.to_csv("df_OI_amos.csv", index=False)

In [33]:
## Work Orders

# Load the whole AMOS WorkOrders table into a DataFrame
Q4 = "SELECT * FROM WorkOrders"
df_Work_Orders_amos = sqlio.read_sql_query(Q4, amos)

# Save the raw extract as CSV. index=False matches the other exports in this
# notebook, so the file gets no extra unnamed index column.
df_Work_Orders_amos.to_csv("df_Work_Orders_amos.csv", index=False)

  df_Work_Orders_amos = sqlio.read_sql_query(Q4, amos)


In [None]:
## Forecasted Orders

In [None]:
## Technical log book orders


## Loading data into a Database table

Lastly, we can load the resulting data frame into a database table.

More info on importing data into DuckDB from Pandas:
https://duckdb.org/docs/guides/python/import_pandas.html

In [45]:
# We first store the schema and content of our dataframe into a temp DB table.
# The query refers to `dfFiltered` by name, as in the DuckDB pandas import guide linked above.
# NOTE(review): `dfFiltered` is not defined in any cell shown here — confirm where it is
# created; otherwise this cell fails on a fresh kernel.
# NOTE(review): with IF NOT EXISTS, re-running the cell keeps an already existing `temp`
# table instead of refreshing it from the dataframe.
outputDB.execute("CREATE TABLE IF NOT EXISTS temp AS SELECT * FROM dfFiltered")


# We can then read the stored DB table into a new data frame and print its content
outputDB.execute("select * from temp").df()

Unnamed: 0,workorderid,aircraftregistration,executiondate,executionplace,workpackage,kind,aircraft_reg_code,manufacturer_serial_number,aircraft_model,aircraft_manufacturer


### Finalize work

Once you finish working with the notebook, please execute the code below to close the connection with the database.

In [46]:
# Close the psycopg2 connections to the AIMS and AMOS source databases.
# NOTE(review): the DuckDB connection `outputDB` is not closed here — confirm whether that is intended.
aims.close()
amos.close()