## 📌 Extract & Transform: Fetching and Processing Booking Data

This block of code connects to a PostgreSQL database, extracts facility booking data, and processes it using Pandas. 

It calculates the total booking duration for each facility by converting slots to minutes and grouping the data accordingly. 

The transformed data is then loaded into a table in the analytical PostgreSQL database.

In [None]:
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()

# Connection settings for the source database, read from the process
# environment. A variable that is not set yields None for that entry.
conn_params = dict(
    host=os.getenv('DB_HOST'),
    dbname=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
)

def fetch_data(query):
    """Run *query* against the source database and return its result set.

    Connects with the module-level ``conn_params``, prints the name of the
    database it is connected to, then executes *query*.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A ``(colnames, data)`` tuple, where ``colnames`` is the list of
        result column names and ``data`` is the list of row tuples from
        ``fetchall()``. Returns ``None`` when psycopg2 raises an error; the
        error is printed rather than propagated.
    """
    conn = None
    try:
        conn = psycopg2.connect(**conn_params)
        # `with conn` only wraps a transaction (commit / rollback); it does
        # not close the connection, hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                cur.execute("SELECT current_database();")
                db_name = cur.fetchone()[0]
                print("Connected to database:", db_name)
                cur.execute(query)
                data = cur.fetchall()
                colnames = [desc[0] for desc in cur.description]
        return colnames, data
    except psycopg2.Error as e:
        print("Error fetching data from source", e)
        return None
    finally:
        if conn is not None:
            conn.close()

# One row per booking: facility id, facility name and number of slots booked.
query = 'SELECT b.facid, f.name, b.slots FROM bookings b JOIN facilities f on b.facid = f.facid'

data = fetch_data(query)
print(data)

# fetch_data() yields None after a database error. Stop here with a clear
# message instead of failing on data[1] with an opaque TypeError.
if data is None:
    raise RuntimeError("No booking data was fetched from the source database")

# data is (column names, rows); the rows are relabelled with descriptive names.
df = pd.DataFrame(data[1], columns=['facility_id', 'facility_name', 'slots_reserved_per_booking'])
# NOTE(review): treats one slot as 60 units of duration — confirm the slot
# length against the source schema.
df['total_booking_duration'] = df['slots_reserved_per_booking'] * 60

# Sum every numeric column per facility, then turn the group keys back into
# ordinary columns.
grouped_df = df.groupby(['facility_id', 'facility_name']).sum()
aggredated_grouped_df = grouped_df.reset_index()

# Keep only the columns that are loaded into the analytical table.
aggredated_grouped_df = aggredated_grouped_df[['facility_id', 'total_booking_duration']]


display(aggredated_grouped_df)

# Connection settings for the analytical (target) database, kept both as
# keyword arguments and as a libpq-style connection string.
# NOTE(review): these credentials are hard-coded, while the source database
# settings come from environment variables — consider doing the same here.
analytical_database_conn_params = dict(
    host='localhost',
    dbname='etl_bites',
    user='olikelly',
    password='i_am_a_password',
)

conn_string = (
    "dbname=etl_bites user=olikelly "
    "password=i_am_a_password host=localhost port='5432'"
)


def execute_query_postgresql(conn_string, query):
    """Execute a single SQL statement and commit it.

    Args:
        conn_string: libpq connection string passed to ``psycopg2.connect``.
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        None. A psycopg2 error is printed rather than propagated.
    """
    conn = None
    try:
        # Use the qualified name: this cell imports the psycopg2 package only,
        # so a bare `connect` is not defined here.
        conn = psycopg2.connect(conn_string)
        # Leaving the `with conn` block commits on success and rolls back on
        # error; it does not close the connection.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
    except psycopg2.Error as e:
        print("Error executing Postgres query:", e)
    finally:
        if conn is not None:
            conn.close()


# DDL for the target table. The table is dropped first, so every run starts
# from an empty table.
create_booking_duration_data_table = '''
DROP TABLE IF EXISTS total_booking_duration;
CREATE TABLE IF NOT EXISTS total_booking_duration (
facility_id INTEGER NOT NULL,
total_booking_duration INTEGER NOT NULL)
'''

execute_query_postgresql(
    conn_string=conn_string,
    query=create_booking_duration_data_table,
)

def insert_data(parameters, table_name, data, columns):
    """Insert every row of a DataFrame into a table, in one transaction.

    Args:
        parameters: Keyword arguments for ``psycopg2.connect``.
        table_name: Name of the target table. It is interpolated directly
            into the SQL text, so it must come from trusted code.
        data: Object providing ``itertuples(index=False)`` (a pandas
            DataFrame in this notebook); each tuple supplies one row's values.
        columns: Target column names, in the same order as the row values.
            Also interpolated directly into the SQL text.

    Returns:
        None. A psycopg2 error is printed rather than propagated.
    """
    # The statement text is identical for every row, so build it once.
    column_list = ', '.join(columns)
    placeholders = ', '.join(['%s'] * len(columns))
    insert_query = f"INSERT INTO {table_name} ({column_list}) VALUES ({placeholders});"

    conn = None
    try:
        conn = psycopg2.connect(**parameters)
        # Leaving the `with conn` block commits on success and rolls back on
        # error; it does not close the connection.
        with conn:
            with conn.cursor() as cur:
                for row in data.itertuples(index=False):
                    cur.execute(insert_query, row)
                    print("row inserted", row)
    except psycopg2.Error as e:
        print("Error inserting data into DB", e)
    finally:
        if conn is not None:
            conn.close()


# Load the per-facility totals into the analytical table.
insert_data(
    parameters=analytical_database_conn_params,
    table_name='total_booking_duration',
    data=aggredated_grouped_df,
    columns=['facility_id', 'total_booking_duration'],
)

# Six members with the highest number of bookings, most bookings first.
most_frequent_bookers = '''
SELECT m.firstname, m.surname, b.memid, COUNT(b.memid) AS total_bookings 
FROM members m 
JOIN bookings b 
ON b.memid = m.memid
GROUP BY m.firstname, m.surname, b.memid
ORDER BY total_bookings DESC
LIMIT 6;
'''

# Left as the last expression so the notebook displays the returned value.
fetch_data(most_frequent_bookers)

# RESULT:
# (['firstname', 'surname', 'memid', 'total_bookings'],
#  [('GUEST', 'GUEST', 0, 883),
#   ('Tim', 'Rownam', 3, 408),
#   ('Darren', 'Smith', 1, 261),
#   ('Tracy', 'Smith', 2, 210),
#   ('Tim', 'Boothe', 8, 188),
#   ('Burton', 'Tracy', 6, 176)])





Connected to database: postgres
(['facid', 'name', 'slots'], [(3, 'Table Tennis', 2), (4, 'Massage Room 1', 2), (6, 'Squash Court', 2), (7, 'Snooker Table', 2), (8, 'Pool Table', 1), (8, 'Pool Table', 1), (0, 'Tennis Court 1', 3), (0, 'Tennis Court 1', 3), (4, 'Massage Room 1', 2), (4, 'Massage Room 1', 2), (4, 'Massage Room 1', 2), (6, 'Squash Court', 2), (6, 'Squash Court', 2), (6, 'Squash Court', 2), (7, 'Snooker Table', 2), (8, 'Pool Table', 1), (8, 'Pool Table', 1), (1, 'Tennis Court 2', 3), (2, 'Badminton Court', 3), (3, 'Table Tennis', 2), (3, 'Table Tennis', 2), (4, 'Massage Room 1', 2), (6, 'Squash Court', 2), (6, 'Squash Court', 2), (7, 'Snooker Table', 2), (8, 'Pool Table', 1), (0, 'Tennis Court 1', 3), (0, 'Tennis Court 1', 3), (0, 'Tennis Court 1', 3), (2, 'Badminton Court', 3), (3, 'Table Tennis', 2), (4, 'Massage Room 1', 2), (6, 'Squash Court', 2), (7, 'Snooker Table', 2), (7, 'Snooker Table', 2), (8, 'Pool Table', 1), (0, 'Tennis Court 1', 3), (0, 'Tennis Court 1', 3),

Unnamed: 0,facility_id,total_booking_duration
0,0,79200
1,1,76680
2,2,72540
3,3,49800
4,4,84240
5,5,13680
6,6,66240
7,7,54480
8,8,54660


row inserted Pandas(facility_id=0, total_booking_duration=79200)
row inserted Pandas(facility_id=1, total_booking_duration=76680)
row inserted Pandas(facility_id=2, total_booking_duration=72540)
row inserted Pandas(facility_id=3, total_booking_duration=49800)
row inserted Pandas(facility_id=4, total_booking_duration=84240)
row inserted Pandas(facility_id=5, total_booking_duration=13680)
row inserted Pandas(facility_id=6, total_booking_duration=66240)
row inserted Pandas(facility_id=7, total_booking_duration=54480)
row inserted Pandas(facility_id=8, total_booking_duration=54660)
Connected to database: postgres


(['firstname', 'surname', 'memid', 'total_bookings'],
 [('GUEST', 'GUEST', 0, 883),
  ('Tim', 'Rownam', 3, 408),
  ('Darren', 'Smith', 1, 261),
  ('Tracy', 'Smith', 2, 210),
  ('Tim', 'Boothe', 8, 188),
  ('Burton', 'Tracy', 6, 176)])

## 📌 Extract & Transform: Fetching and Processing API Data  

This block of code retrieves post and user data from an API, merges them by user ID, and adds author names to posts. 

The transformed data is then loaded into a PostgreSQL table for further analysis.


In [202]:
import requests
from psycopg2 import connect, sql
from dotenv import load_dotenv
import os


load_dotenv()


# libpq-style connection string for the analytical (target) database.
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables instead.
conn_string = (
    "dbname='etl_bites' user='olikelly' "
    "password='i_am_a_password' host='localhost' port='5432'"
)

def get_data_from_api(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        The error is printed rather than propagated.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    # ValueError covers requests versions whose JSON decoding error is not a
    # RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print("Error extracting data from API", e)
        return None
        
  
# Posts: endpoint address comes from the environment.
posts_url = os.getenv('POSTS_URL')
posts_data = get_data_from_api(posts_url)
if posts_data:
    print("posts_data successfully received from API")

# Users: same pattern as for posts.
users_url = os.getenv('USERS_URL')
users_data = get_data_from_api(users_url)
if users_data:
    print("users_data successfully received from API")
    

def join_posts_and_users(posts, users):
    """Add an ``'author'`` entry to each post whose user is known.

    Args:
        posts: List of dicts, each with a ``'userId'`` key. The dicts are
            modified in place. ``None`` is treated as an empty list.
        users: List of dicts, each with ``'id'`` and ``'name'`` keys.
            ``None`` is treated as an empty list.

    Returns:
        The same ``posts`` list (an empty list when ``posts`` is ``None``).
        A post whose ``'userId'`` matches no user is left without an
        ``'author'`` key.
    """
    if posts is None:
        return []

    # One pass over users builds an id -> name index, so each post needs a
    # single lookup instead of a scan over all users. If an id occurs more
    # than once, the last occurrence wins.
    name_by_user_id = {user['id']: user['name'] for user in (users or [])}

    for post in posts:
        user_id = post['userId']
        if user_id in name_by_user_id:
            post['author'] = name_by_user_id[user_id]
    return posts

# Posts enriched with the name of the user who wrote them.
combined_data = join_posts_and_users(posts=posts_data, users=users_data)


  # Create tables in analytical DB
  # This could also be done manually via a GUI (e.g. TablePlus) or with a SQL script
def execute_query_postgresql(conn_string, query):
    """Execute a single SQL statement and commit it.

    Args:
        conn_string: libpq connection string passed to ``connect``.
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        None. A psycopg2 error is printed rather than propagated.
    """
    # Imported locally so the except clause below does not depend on an
    # earlier cell having bound the bare `psycopg2` name; this cell imports
    # only `connect` and `sql` from the package.
    import psycopg2

    conn = None
    try:
        conn = connect(conn_string)
        # Leaving the `with conn` block commits on success and rolls back on
        # error; it does not close the connection.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
    except psycopg2.Error as e:
        print("Error executing DB query", e)
    finally:
        if conn is not None:
            conn.close()

# DDL for the table that receives the joined post/user records. post_id is
# UNIQUE, which the load step's ON CONFLICT(post_id) clause relies on.
create_api_data_table = '''
  CREATE TABLE IF NOT EXISTS api_data
    (
      post_id INTEGER NOT NULL UNIQUE,
      title TEXT NOT NULL,
      body TEXT NOT NULL,
      user_id INTEGER NOT NULL,
      author TEXT NOT NULL
  )
  ;
  '''

# Drop any previous copy first so each run starts from an empty table.
execute_query_postgresql(conn_string=conn_string, query='DROP TABLE IF EXISTS api_data;')
execute_query_postgresql(conn_string=conn_string, query=create_api_data_table)

def insert_data_to_postgresql(conn_string, table_name, data):
    """Insert joined post records into *table_name*, in one transaction.

    Rows whose ``post_id`` already exists in the table are skipped
    (``ON CONFLICT(post_id) DO NOTHING``).

    Args:
        conn_string: libpq connection string passed to ``connect``.
        table_name: Target table; quoted safely via ``sql.Identifier``.
        data: Iterable of dicts with the keys ``'id'``, ``'title'``,
            ``'body'``, ``'userId'`` and ``'author'``. A missing key raises
            ``KeyError``, which is not handled here.

    Returns:
        None. A psycopg2 error is printed rather than propagated.
    """
    # Imported locally so the except clause below does not depend on an
    # earlier cell having bound the bare `psycopg2` name; this cell imports
    # only `connect` and `sql` from the package.
    import psycopg2

    # The statement is identical for every row, so compose it once.
    query = sql.SQL("INSERT INTO {} (post_id, title, body, user_id, author) VALUES (%s, %s, %s, %s, %s) ON CONFLICT(post_id) DO NOTHING").format(sql.Identifier(table_name))

    conn = None
    try:
        conn = connect(conn_string)
        # Leaving the `with conn` block commits on success and rolls back on
        # error; it does not close the connection.
        with conn:
            with conn.cursor() as cur:
                for item in data:
                    cur.execute(query, (item['id'], item['title'], item['body'], item['userId'], item['author']))
    except psycopg2.Error as e:
        print("Error loading data into DB", e)
    finally:
        if conn is not None:
            conn.close()

# Load the joined post/user records into the analytical table.
table_name = "api_data"
insert_data_to_postgresql(
    conn_string=conn_string,
    table_name=table_name,
    data=combined_data,
)






posts_data successfully received from API
users_data successfully received from API
