## OLTP Data Preprocessing

In [1]:
import os

import pandas as pd
import cx_Oracle

Defining a function that connects to the Oracle DB (SQL Developer) and loads the requested tables into data frames

In [2]:
def extract_oltp(table_names, conn_str=None):
    """Load tables of the "SSP_RES" Oracle schema into notebook-level data frames.

    Every table in ``table_names`` is read in full with ``select *``, its
    column names are lower-cased, and the resulting frame is published as the
    global variable ``df_<table_name>``.

    Parameters
    ----------
    table_names : iterable of str
        Table names inside the "SSP_RES" schema. They are interpolated into
        the SQL text verbatim (identifiers cannot be passed as bind
        variables), so only pass trusted names.
    conn_str : str, optional
        cx_Oracle connect string in the form
        ``{username}/{password}@{host}:{port}/{service}``. When omitted, the
        ``OLTP_CONN_STR`` environment variable is used, falling back to the
        local development default.

    Returns
    -------
    None
        The frames are delivered through ``globals()``; nothing is returned.

    Notes
    -----
    Errors are reported with ``print`` instead of being raised, so tables
    listed after a failing one are not loaded.
    """
    if conn_str is None:
        # {username}/{password}
        conn_str = os.environ.get('OLTP_CONN_STR', 'system/oracle@localhost:1521/xe')

    # Both handles start as None so the cleanup below never touches an
    # unbound name when connect() or cursor() fails.
    conn = None
    cur = None

    try:
        conn = cx_Oracle.connect(conn_str)
        cur = conn.cursor()

        for table_name in table_names:

            sqlTxt = f'select * from "SSP_RES".{table_name}'

            # execute query and fetch results
            cur.execute(sqlTxt)
            record = cur.fetchall()

            # cur.description holds one entry per column; item 0 is its name
            df = pd.DataFrame.from_records(record, columns=[x[0] for x in cur.description])

            # lower case column names because they are all upper case
            df = df.rename(str.lower, axis='columns')

            variable = f'df_{table_name}'
            globals()[variable] = df

            print(f"{variable} succesfully loaded!")

    except Exception as err:
        print('Error while connecting to the db')
        print(err)

    finally:
        # Close the cursor first, but make sure the connection is released
        # even if the cursor was never created or fails to close.
        try:
            if cur is not None:
                cur.close()
        finally:
            if conn is not None:
                conn.close()

Defining the table names of the OLTP schema and loading their data into the corresponding data frames

In [3]:
# Source tables to pull from the OLTP schema; each one ends up in a global
# data frame named df_<table name>.
table_names = [
    "SRC_Country",
    "SRC_City",
    "SRC_Guest",
    "SRC_Building",
    "SRC_Apartment_Type",
    "SRC_Apartment",
    "SRC_Reservation",
    "SRC_Content",
    "SRC_Contains",
    "SRC_Apartment_Ocupancy",
    "SRC_Pricelist",
]

extract_oltp(table_names)

df_SRC_Country succesfully loaded!
df_SRC_City succesfully loaded!
df_SRC_Guest succesfully loaded!
df_SRC_Building succesfully loaded!
df_SRC_Apartment_Type succesfully loaded!
df_SRC_Apartment succesfully loaded!
df_SRC_Reservation succesfully loaded!
df_SRC_Content succesfully loaded!
df_SRC_Contains succesfully loaded!
df_SRC_Apartment_Ocupancy succesfully loaded!
df_SRC_Pricelist succesfully loaded!


Extracting guest origin from SRC_Guest

In [4]:
# Work on copies so the frames loaded from the database stay untouched.
df_guest, df_country = df_SRC_Guest.copy(), df_SRC_Country.copy()

# Left join keeps every guest, even one whose country_id has no match;
# first and last name are then combined into a single "guest_name" column.
df_guest_country = (
    df_guest[["guest_id", "guest_firstname", "guest_lastname", "country_id"]]
    .merge(df_country[["country_id", "country_name"]], on="country_id", how="left")
    .assign(guest_name=lambda g: g["guest_firstname"] + " " + g["guest_lastname"])
    .drop(columns=["country_id", "guest_firstname", "guest_lastname"])
)

df_guest_country.head(3)

Unnamed: 0,guest_id,country_name,guest_name
0,1,Serbia,Alexa Milenovic
1,2,France,Mike Johnson
2,3,United States,Emily Davis


Extracting Number of Beds for each apartment

In [5]:
# Work on copies so the frames loaded from the database stay untouched.
df_content, df_contains, df_apartment = df_SRC_Content.copy(), df_SRC_Contains.copy(), df_SRC_Apartment.copy()

# Only the content entries named "beds" are of interest here.
df_content_beds = df_content[df_content["content_name"] == "beds"]

# Inner join: keep only the "contains" rows that refer to beds.
df_contains_content = df_contains.merge(df_content_beds, on="content_id", how="inner")

# Add the apartment name and drop the content columns that are no longer needed.
df_apartment_contains = (
    df_contains_content
    .merge(df_apartment[["apartment_id", "apartment_name"]], on="apartment_id", how="inner")
    .drop(columns=["content_id", "content_name"])
)

df_apartment_contains.head(3)

Unnamed: 0,apartment_id,content_quantity,apartment_name
0,1,4,One-Bedroom Apartment with Balcony and Sea View
1,4,4,One-Bedroom Apartment with Patio and Sea View
2,2,2,Studio with Patio


Calculating the additional attributes needed so that the data is consistent with the Airbnb and Booking tables

In [6]:
df_reservation = df_SRC_Reservation.copy()

# Convert the date columns used in the arithmetic below to datetimes.
for date_column in ("start_date", "end_date", "booked"):
    df_reservation[date_column] = pd.to_datetime(df_reservation[date_column])

# These two source columns are not carried into the result.
df_reservation = df_reservation.drop(columns=["reserviation_id", "payment_method"])

# Derived columns are added in order; later entries read the ones before them.
df_reservation = df_reservation.assign(**{
    # length of stay and booking lead time, in whole days
    "# of nights": lambda r: (r["end_date"] - r["start_date"]).dt.days,
    "# of days pre booked": lambda r: (r["start_date"] - r["booked"]).dt.days,
    # days between cancellation and planned start; NaT when the
    # reservation has no cancellation date
    "# of days cancel": lambda r: r.apply(
        lambda row: pd.NaT
        if pd.isnull(row["cancellation_date"])
        else (row["start_date"] - row["cancellation_date"]).days,
        axis=1,
    ),
    # tax is computed as number of guests times number of nights
    "Tax": lambda r: r["number_of_guests"] * r["# of nights"],
    "Earnings after Tax": lambda r: r["price"] - r["Tax"],
    # constant label identifying these rows as coming from the OLTP source
    "advertiser": "OLTP",
})

Combining all tables into one

In [7]:
# Left joins keep every reservation, even without a matching guest or
# bed-count row.
df_reservation_guest = df_reservation.merge(df_guest_country, on="guest_id", how="left")
df_merged = df_reservation_guest.merge(df_apartment_contains, on="apartment_id", how="left")

df_merged.head(3)

Unnamed: 0,booked,start_date,end_date,cancellation_date,status,price,number_of_guests,advertiser,guest_id,apartment_id,# of nights,# of days pre booked,# of days cancel,Tax,Earnings after Tax,country_name,guest_name,content_quantity,apartment_name
0,2022-04-01,2022-04-05,2022-04-10,NaT,Confirmed,500.0,2,OLTP,2,1,5,4,NaT,10,490.0,France,Mike Johnson,4,One-Bedroom Apartment with Balcony and Sea View
1,2022-05-20,2022-06-01,2022-06-08,NaT,Confirmed,750.0,3,OLTP,3,1,7,12,NaT,21,729.0,United States,Emily Davis,4,One-Bedroom Apartment with Balcony and Sea View
2,2022-01-05,2022-01-20,2022-01-25,NaT,Confirmed,400.0,1,OLTP,4,1,5,15,NaT,5,395.0,Italy,Daniel Lee,4,One-Bedroom Apartment with Balcony and Sea View


Renaming and rearranging columns

In [8]:
# Map the source column names to the labels used in the exported file.
df_merged = df_merged.rename(columns={
    "status": "Status",
    "guest_name": "Guest name",
    "country_name": "Origin",
    "number_of_guests": "# of guests",
    "booked": "Booked",
    "start_date": "Start date",
    "end_date": "End date",
    "apartment_name": "Listing name",
    "content_quantity": "# of beds",
    "price": "Earnings",
    "advertiser": "Advertiser",
    "cancellation_date": "Cancel date",
})

# Keep only the columns needed for the export, in their final order.
df = df_merged.loc[:, [
    "Status", "Guest name", "Origin", "# of guests",
    "Booked", "Start date", "End date", "# of nights", "# of days pre booked",
    "Listing name", "# of beds",
    "Earnings", "Tax", "Earnings after Tax",
    "Advertiser", "Cancel date", "# of days cancel",
]]

Exporting the data frame to an Excel file

In [9]:
exportPath = "./preprocessed files/"

# to_excel does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(exportPath, exist_ok=True)

df.to_excel(exportPath + "Final_Oltp.xlsx")
df.tail(3)

Unnamed: 0,Status,Guest name,Origin,# of guests,Booked,Start date,End date,# of nights,# of days pre booked,Listing name,# of beds,Earnings,Tax,Earnings after Tax,Advertiser,Cancel date,# of days cancel
19,Cancelled,Emily Davis,United States,1,2022-01-01,2022-06-10,2022-06-12,2,160,One-Bedroom Apartment with Patio and Sea View,4,140.0,2,138.0,OLTP,2022-06-07,3
20,Cancelled,Emily Davis,United States,1,2022-01-01,2022-06-10,2022-06-12,2,160,Studio with Patio,2,210.0,2,208.0,OLTP,2022-06-07,3
21,Cancelled,Emily Davis,United States,1,2022-01-01,2022-06-10,2022-06-12,2,160,Studio with Patio,2,430.0,2,428.0,OLTP,2022-06-07,3
