In [1]:
import os

import cx_Oracle
import pandas as pd

Defining a function that connects to the Oracle DB (SQL Developer) and loads the requested tables into data frames

In [2]:
def extract_oltp(table_names):
    """Load Oracle tables from the "SSP_RES" schema into global DataFrames.

    For every name in ``table_names`` the function runs
    ``select * from "SSP_RES".<name>``, builds a DataFrame whose column
    names are taken from the cursor description, and stores it in this
    module's global namespace as ``df_<name>``.

    The connection string is read from the ``ORACLE_CONN_STR`` environment
    variable when it is set; otherwise the local default is used.

    Parameters
    ----------
    table_names : iterable of str
        Names of the tables to load. Each name is interpolated into the SQL
        text verbatim, so it must come from trusted code, never from user
        input.

    Returns
    -------
    None
        Results are published as global variables. Errors are printed, not
        raised; tables loaded before the failure stay available.
    """
    # Prefer credentials from the environment; fall back to the local default.
    conStr = os.environ.get("ORACLE_CONN_STR", 'system/oracle@localhost:1521/xe')
    conn = None
    cur = None

    try:
        conn = cx_Oracle.connect(conStr)
        cur = conn.cursor()

        for table_name in table_names:
            # NOTE: table_name is not escaped -- pass trusted identifiers only.
            sqlTxt = f'select * from "SSP_RES".{table_name}'

            cur.execute(sqlTxt)
            record = cur.fetchall()

            # The first item of each description entry is the column name.
            df = pd.DataFrame.from_records(record, columns=[x[0] for x in cur.description])

            variable = f'df_{table_name}'
            globals()[variable] = df

            print(f"{variable} succesfully loaded!")

    except Exception as err:
        # Report the stage that actually failed: without a connection object
        # the failure happened while connecting, otherwise while loading.
        if conn is None:
            print('Error while connecting to the db')
        else:
            print('Error while loading data from the db')
        print(err)

    finally:
        # cur/conn are pre-initialised to None so cleanup never touches an
        # unbound name, and the connection is closed even if closing the
        # cursor fails.
        try:
            if cur is not None:
                cur.close()
        finally:
            if conn is not None:
                conn.close()

Defining the table names from the OLTP schema and loading their data into the corresponding data frames

In [3]:
# Source tables to pull from the OLTP schema; extract_oltp stores each one
# in a global data frame named df_<table name>.
table_names = [
    "SRC_Country",
    "SRC_City",
    "SRC_Guest",
    "SRC_Building",
    "SRC_Apartment_Type",
    "SRC_Apartment",
    "SRC_Reservation",
    "SRC_Content",
    "SRC_Contains",
    "SRC_Apartment_Ocupancy",
    "SRC_Pricelist",
]

extract_oltp(table_names)

df_SRC_Country succesfully loaded!
df_SRC_City succesfully loaded!
df_SRC_Guest succesfully loaded!
df_SRC_Building succesfully loaded!
df_SRC_Apartment_Type succesfully loaded!
df_SRC_Apartment succesfully loaded!
df_SRC_Reservation succesfully loaded!
df_SRC_Content succesfully loaded!
df_SRC_Contains succesfully loaded!
df_SRC_Apartment_Ocupancy succesfully loaded!
df_SRC_Pricelist succesfully loaded!


Extracting guest_origin from SRC_Guest

In [4]:
# Work on copies so the frames loaded from the database stay untouched.
df_guest = df_SRC_Guest.copy()
df_country = df_SRC_Country.copy()

# Attach the country name to every guest (left join keeps guests without a
# matching country), build one full-name column, then drop the helper columns.
df_guest_country = (
    df_guest[["GUEST_ID", "GUEST_FIRSTNAME", "GUEST_LASTNAME", "COUNTRY_ID"]]
    .merge(df_country[["COUNTRY_ID", "COUNTRY_NAME"]], how="left", on="COUNTRY_ID")
    .assign(GUEST_NAME=lambda g: g["GUEST_FIRSTNAME"] + " " + g["GUEST_LASTNAME"])
    .drop(columns=["COUNTRY_ID", "GUEST_FIRSTNAME", "GUEST_LASTNAME"])
)
df_guest_country.head()

Unnamed: 0,GUEST_ID,COUNTRY_NAME,GUEST_NAME
0,1,Serbia,Alexa Milenovic
1,2,France,Mike Johnson
2,3,United States,Emily Davis
3,4,Italy,Daniel Lee
4,5,France,Sarah Smith


Extracting the number of beds for each apartment

In [5]:
# Work on copies so the frames loaded from the database stay untouched.
df_content = df_SRC_Content.copy()
df_contains = df_SRC_Contains.copy()
df_apartment = df_SRC_Apartment.copy()

# Only the content rows named "beds" matter for the bed count.
df_content_beds = df_content[df_content["CONTENT_NAME"] == "beds"]

# Keep just the "contains" rows that refer to beds.
df_contains_content = df_contains.merge(df_content_beds, how="inner", on="CONTENT_ID")

# Add the apartment name and drop the content columns that are no longer needed.
df_apartment_contains = (
    df_contains_content
    .merge(df_apartment[["APARTMENT_ID", "APARTMENT_NAME"]], how="inner", on="APARTMENT_ID")
    .drop(columns=["CONTENT_ID", "CONTENT_NAME"])
)
df_apartment_contains.head()

Unnamed: 0,APARTMENT_ID,CONTENT_QUANTITY,APARTMENT_NAME
0,1,4,One-Bedroom Apartment with Balcony and Sea View
1,4,4,One-Bedroom Apartment with Patio and Sea View
2,2,2,Studio with Patio
3,3,2,Studio with Patio and Sea View


Calculating additional necessary attributes from data

In [6]:
# Copy the raw reservations, parse the date columns used below, and drop
# the columns that are not needed in the final report.
df_reservation = (
    df_SRC_Reservation.copy()
    .assign(
        START_DATE=lambda r: pd.to_datetime(r["START_DATE"]),
        END_DATE=lambda r: pd.to_datetime(r["END_DATE"]),
        BOOKED=lambda r: pd.to_datetime(r["BOOKED"]),
    )
    .drop(columns=["RESERVIATION_ID", "PAYMENT_METHOD"])
)

# Derived attributes, created in this order (later entries use earlier ones).
df_reservation = df_reservation.assign(**{
    # Length of stay in whole days.
    "# of nights": lambda r: (r["END_DATE"] - r["START_DATE"]).dt.days,
    # How many days ahead of the stay the booking was made.
    "# of days pre booked": lambda r: (r["START_DATE"] - r["BOOKED"]).dt.days,
    # Days between cancellation and start of stay; NaT when not cancelled.
    "# of days cancel": lambda r: r.apply(
        lambda row: pd.NaT
        if pd.isnull(row['CANCELLATION_DATE'])
        else (row['START_DATE'] - row['CANCELLATION_DATE']).days,
        axis=1,
    ),
    # Tax is guests x nights (presumably 1 unit per guest per night -- TODO confirm).
    "Tax": lambda r: r["NUMBER_OF_GUESTS"] * r["# of nights"],
    "Earnings after Tax": lambda r: r["PRICE"] - r["Tax"],
    # Every row from this source is tagged with the same advertiser label.
    "ADVERTISER": "OLTP",
})

Combining all tables into one

In [7]:
# Add guest name and origin to every reservation; the left join keeps
# reservations that have no matching guest row.
df_reservation_guest = df_reservation.merge(df_guest_country, how="left", on="GUEST_ID")

# Add apartment name and bed count the same way.
df_merged = df_reservation_guest.merge(df_apartment_contains, how="left", on="APARTMENT_ID")
df_merged.head()

Unnamed: 0,BOOKED,START_DATE,END_DATE,CANCELLATION_DATE,STATUS,PRICE,NUMBER_OF_GUESTS,ADVERTISER,GUEST_ID,APARTMENT_ID,# of nights,# of days pre booked,# of days cancel,Tax,Earnings after Tax,COUNTRY_NAME,GUEST_NAME,CONTENT_QUANTITY,APARTMENT_NAME
0,2022-04-01,2022-04-05,2022-04-10,NaT,Confirmed,500.0,2,OLTP,2,1,5,4,NaT,10,490.0,France,Mike Johnson,4,One-Bedroom Apartment with Balcony and Sea View
1,2022-05-20,2022-06-01,2022-06-08,NaT,Confirmed,750.0,3,OLTP,3,1,7,12,NaT,21,729.0,United States,Emily Davis,4,One-Bedroom Apartment with Balcony and Sea View
2,2022-01-05,2022-01-20,2022-01-25,NaT,Confirmed,400.0,1,OLTP,4,1,5,15,NaT,5,395.0,Italy,Daniel Lee,4,One-Bedroom Apartment with Balcony and Sea View
3,2022-02-10,2022-02-15,2022-02-20,NaT,Confirmed,600.0,2,OLTP,5,1,5,5,NaT,10,590.0,France,Sarah Smith,4,One-Bedroom Apartment with Balcony and Sea View
4,2022-08-01,2022-08-15,2022-08-22,NaT,Confirmed,900.0,3,OLTP,6,1,7,14,NaT,21,879.0,United States,Ryan Nguyen,4,One-Bedroom Apartment with Balcony and Sea View


Renaming and rearranging columns

In [8]:
# Map the source column names to the report's display names.
df_merged = df_merged.rename(columns={
    "COUNTRY_NAME": "Origin",
    "GUEST_NAME": "Guest name",
    "STATUS": "Status",
    "NUMBER_OF_GUESTS": "# of guests",
    "BOOKED": "Booked",
    "START_DATE": "Start date",
    "END_DATE": "End date",
    "ADVERTISER": "Advertiser",
    "PRICE": "Earnings",
    "APARTMENT_NAME": "Listing name",
    "CONTENT_QUANTITY": "# of beds",
    'CANCELLATION_DATE': "Cancel date",
})

# Keep only the report columns, in their final order.
df = df_merged.loc[:, [
    "Status",
    "Guest name",
    "Origin",
    "# of guests",
    "Booked",
    "Start date",
    "End date",
    "# of nights",
    "# of days pre booked",
    "Listing name",
    "# of beds",
    "Earnings",
    "Tax",
    "Earnings after Tax",
    "Advertiser",
    "Cancel date",
    "# of days cancel",
]]

Exporting the data frame to an Excel file

In [9]:
exportPath = "./preprocessed files/"

# Create the target folder first: to_excel does not create missing
# directories, so the export would fail when the folder does not exist yet.
os.makedirs(exportPath, exist_ok=True)

# Write the final frame (index included) and show its last rows as a check.
df.to_excel(exportPath + "Final_Oltp.xlsx")
df.tail()

Unnamed: 0,Status,Guest name,Origin,# of guests,Booked,Start date,End date,# of nights,# of days pre booked,Listing name,# of beds,Earnings,Tax,Earnings after Tax,Advertiser,Cancel date,# of days cancel
17,Cancelled,Lucas Rodriguez,Italy,3,2022-06-01,2022-07-15,2022-07-25,10,44,Studio with Patio,2,600.0,30,570.0,OLTP,2022-07-13,2
18,Cancelled,Ryan Nguyen,United States,5,2022-04-04,2022-04-11,2022-04-18,7,7,Studio with Patio and Sea View,2,430.0,35,395.0,OLTP,2022-04-03,8
19,Cancelled,Emily Davis,United States,1,2022-01-01,2022-06-10,2022-06-12,2,160,One-Bedroom Apartment with Patio and Sea View,4,140.0,2,138.0,OLTP,2022-06-07,3
20,Cancelled,Emily Davis,United States,1,2022-01-01,2022-06-10,2022-06-12,2,160,Studio with Patio,2,210.0,2,208.0,OLTP,2022-06-07,3
21,Cancelled,Emily Davis,United States,1,2022-01-01,2022-06-10,2022-06-12,2,160,Studio with Patio,2,430.0,2,428.0,OLTP,2022-06-07,3
