In [15]:
import os

import pandas as pd
import pyodbc
from sqlalchemy import create_engine

In [16]:
def extract_excels(path='../preprocessing/preprocessed files/'):
    """Read the three preprocessed booking workbooks into DataFrames.

    Parameters
    ----------
    path : str, optional
        Directory that contains ``Final_Airbnb.xlsx``, ``Final_Booking.xlsx``
        and ``Final_Oltp.xlsx``. Defaults to the location this notebook
        originally hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_Airbnb, df_Booking, df_Oltp)``, in that order.

    Raises
    ------
    Exception
        Any error raised by ``pd.read_excel`` is reported and then re-raised.
        Previously the error was swallowed and the function failed on the
        ``return`` statement with an unrelated ``UnboundLocalError``.
    """
    sources = (
        ('df_Airbnb', 'Final_Airbnb.xlsx'),
        ('df_Booking', 'Final_Booking.xlsx'),
        ('df_Oltp', 'Final_Oltp.xlsx'),
    )

    frames = []
    for label, filename in sources:
        file_path = os.path.join(path, filename)
        try:
            frame = pd.read_excel(file_path)
        except Exception as err:
            # Name the file that failed, then let the real exception propagate
            # so the caller never receives half-initialised results.
            print(f'Error while reading {file_path}')
            print(err)
            raise
        print(f"{label} succesfully loaded!")
        frames.append(frame)

    df_Airbnb, df_Booking, df_Oltp = frames
    return df_Airbnb, df_Booking, df_Oltp

In [17]:
# Load the three preprocessed sources; extract_excels returns them in this order.
df_Airbnb, df_Booking, df_Oltp = extract_excels()

df_Airbnb succesfully loaded!
df_Booking succesfully loaded!
df_Oltp succesfully loaded!


In [18]:
# Stack the three sources into one bookings frame and order it by stay start.
# ignore_index=True gives every row a unique label; without it each source
# frame contributes its own 0..n-1 labels and the result has duplicates.
df_merged = pd.concat(
    [df_Airbnb, df_Booking, df_Oltp],
    ignore_index=True,
).sort_values("Start date")
df_merged.head()

Unnamed: 0.1,Unnamed: 0,Status,Guest name,Origin,# of guests,Booked,Start date,End date,# of nights,# of days pre booked,Listing name,# of beds,Earnings,Tax,Earnings after Tax,Advertiser
1,1,Confirmed,Youstina Daoud,Egypt,2,2019-03-30,2019-06-03,2019-06-05,2,65,Studio with Patio,2,67.9,4.0,63.9,Airbnb
6,6,Confirmed,Kristina Holm Jensen,Denmark,2,2019-05-05,2019-06-05,2019-06-07,2,31,Studio with Patio,2,67.9,4.0,63.9,Airbnb
2,2,Confirmed,Öznur Balaban,Turkey,2,2019-04-09,2019-06-08,2019-06-09,1,60,Studio with Patio,2,33.95,2.0,31.95,Airbnb
10,10,Confirmed,Steve Qj,United States,1,2019-06-01,2019-06-10,2019-06-11,1,9,Studio with Patio,2,33.95,1.0,32.95,Airbnb
5,5,Confirmed,Quinten Spakman,Netherlands,2,2019-05-04,2019-06-13,2019-06-15,2,40,Studio with Patio,2,67.9,4.0,63.9,Airbnb


## First DW theme

In [19]:
# Empty DataFrame bound to the name `df` (a blank placeholder frame).
df = pd.DataFrame()

In [20]:
# Surrogate key for listings: the dense rank of the listing name, so every
# distinct name gets one integer id starting at 1 (ids follow sorted name order).
df_merged["apartment_id"] = df_merged["Listing name"].rank(method='dense').astype(int)

# Apartment dimension: one row per distinct (apartment_id, listing name) pair.
# NOTE(review): the column keeps its source name 'Listing name' when the table
# is loaded to the warehouse - confirm that is the intended schema.
dim_Apartment = (
    df_merged[['apartment_id', 'Listing name']]
    .drop_duplicates()
    .sort_values("apartment_id")
)
dim_Apartment.head()


Unnamed: 0,apartment_id,Listing name
7,1,One-Bedroom Apartment with Balcony and Sea View
22,2,One-Bedroom Apartment with Patio and Sea View
1,3,Studio with Patio
23,4,Studio with Patio and Sea View


Dim_Advertiser

In [21]:
# Surrogate key for advertisers: the dense rank of the advertiser name, so
# every distinct name gets one integer id starting at 1.
df_merged["advertiser_id"] = df_merged["Advertiser"].rank(method='dense').astype(int)

# Advertiser dimension: one row per distinct (advertiser_id, advertiser) pair.
# NOTE(review): the column keeps its source name 'Advertiser' when the table
# is loaded to the warehouse - confirm that is the intended schema.
dim_Advertiser = (
    df_merged[['advertiser_id', 'Advertiser']]
    .drop_duplicates()
    .sort_values("advertiser_id")
)
dim_Advertiser.head()

Unnamed: 0,advertiser_id,Advertiser
1,1,Airbnb
22,2,Booking


Dim_Dates (time dimension)

In [23]:
# Empty frame that only records a sketched column layout; nothing in this cell
# fills it. Built directly so it no longer depends on the global `df`.
dim_Time = pd.DataFrame(columns=['time_id', 'date', 'day', 'month', 'year'])

# Calendar covering every day from 2016-01-01 through 2026-12-31 (inclusive).
dates = pd.date_range(start='2016-01-01', end='2026-12-31')

# Date dimension: one row per calendar day with its date parts.
dim_Dates = pd.DataFrame({
    'date': dates,
    'day': dates.day,
    'month': dates.month,
    'quarter': dates.quarter,
    'year': dates.year,
})
# ISO week number, so the first days of January can belong to week 52/53.
dim_Dates['week_of_year'] = dim_Dates['date'].dt.isocalendar().week

# Surrogate key: 1-based position of the day within the calendar.
dim_Dates = dim_Dates.reset_index(drop=True)
dim_Dates['date_id'] = dim_Dates.index + 1

dim_Dates.head()

Unnamed: 0,date,day,month,quarter,year,week_of_year,date_id
0,2016-01-01,1,1,1,2016,53,1
1,2016-01-02,2,1,1,2016,53,2
2,2016-01-03,3,1,1,2016,53,3
3,2016-01-04,4,1,1,2016,1,4
4,2016-01-05,5,1,1,2016,1,5


Fct_Cashflow

In [24]:
# Only confirmed bookings contribute to the cash flow.
fct_Cashflow = df_merged.loc[df_merged["Status"] == "Confirmed"].copy()

# Spread the after-tax earnings of a booking evenly over its nights.
fct_Cashflow["Price per day"] = (fct_Cashflow["Earnings after Tax"] / fct_Cashflow["# of nights"])

# Keep just the columns needed for the fact table. The explicit .copy() makes
# this an independent frame, so adding the 'date' column below does not
# trigger pandas' SettingWithCopyWarning.
fct_Cashflow = fct_Cashflow[
    ["Start date", "End date", "apartment_id", "advertiser_id",
     "Earnings after Tax", "# of nights", "Price per day"]
].copy()

# For each booking, the days it occupies: start date up to the day before the
# end date (the end date itself is excluded).
fct_Cashflow['date'] = fct_Cashflow.apply(
    lambda x: pd.date_range(start=x['Start date'], end=x['End date'] - pd.Timedelta(days=1)),
    axis=1,
)
fct_Cashflow.head()


Unnamed: 0,Start date,End date,apartment_id,advertiser_id,Earnings after Tax,# of nights,Price per day,date
1,2019-06-03,2019-06-05,3,1,63.9,2,31.95,"DatetimeIndex(['2019-06-03', '2019-06-04'], dt..."
6,2019-06-05,2019-06-07,3,1,63.9,2,31.95,"DatetimeIndex(['2019-06-05', '2019-06-06'], dt..."
2,2019-06-08,2019-06-09,3,1,31.95,1,31.95,"DatetimeIndex(['2019-06-08'], dtype='datetime6..."
10,2019-06-10,2019-06-11,3,1,32.95,1,32.95,"DatetimeIndex(['2019-06-10'], dtype='datetime6..."
5,2019-06-13,2019-06-15,3,1,63.9,2,31.95,"DatetimeIndex(['2019-06-13', '2019-06-14'], dt..."


In [25]:
# Force the per-day price to a numeric dtype (unparseable values become NaN)
# and round it to two decimals.
fct_Cashflow["Price per day"] = (
    pd.to_numeric(fct_Cashflow["Price per day"], errors='coerce')
    .round(2)
)

In [26]:
# Turn each booking's collection of dates into one row per date, drop the
# booking-level columns, and look up the date_id of every date in dim_Dates.
fct_Cashflow = (
    fct_Cashflow
    .explode('date')
    .drop(columns=['Start date', 'End date', 'Earnings after Tax', '# of nights'])
    .merge(dim_Dates[["date_id", "date"]], on='date', how='left')
    .reset_index(drop=True)
)

# Surrogate key: 1-based row position in the fact table.
fct_Cashflow['cashflow_id'] = fct_Cashflow.index + 1

fct_Cashflow = fct_Cashflow[["cashflow_id", "advertiser_id", "apartment_id", "date_id", "Price per day"]]
fct_Cashflow.head()

Unnamed: 0,cashflow_id,advertiser_id,apartment_id,date_id,Price per day
0,1,1,3,1250,31.95
1,2,1,3,1251,31.95
2,3,1,3,1252,31.95
3,4,1,3,1253,31.95
4,5,1,3,1255,31.95


## Load tables to DW

In [27]:
def load(df, tbl):
    """Write one DataFrame to a table in the DW_Reservation_System database.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to write. The DataFrame index is not written.
    tbl : str
        Target table name. An existing table with that name is replaced.

    Returns
    -------
    None

    Notes
    -----
    The connection URL is built from the module-level names ``uid``, ``pwd``
    and ``server`` (port 5432). Errors are caught and printed rather than
    raised, so a loop over several tables keeps going after a failure.
    """
    engine = None
    try:
        engine = create_engine(f'postgresql://{uid}:{pwd}@{server}:5432/DW_Reservation_System')
        print(f'importing rows 0 to {len(df)}... for table {tbl}')
        # save df to postgres
        df.to_sql(f'{tbl}', engine, if_exists="replace", index=False)
        print('Data imported sucessfully!')
    except Exception as e:
        print('Data load error: ' + str(e))
    finally:
        # A new engine is created on every call; release its connection pool
        # so repeated loads do not leave connections open.
        if engine is not None:
            engine.dispose()

In [28]:
# Connection settings read by load(). Each can be overridden through an
# environment variable so real credentials need not be stored in the notebook;
# the fallbacks are the values this notebook originally hard-coded.
uid = os.environ.get("DW_UID", "etl")
pwd = os.environ.get("DW_PWD", "pass")
server = os.environ.get("DW_SERVER", "localhost")

# Parallel lists: the i-th frame is written to the i-th table name.
dataframes_to_load = [dim_Apartment, dim_Advertiser, dim_Dates, fct_Cashflow]
table_names = ["dim_Apartment", "dim_Advertiser", "dim_Dates", "fct_Cashflow"]

In [29]:
# Write every frame to its table. The loop variables are deliberately not
# named `df`, so the notebook-level `df` placeholder is not rebound to the
# last loaded frame.
for frame, table_name in zip(dataframes_to_load, table_names):
    load(frame, table_name)

importing rows 0 to 4... for table dim_Apartment
Data imported sucessfully!
importing rows 0 to 2... for table dim_Advertiser
Data imported sucessfully!
importing rows 0 to 4018... for table dim_Dates
Data imported sucessfully!
importing rows 0 to 783... for table fct_Cashflow
Data imported sucessfully!
