# EDA for Prices Dataset

Initial look at data and data cleaning

## Import necassary libraries

In [14]:
# Import necassary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats

In [15]:
#Load datasets
df_prices = pd.read_csv('../data/prices_dataset.csv')

df_inquiries = pd.read_csv('../data/master_inquiries_20210715.csv')

## Correcting date and time formatting.

In [16]:
df_prices['date_from'] = pd.to_datetime(df_prices['date_from'], errors = 'coerce')
df_prices['date_to'] = pd.to_datetime(df_prices['date_to'], errors = 'coerce')
df_inquiries['date'] = pd.to_datetime(df_inquiries['time'], errors = 'coerce')
df_inquiries['time'] = pd.to_datetime(df_inquiries['date'], errors = 'coerce')
df_inquiries['arrival_date'] = pd.to_datetime(df_inquiries['arrival_date'], errors = 'coerce')
df_inquiries['departure_date'] = pd.to_datetime(df_inquiries['departure_date'], errors = 'coerce')

## Initial look at datasets


In [17]:
df_inquiries.head()

Unnamed: 0.1,Unnamed: 0,listing_id,title,date,time,adult_count,children_count,pets_count,arrival_date,departure_date,inquiry_price,length_stay
0,1,b334776d-0cf1-51f1-8cdc-37535b280f3d,Anfrage,2021-07-20 00:07:31,2021-07-20 00:07:31,6.0,2.0,0.0,2019-07-24,2019-07-31,1601.0,7
1,3,2d71d636-34d3-567a-b21a-244adf0505c0,Anfrage,2021-07-20 00:11:58,2021-07-20 00:11:58,3.0,0.0,0.0,2019-06-20,2019-07-04,0.0,14
2,5,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,7
3,6,6a263b0f-cfd6-53fd-aa76-eae6662a4aa2,Anfrage,2021-07-20 00:28:19,2021-07-20 00:28:19,2.0,0.0,0.0,2019-06-29,2019-07-13,1380.0,14
4,12,202306bf-f261-5975-8356-11719a53a063,Anfrage,2021-07-20 01:06:48,2021-07-20 01:06:48,2.0,0.0,0.0,2019-04-30,2019-05-03,237.0,3


In [18]:
df_prices.head() 

Unnamed: 0.1,Unnamed: 0,listing_id,min_days,date_from,date_to,filled_in_price_per_day,filled_in_price_per_week,month,price_catagory
0,0,fbd74bce-23dd-53e7-b988-8beb85160d6f,5.0,2020-04-01,2020-06-15,70.0,490.0,2020-04,Day
1,1,fbd74bce-23dd-53e7-b988-8beb85160d6f,5.0,2020-06-15,2020-07-05,78.0,546.0,2020-06,Day
2,2,fbd74bce-23dd-53e7-b988-8beb85160d6f,4.0,2020-09-13,2020-11-01,70.0,490.0,2020-09,Day
3,3,fbd74bce-23dd-53e7-b988-8beb85160d6f,2.0,2020-11-01,2020-12-24,58.0,406.0,2020-11,Day
4,4,fbd74bce-23dd-53e7-b988-8beb85160d6f,5.0,2020-12-24,2021-01-07,80.0,560.0,2020-12,Day


## Deleting empty columns

In [19]:
del df_inquiries['Unnamed: 0']
del df_prices['Unnamed: 0']

## Merging datasets

In [27]:
df_master = pd.merge(df_inquiries, df_prices, on='listing_id', how='outer', suffixes=('','_key'))
df_master = df_master[(df_master.arrival_date >= df_master.date_from) & (df_master.arrival_date <= df_master.date_to)]
df_master = pd.merge(df_inquiries, df_master, on=[
    'listing_id', 'date'], how='left')
df_master

Unnamed: 0,listing_id,title_x,date,time_x,adult_count_x,children_count_x,pets_count_x,arrival_date_x,departure_date_x,inquiry_price_x,...,departure_date_y,inquiry_price_y,length_stay_y,min_days,date_from,date_to,filled_in_price_per_day,filled_in_price_per_week,month,price_catagory
0,b334776d-0cf1-51f1-8cdc-37535b280f3d,Anfrage,2021-07-20 00:07:31,2021-07-20 00:07:31,6.0,2.0,0.0,2019-07-24,2019-07-31,1601.0,...,2019-07-31,1601.0,7.0,7.0,2019-06-14,2019-09-01,188.0,1316.0,2019-06,Both
1,2d71d636-34d3-567a-b21a-244adf0505c0,Anfrage,2021-07-20 00:11:58,2021-07-20 00:11:58,3.0,0.0,0.0,2019-06-20,2019-07-04,0.0,...,NaT,,,,NaT,NaT,,,,
2,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,...,2019-10-20,0.0,7.0,2.0,2019-09-27,2019-10-13,75.0,500.0,2019-09,Both
3,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,...,2019-10-20,0.0,7.0,2.0,2019-10-13,2019-10-28,75.0,500.0,2019-10,Both
4,6a263b0f-cfd6-53fd-aa76-eae6662a4aa2,Anfrage,2021-07-20 00:28:19,2021-07-20 00:28:19,2.0,0.0,0.0,2019-06-29,2019-07-13,1380.0,...,NaT,,,,NaT,NaT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318325,33931be0-2aa9-5b4d-89f7-4e3f7779678f,Verbindliche Anfrage,2021-07-20 16:51:57,2021-07-20 16:51:57,2.0,0.0,0.0,2021-10-09,2021-10-23,1120.0,...,2021-10-23,1120.0,14.0,3.0,2021-10-03,2021-12-16,80.0,560.0,2021-10,Day
1318326,242a498a-e77a-56e1-85a3-8170fee3384f,Verbindliche Anfrage,2021-07-20 18:22:31,2021-07-20 18:22:31,2.0,1.0,0.0,2021-08-02,2021-08-07,550.0,...,2021-08-07,550.0,5.0,5.0,2021-06-19,2021-09-12,90.0,630.0,2021-06,Both
1318327,638ab1f2-a60c-5b34-8c89-2c66de69c111,Verbindliche Anfrage,2021-07-20 21:26:52,2021-07-20 21:26:52,2.0,1.0,0.0,2021-08-01,2021-08-08,1267.0,...,2021-08-08,1267.0,7.0,6.0,2021-06-01,2021-10-02,153.0,1071.0,2021-06,Day
1318328,8e8344f9-2c96-50e4-bb6f-700d1b82a6b7,Verbindliche Anfrage,2021-07-20 23:38:06,2021-07-20 23:38:06,2.0,0.0,0.0,2021-07-25,2021-08-06,1787.0,...,2021-08-06,1787.0,12.0,5.0,2021-06-12,2021-09-04,140.0,980.0,2021-06,Day


In [28]:
df_inquiries.columns

Index(['listing_id', 'title', 'date', 'time', 'adult_count', 'children_count',
       'pets_count', 'arrival_date', 'departure_date', 'inquiry_price',
       'length_stay'],
      dtype='object')

In [29]:
df_master = df_master.drop(['title_y', 'time_y', 'adult_count_y', 'children_count_y',
       'pets_count_y', 'arrival_date_y', 'departure_date_y', 'inquiry_price_y',
       'length_stay_y'], axis=1)

In [30]:
df_master.head()

Unnamed: 0,listing_id,title_x,date,time_x,adult_count_x,children_count_x,pets_count_x,arrival_date_x,departure_date_x,inquiry_price_x,length_stay_x,min_days,date_from,date_to,filled_in_price_per_day,filled_in_price_per_week,month,price_catagory
0,b334776d-0cf1-51f1-8cdc-37535b280f3d,Anfrage,2021-07-20 00:07:31,2021-07-20 00:07:31,6.0,2.0,0.0,2019-07-24,2019-07-31,1601.0,7,7.0,2019-06-14,2019-09-01,188.0,1316.0,2019-06,Both
1,2d71d636-34d3-567a-b21a-244adf0505c0,Anfrage,2021-07-20 00:11:58,2021-07-20 00:11:58,3.0,0.0,0.0,2019-06-20,2019-07-04,0.0,14,,NaT,NaT,,,,
2,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,7,2.0,2019-09-27,2019-10-13,75.0,500.0,2019-09,Both
3,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,7,2.0,2019-10-13,2019-10-28,75.0,500.0,2019-10,Both
4,6a263b0f-cfd6-53fd-aa76-eae6662a4aa2,Anfrage,2021-07-20 00:28:19,2021-07-20 00:28:19,2.0,0.0,0.0,2019-06-29,2019-07-13,1380.0,14,,NaT,NaT,,,,


In [31]:
df_master = df_master.dropna()

In [34]:
df_master.head()

Unnamed: 0,listing_id,title_x,date,time_x,adult_count_x,children_count_x,pets_count_x,arrival_date_x,departure_date_x,inquiry_price_x,length_stay_x,min_days,date_from,date_to,filled_in_price_per_day,filled_in_price_per_week,month,price_catagory
0,b334776d-0cf1-51f1-8cdc-37535b280f3d,Anfrage,2021-07-20 00:07:31,2021-07-20 00:07:31,6.0,2.0,0.0,2019-07-24,2019-07-31,1601.0,7,7.0,2019-06-14,2019-09-01,188.0,1316.0,2019-06,Both
2,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,7,2.0,2019-09-27,2019-10-13,75.0,500.0,2019-09,Both
3,80f54c2a-c1f9-5744-a2c8-b8764c0bde87,Anfrage,2021-07-20 00:24:34,2021-07-20 00:24:34,2.0,1.0,0.0,2019-10-13,2019-10-20,0.0,7,2.0,2019-10-13,2019-10-28,75.0,500.0,2019-10,Both
5,202306bf-f261-5975-8356-11719a53a063,Anfrage,2021-07-20 01:06:48,2021-07-20 01:06:48,2.0,0.0,0.0,2019-04-30,2019-05-03,237.0,3,3.0,2018-09-30,2019-06-01,79.0,553.0,2018-09,Day
11,afcd1d75-762c-59e0-b59a-70bd6432cd8d,Anfrage,2021-07-20 03:30:20,2021-07-20 03:30:20,2.0,1.0,0.0,2019-07-06,2019-07-14,705.0,8,3.0,2019-07-06,2019-07-07,105.0,735.0,2019-07,Day


In [32]:
df_master.isna().sum()

listing_id                  0
title_x                     0
date                        0
time_x                      0
adult_count_x               0
children_count_x            0
pets_count_x                0
arrival_date_x              0
departure_date_x            0
inquiry_price_x             0
length_stay_x               0
min_days                    0
date_from                   0
date_to                     0
filled_in_price_per_day     0
filled_in_price_per_week    0
month                       0
price_catagory              0
dtype: int64

## Save dataset as CSV


In [33]:
df_master.to_csv('../data/merged_inquiries_and_prices.csv')