# This notebook will serve for the initial EDA for the inquiries data for the TFW project

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "talk" plotting context with enlarged fonts for all following plots
sns.set_context(context="talk", font_scale=1.5)

In [None]:
# Load the raw inquiries export into a DataFrame
inquiries_csv = '../data/inquiries_20210713.csv'
df_inquiries = pd.read_csv(inquiries_csv)

In [None]:
# Report the number of rows (inquiries) and columns (features)
n_inquiries, n_features = df_inquiries.shape
print(f'The dataset contains {n_inquiries} inquiries and {n_features} features')

In [None]:
# Preview the first two rows of the dataset
df_inquiries.iloc[:2]

In [None]:
# First look at the info: column names, dtypes and non-null counts per column
df_inquiries.info()

In [None]:
# Summary statistics of the numerical features, rounded to three decimals
df_inquiries.describe().round(3)

In [None]:
# Looking for categorical features: number of distinct values per column
# (columns with only a few distinct values are candidates for categorical features)
df_inquiries.nunique()

In [None]:
# Number of missing values per column
df_inquiries.isnull().sum()

In [None]:
# Duplicate rows
# A row counts as a duplicate when all of its columns match an earlier row;
# the first occurrence itself is not flagged.
is_repeated_row = df_inquiries.duplicated()
duplicateRowsDF = df_inquiries.loc[is_repeated_row]
print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

We have 83 duplicated rows. We will remove them in the cleaning step (see below).

## Convert date and time features to the right data type

In [None]:
# Parse the calendar-date columns to datetime64
for date_column in ('date', 'arrival_date', 'departure_date'):
    df_inquiries[date_column] = pd.to_datetime(df_inquiries[date_column])

# The time of day is parsed from 'HH:MM:SS' strings and stored as datetime.time objects
parsed_time = pd.to_datetime(df_inquiries['time'], format='%H:%M:%S')
df_inquiries['time'] = parsed_time.dt.time

## Looking for correlations

In [None]:
# Generate the correlation heatmap of the numeric features.
# Only numeric/bool columns are passed to corr(): since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and fails on string or time columns.
# Timedelta columns are excluded explicitly so only plain numeric/bool features are correlated.
numeric_features = df_inquiries.select_dtypes(include=['number', 'bool'], exclude=['timedelta'])
corr = numeric_features.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # fixed, symmetric colour scale for correlation coefficients
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,  # draw on the axes created above instead of the implicit current axes
)
# Rotate the x tick labels so long feature names stay readable
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Generate table with correlations
# Renders the `corr` matrix computed in the heatmap cell as a table whose cell
# background is coloured with the 'coolwarm' colormap.
corr.style.background_gradient(cmap='coolwarm')

## Plotting distribution of the features

In [None]:
# Histogram of the values in the title column
df_inquiries['title'].hist()

In [None]:
# One histogram per plottable column, drawn on a large shared figure
hist_figsize = (20, 15)
df_inquiries.hist(figsize=hist_figsize)
plt.show()

## Closer look to all features

### 1. title

In [None]:
# Cardinality, distinct values and frequency of each value of the title feature
title_values = df_inquiries['title']
print('Number of different title:', title_values.nunique())
print('Unique values:', title_values.unique())
print('Count of unique values:\n', title_values.value_counts())

### 2. Counts Adults, Children and Pets

Looking for outliers

In [None]:
# Boxplots of the three guest-count features to spot outliers
guest_count_columns = ['adult_count', 'children_count', 'pets_count']
df_inquiries.boxplot(column=guest_count_columns)

These features contain outliers that we have to handle. We decided to follow the same statistical procedure as in all other notebooks and cut off at the 0.95 quantile. We will do this step in the cleaning section (see below).

Looking for zero values.

In [None]:
# Count the rows in which each guest-count feature equals zero
for guest_type in ('adult', 'children', 'pets'):
    n_zero = df_inquiries[f'{guest_type}_count'].eq(0).sum()
    print(f'Number of zero values in {guest_type} count:', n_zero)

### 3. Dates: date, time, arrival date, and departure date

In [None]:
# Print the minimum and maximum of every date/time feature, each followed by a separator line
separator = '---------------------------'
for label, column in (
    ('date', 'date'),
    ('time', 'time'),
    ('arrival date', 'arrival_date'),
    ('departure date', 'departure_date'),
):
    values = df_inquiries[column]
    print(f'Min {label}:', values.min())
    print(f'Max {label}:', values.max())
    print(separator)

The arrival and departure date are far in the future. We decide to keep the arrival date (so we don't interrupt the length of stay) until 31.12.2021. The last date for inquiries is 31.12.2020, but since inquiries are made for the future, often well in advance, we decided for one year, as it seems most realistic that a booking will also be made from this. This step we will do in the cleaning (see below).

#### A closer look at the arrival date

In [None]:

# Arrival month as a monthly Period, used below to count inquiries per arrival month.
# 'M' is the documented monthly period alias. No `format` is passed: arrival_date was
# already parsed to datetime in the conversion cell, and a '%Y-%m' format combined with
# errors='coerce' would turn full 'YYYY-MM-DD' strings into NaT if the column were still text.
arrival_dates = pd.to_datetime(df_inquiries['arrival_date'], errors='coerce')
df_inquiries['arrival_date_yearmonth'] = arrival_dates.dt.to_period('M')

In [None]:
# Number of inquiries (non-null listing_id values) per arrival month
plot_arrival_date = (
    df_inquiries
    .groupby('arrival_date_yearmonth')['listing_id']
    .count()
    .reset_index()
)

In [None]:
# Turn the monthly periods into strings so they can be parsed to timestamps in the next step
yearmonth_periods = plot_arrival_date['arrival_date_yearmonth']
plot_arrival_date['arrival_date_yearmonth'] = yearmonth_periods.astype(str)

In [None]:
# Parse the year-month values to timestamps so they can be compared and plotted on a time axis
yearmonth_text = plot_arrival_date['arrival_date_yearmonth']
plot_arrival_date['arrival_date_yearmonth'] = pd.to_datetime(yearmonth_text)

In [None]:
# Keep only the arrival months from January 2019 through December 2020 (both bounds inclusive)
start_date = "2019-01"
end_date = "2020-12"

between_two_dates = plot_arrival_date["arrival_date_yearmonth"].between(start_date, end_date)
filtered_dates = plot_arrival_date.loc[between_two_dates]

In [None]:
# Line chart of the monthly inquiry counts for the filtered period
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(x="arrival_date_yearmonth", y="listing_id", data=filtered_dates, ax=ax)
ax.set_xlabel('Year - Month')
ax.set_ylabel('Inquiry count')
ax.set_title('Overall Inquiries per Month (2019 - 2020)')

### 4. Inquiry prices

Looking for outliers

In [None]:
# Boxplot of the inquiry price to spot outliers
price_columns = ['inquiry_price']
df_inquiries.boxplot(column=price_columns)

Because of the very high values, we will also cut off at the 0.95 quantile here. We will do this step in the cleaning section (see below).

Looking for zero values.

In [None]:
# Count the inquiries whose price is exactly zero
n_zero_prices = df_inquiries['inquiry_price'].eq(0).sum()
print('Number of zero values in inquiry price:', n_zero_prices)

## Feature engineering

From the features arrival date and departure date we calculate a new feature, the length of stay. The length of a stay influences the price.

In [None]:
# Length of stay as a timedelta: departure date minus arrival date
df_inquiries['length_stay'] = df_inquiries['departure_date'].sub(df_inquiries['arrival_date'])

In [None]:
# Shortest and longest length of stay in the dataset
stay_lengths = df_inquiries['length_stay']
print('Min length stay:', stay_lengths.min())
print('Max length stay:', stay_lengths.max())

In [None]:
# Histogram of the length of stay in whole days (1000 bins)
stay_days = df_inquiries['length_stay'].dt.days
stay_days.hist(bins=1000)

The inquired length of stay has a very wide distribution. We will first go through the other cleaning steps and then take a second look at this distribution.

## Clean the dataset

### 1. Duplicated rows

In [None]:
# Drop fully duplicated rows in place; the first occurrence of each row is kept (pandas default)
df_inquiries.drop_duplicates(inplace=True)

### 2. Count adults, children, pets and inquiry price

In [None]:
# 0.95 quantile of every feature that gets an outlier cut-off below
print('Quantiles 0.95 for:')
for label, column in (
    ('Adults:', 'adult_count'),
    ('Children:', 'children_count'),
    ('Pets:', 'pets_count'),
    ('inquiry_price:', 'inquiry_price'),
):
    print(label, df_inquiries[column].quantile([.95]))

Drop all rows with a number of adults greater than 6.

In [None]:
# Number of rows above the adult cut-off (more than 6 adults)
rows_over_adult_limit = df_inquiries.loc[df_inquiries['adult_count'] > 6]
print('Number of dropping rows:', len(rows_over_adult_limit))

In [None]:
# Collect the index labels of all rows with more than 6 adults ...
too_many_adults = df_inquiries['adult_count'] > 6
indexNames_adult_count = df_inquiries.loc[too_many_adults].index
# ... and remove those rows from the dataset in place
df_inquiries.drop(index=indexNames_adult_count, inplace=True)

Drop all rows with a number of children greater than 2.

In [None]:
# Number of rows above the children cut-off (more than 2 children)
rows_over_children_limit = df_inquiries.loc[df_inquiries['children_count'] > 2]
print('Number of dropping rows:', len(rows_over_children_limit))

In [None]:
# Collect the index labels of all rows with more than 2 children ...
too_many_children = df_inquiries['children_count'] > 2
indexNames_children_count = df_inquiries.loc[too_many_children].index
# ... and remove those rows from the dataset in place
df_inquiries.drop(index=indexNames_children_count, inplace=True)

Drop all rows with a number of pets greater than 1.

In [None]:
# Number of rows above the pets cut-off (more than 1 pet)
rows_over_pets_limit = df_inquiries.loc[df_inquiries['pets_count'] > 1]
print('Number of dropping rows:', len(rows_over_pets_limit))

In [None]:
# Collect the index labels of all rows with more than 1 pet ...
too_many_pets = df_inquiries['pets_count'] > 1
indexNames_pets_count = df_inquiries.loc[too_many_pets].index
# ... and remove those rows from the dataset in place
df_inquiries.drop(index=indexNames_pets_count, inplace=True)

Drop all rows with an inquiry price greater than 1820.0.

In [None]:
# Number of rows above the price cut-off (inquiry price greater than 1820.0)
rows_over_price_limit = df_inquiries.loc[df_inquiries['inquiry_price'] > 1820.0]
print('Number of dropping rows:', len(rows_over_price_limit))

In [None]:
# Collect the index labels of all rows with an inquiry price above 1820.0 ...
price_too_high = df_inquiries['inquiry_price'] > 1820.0
indexNames_inquiry_price = df_inquiries.loc[price_too_high].index
# ... and remove those rows from the dataset in place
df_inquiries.drop(index=indexNames_inquiry_price, inplace=True)

### 3. Arrival

Remove dates that are greater than 31.12.2021.

In [None]:
# Keep only inquiries arriving on or before 31.12.2021.
# Rows with a missing arrival date are removed as well, because NaT never satisfies the comparison.
latest_arrival = pd.Timestamp('2021-12-31')
arrives_in_time = pd.to_datetime(df_inquiries['arrival_date']) <= latest_arrival
# .copy() makes the filtered result an independent frame, so the column assignment and
# the in-place drop in the following cells do not act on a slice of the previous frame
# (which triggers SettingWithCopyWarning).
df_inquiries = df_inquiries[arrives_in_time].copy()

### 4. Length of stay

In [None]:
# Shortest and longest length of stay after the previous cleaning steps
stay_lengths = df_inquiries['length_stay']
print('Min length stay:', stay_lengths.min())
print('Max length stay:', stay_lengths.max())

In [None]:
# Histogram of the length of stay in whole days (1000 bins) after the previous cleaning steps
stay_days = df_inquiries['length_stay'].dt.days
stay_days.hist(bins=1000)

Because the distribution doesn't change, we decided to clean this column too.

In [None]:
# 0.95 quantile of the length of stay (still a timedelta at this point)
stay_quantile = df_inquiries['length_stay'].quantile([.95])
print('Length stay:', stay_quantile)

In [None]:
# Replace the timedelta by its number of whole days.
# Run this cell only once: afterwards the column is no longer a timedelta and `.dt` is unavailable.
df_inquiries['length_stay'] = df_inquiries['length_stay'].dt.days

In [None]:
# Number of rows above the length-of-stay cut-off (more than 14 days)
rows_over_stay_limit = df_inquiries.loc[df_inquiries['length_stay'] > 14]
print('Number of dropping rows:', len(rows_over_stay_limit))

In [None]:
# Collect the index labels of all rows with a stay longer than 14 days ...
stay_too_long = df_inquiries['length_stay'] > 14
indexNames_length_stay = df_inquiries.loc[stay_too_long].index
# ... and remove those rows from the dataset in place
df_inquiries.drop(index=indexNames_length_stay, inplace=True)

Shape of the cleaned dataset.

In [None]:
# Shape of the cleaned dataset: number of rows (inquiries) and columns (features).
# The message spells "inquiries" correctly, matching the shape report at the top of the notebook.
n_inquiries, n_features = df_inquiries.shape
print(f'The dataset contains {n_inquiries} inquiries and {n_features} features')

## Save clean dataset as new CSV

In [None]:
# Export csv
# The export is commented out, so running the notebook does not write the cleaned frame to disk.
# NOTE(review): if re-enabled, to_csv also writes the index as an unnamed first column unless index=False is passed.
#df_inquiries.to_csv('../data/master_inquiries_20210715.csv')

# Merge original datasets inquiries and listings

In [None]:
# Load the original (uncleaned) exports.
# Note: this rebinds df_inquiries, so the cleaned frame built above is no longer available afterwards.
listings_csv = '../data/listings_20210707.csv'
inquiries_csv = '../data/inquiries_20210713.csv'
df_listings = pd.read_csv(listings_csv)
df_inquiries = pd.read_csv(inquiries_csv)

In [None]:
# Left-join the listing attributes onto every inquiry via listing_id
df = df_inquiries.merge(df_listings, how='left', on='listing_id')

In [None]:
# Derive year and month of the inquiry date and of the arrival date
inquiry_dates = pd.to_datetime(df['date'])
arrival_dates = pd.to_datetime(df['arrival_date'])

df['inq_year'] = inquiry_dates.dt.year
df['inq_month'] = inquiry_dates.dt.month

df['arr_year'] = arrival_dates.dt.year
df['arr_month'] = arrival_dates.dt.month

In [None]:
# Keep only inquiries that were both made in 2019 and have an arrival date in 2019
made_in_2019 = df['inq_year'] == 2019
arriving_in_2019 = df['arr_year'] == 2019
df = df.loc[made_in_2019 & arriving_in_2019]

In [None]:
# Five regions with the most inquiries (value_counts sorts in descending order)
df['region'].value_counts().head(5)

In [None]:
# Five holiday regions with the most inquiries (value_counts sorts in descending order)
df['holiday_region'].value_counts().head(5)

In [None]:
# Five regions with the most distinct listings
properties_per_region = df.groupby('region')['listing_id'].nunique()
properties_per_region.sort_values(ascending=False).head(5)

In [None]:
# Five holiday regions with the most distinct listings
properties_per_holiday_region = df.groupby('holiday_region')['listing_id'].nunique()
properties_per_holiday_region.sort_values(ascending=False).head(5)

In [None]:
# TOP5 regions with highest inquiry count per property.
# The ranking is done on the exact ratio and the values are rounded only for display:
# rounding before sorting collapses close ratios into ties, which makes the top 5 arbitrary.
inquiries_per_region = df['region'].value_counts()
properties_per_region = df.groupby('region')['listing_id'].nunique()
# Both Series are indexed by region, so the division aligns them label by label
inquiries_per_property = inquiries_per_region / properties_per_region
inquiries_per_property.sort_values(ascending=False).head(5).round()

In [None]:
# TOP5 holiday regions with highest inquiry count per property.
# The ranking is done on the exact ratio and the values are rounded only for display:
# rounding before sorting collapses close ratios into ties, which makes the top 5 arbitrary.
inquiries_per_holiday_region = df['holiday_region'].value_counts()
properties_per_holiday_region = df.groupby('holiday_region')['listing_id'].nunique()
# Both Series are indexed by holiday region, so the division aligns them label by label
holiday_inquiries_per_property = inquiries_per_holiday_region / properties_per_holiday_region
holiday_inquiries_per_property.sort_values(ascending=False).head(5).round()

In [None]:
# Inquiries per listing and inquiry month, measured as the number of non-null
# `region` values in each (listing_id, inq_year, inq_month) group
group_keys = ['listing_id', 'inq_year', 'inq_month']
inquiry_count = df.groupby(group_keys)['region'].count().reset_index(name='count')

In [None]:
# Give the aggregated column a descriptive name
inquiry_count = inquiry_count.rename(columns={'count': 'inquiry_count'})

# Attach the monthly count to every inquiry row; the right join keeps all rows of df
merge_keys = ['listing_id', 'inq_year', 'inq_month']
df = pd.merge(inquiry_count, df, left_on=merge_keys, right_on=merge_keys, how='right')

In [None]:
# Regions shown in the monthly inquiry-count plots.
# The names are hard-coded; presumably they were copied from the TOP5 tables above — verify after data changes.
region_list = [
    'Ostsee',
    'Nordsee',
    'Oberbayern',
    'Allgäu',
    'Mecklenburgische Seenplatte',
]
holiday_region_list = [
    'Ostsee',
    'Nordsee',
    'Oberbayern',
    'Schwarzwald',
    'Oberallgäu',
]
region_inquiry_property_list = [
    'Sächsische Schweiz',
    'Spreewald',
    'Bodensee',
    'Thüringer Wald',
    'Harz',
]

In [None]:
# Subsets of df restricted to the regions selected for each plot
in_top_regions = df['region'].isin(region_list)
in_top_holiday_regions = df['holiday_region'].isin(holiday_region_list)
in_top_ratio_regions = df['region'].isin(region_inquiry_property_list)

region = df.loc[in_top_regions]
holiday_region = df.loc[in_top_holiday_regions]
region_inquiry_property = df.loc[in_top_ratio_regions]

In [None]:
# Plot: inquiry_count over arrival month, one line per selected region
# NOTE(review): inquiry_count was aggregated per inquiry month (inq_month) but is plotted
# against the arrival month (arr_month) — confirm this is intended.
month_initials = list('JFMAMJJASOND')
ax = sns.lineplot(data=region, x="arr_month", y="inquiry_count", hue='region')
ax.legend(title="Region", loc='center right', bbox_to_anchor=(2.5, 0.5))
ax.set_xlabel('Month')
ax.set_ylabel('Mean inquiry count')
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_initials);

In [None]:
# Plot: inquiry_count over arrival month, one line per selected holiday region
month_initials = list('JFMAMJJASOND')
ax = sns.lineplot(data=holiday_region, x="arr_month", y="inquiry_count", hue='holiday_region')
ax.legend(title="Holiday region", loc='center right', bbox_to_anchor=(1.9, 0.5))
ax.set_xlabel('Month')
ax.set_ylabel('Mean inquiry count')
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_initials);

In [None]:
# Plot: inquiry_count over arrival month for the regions selected by inquiries per property
month_initials = list('JFMAMJJASOND')
ax = sns.lineplot(data=region_inquiry_property, x="arr_month", y="inquiry_count", hue='region')
ax.legend(title="Region", loc='center right', bbox_to_anchor=(2.1, 0.5))
ax.set_xlabel('Month')
ax.set_ylabel('Mean inquiry count')
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_initials);