# This notebook contains the initial EDA of the statistics data for the TFW project

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw statistics export into a DataFrame
statistics_csv = '../data/statistics_20210707.csv'
df_statistics = pd.read_csv(statistics_csv)

In [None]:
# Size of the dataset.
# The row count is reported as "records", not accommodations: later cells aggregate
# several rows per `listing_id` (presumably one row per listing and month), so distinct
# accommodations are counted separately via `listing_id`.
n_records, n_features = df_statistics.shape
n_accommodations = df_statistics['listing_id'].nunique()
print('The dataset contains %s records for %s different accommodations and %s features' % (n_records, n_accommodations, n_features))

In [None]:
# Preview the first five rows of the raw data
df_statistics.head(5)

In [None]:
# Column names, dtypes and non-null counts of the raw dataset
df_statistics.info()

This dataset contains no categorical features.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical features
df_statistics.describe()

## Feature converting

First, convert the date feature `yearmonth` to datetime.

In [None]:
# Parse `yearmonth` into pandas datetime values so it can be filtered and plotted as a date
yearmonth_parsed = pd.to_datetime(df_statistics['yearmonth'])
df_statistics['yearmonth'] = yearmonth_parsed

## Looking for correlations

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric columns enter the matrix: `yearmonth` is a datetime column at this point,
# and pandas >= 2.0 no longer drops non-numeric columns from DataFrame.corr() silently.
corr = df_statistics.select_dtypes(include='number').corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so long feature names do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Show the correlation matrix as a table, with cells shaded using the 'coolwarm' colormap
corr.style.background_gradient(cmap='coolwarm')

## Grouping the statistic

### 1. Grouping by `yearmonth`

In [None]:
# Monthly totals: one row per `yearmonth`, all remaining columns summed over that month.
# NOTE(review): `listing_id` is summed as well; that column carries no meaning here.
statistic_per_month = df_statistics.groupby('yearmonth', as_index=False).sum()
statistic_per_month

### 2. Grouping by accommodation ID

In [None]:
# Totals per accommodation: one row per `listing_id`, numeric columns summed over all months.
# numeric_only=True keeps the datetime column `yearmonth` out of the aggregation; pandas >= 2.0
# raises a TypeError when asked to sum datetimes instead of silently dropping the column.
statistic_per_id = df_statistics.groupby('listing_id').sum(numeric_only=True).reset_index()
statistic_per_id

### 3. Grouping by property and month

In [None]:
# Totals per accommodation and month: one row per (`listing_id`, `yearmonth`) pair
group_keys = ['listing_id', 'yearmonth']
statistic_id_month = df_statistics.groupby(group_keys, as_index=False).sum()
statistic_id_month

In [None]:
# Reduce the per-listing/month table to the columns kept for merging with the master dataset
columns_to_drop = [
    'inquiry_count',
    'inquiry_volume',
    'binding_inquiry_count',
    'binding_inquiry_volume',
    'direct_booking_count',
    'direct_booking_volume',
    'phone_clicks',
]
statistic_id_month = statistic_id_month.drop(columns=columns_to_drop)

In [None]:
# Derive separate calendar `year` and `month` columns from the datetime column `yearmonth`
yearmonth_dt = statistic_id_month['yearmonth'].dt
statistic_id_month['year'] = yearmonth_dt.year
statistic_id_month['month'] = yearmonth_dt.month

In [None]:
# Write the per-listing/month table to disk.
# NOTE(review): the DataFrame index is written as the first (unnamed) CSV column,
# which is the pandas default; confirm that the consumer of this file expects it.
export_path = '../data/statistic_id_month_20210721.csv'
statistic_id_month.to_csv(export_path, index=True)

## Plotting distribution of the features by month

#### First, plotting the distribution of `inquiry_count` per month.

In [None]:
# Line chart of the monthly inquiry totals, drawn on an explicitly created wide axes
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(x="yearmonth", y="inquiry_count", data=statistic_per_month, ax=ax)

The corona pandemic appears to have influenced the inquiries. In 2019 the inquiries stay at nearly the same level and vary around 60000. From January 2020 to April 2020 (lockdown) the inquiries decrease to around 20000; from April 2020 onwards they increase steeply to about 130000. In September 2020 and October 2020 the inquiries decrease again to around 20000. During the corona pandemic many people did not travel or, if they did, tried to stay in Germany.

#### Second, plotting the distribution of `direct_booking_count` per month

In [None]:
# Total number of direct bookings over all rows of the raw dataset
total_direct_bookings = df_statistics['direct_booking_count'].sum()
print('Sum of direct bookings:', total_direct_bookings)

In [None]:
# Line chart of the monthly direct-booking totals
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(x="yearmonth", y="direct_booking_count", data=statistic_per_month, ax=ax)

#### Third, plotting the distribution of `result_views`, `expose_views`, and `phone_clicks` per month

In [None]:
# Monthly result views, expose views and phone clicks in one chart.
# Each line carries its own label, so the legend does not depend on the order in which
# matplotlib happens to enumerate the artists on the axes.
fig, ax = plt.subplots(figsize=(15,5))
sns.lineplot(data=statistic_per_month, x="yearmonth", y="result_views", label="Result views", ax=ax)
sns.lineplot(data=statistic_per_month, x="yearmonth", y="expose_views", label="Expose views", ax=ax)
sns.lineplot(data=statistic_per_month, x="yearmonth", y="phone_clicks", label="Phone clicks", ax=ax)
ax.set(xlabel='Year - Month', ylabel='Views count')
# Plain tick labels: avoid scientific notation on the y axis
ax.ticklabel_format(style='plain', axis='y')
ax.legend()
ax.set_title('Count of result views, expose views and phone clicks per month for all properties', fontsize=14);

## Plotting distribution of the features by listing ID

#### First, plot the `inquiry_count` per accommodation 

In [None]:
# Histogram (500 bins) of the total inquiries per accommodation
fig, ax = plt.subplots(figsize=(15, 5))
statistic_per_id['inquiry_count'].hist(bins=500, ax=ax)

In [None]:
# Boxplot of the total inquiries per accommodation
inquiry_columns = ['inquiry_count']
statistic_per_id.boxplot(column=inquiry_columns)

In [None]:
# Summary statistics of the total inquiries per accommodation
statistic_per_id['inquiry_count'].describe()

### 1. Assumption: the average length of stay is 7 days
### 2. Assumption: a good occupancy rate is 50 % of the year
### 3. Assumption: every third request generates a booking
### We need: 52 weeks per year × 50 % occupancy = 26 booked weeks per year --> 26 bookings × 3 requests per booking = 78 inquiries are perfect!

How many houses have between 70 and 90 inquiries?

In [None]:
# Select the accommodations whose total inquiries lie in the "perfect" band (70 to 90, both inclusive) and count them
in_perfect_band = statistic_per_id['inquiry_count'].between(70, 90)
property_inquiry_70_90 = statistic_per_id[in_perfect_band]
len(property_inquiry_70_90)

2268 houses have a perfect inquiry count! That is 7.5 %.

How many houses have fewer or more inquiries?

In [None]:
# Count the accommodations per inquiry band.
# Each entry pairs a DataFrame.query condition with the message printed after the count.
# The "less than 60" band uses `<`; it previously queried `> 60`, which contradicted its message.
inquiry_bands = [
    ('inquiry_count >= 60 & inquiry_count <= 70', 'properties have between 60 and 70 inquiries.'),
    ('inquiry_count < 60', 'properties have less than 60 inquiries.'),
    ('inquiry_count >= 90 & inquiry_count <= 100', 'properties have between 90 and 100 inquiries.'),
    ('inquiry_count > 100', 'properties have more than 100 inquiries.'),
    ('inquiry_count >= 60 & inquiry_count <= 100', 'properties have between 60 and 100 inquiries.'),
]
for condition, message in inquiry_bands:
    print(statistic_per_id.query(condition)['listing_id'].count(), message)

#### Second, plot the `result_views`, `expose_views`, and `phone_clicks` per accommodation

In [None]:
# Side-by-side boxplots of result views, expose views and phone clicks per accommodation
view_columns = ['result_views', 'expose_views', 'phone_clicks']
statistic_per_id.boxplot(column=view_columns);

### Split the dataset into the years 2019 and 2020

In [None]:
# Split dataset: 2019
# Both bounds are inclusive. The end date is the last calendar day of the year (same
# convention as the 2020 split), so any December row is kept even if its `yearmonth`
# value is not dated on the 1st of the month.
start_date = "2019-01-01"
end_date = "2019-12-31"

after_start_date = df_statistics["yearmonth"] >= start_date
before_end_date = df_statistics["yearmonth"] <= end_date
between_two_dates = after_start_date & before_end_date
filtered_dates_2019 = df_statistics.loc[between_two_dates]

In [None]:
# Keep only the rows whose `yearmonth` falls within calendar year 2020 (bounds inclusive)
start_date, end_date = "2020-01-01", "2020-12-31"

after_start_date = df_statistics["yearmonth"].ge(start_date)
before_end_date = df_statistics["yearmonth"].le(end_date)
between_two_dates = after_start_date & before_end_date
filtered_dates_2020 = df_statistics[between_two_dates]

In [None]:
# Totals per accommodation, separately for 2019 and 2020.
# numeric_only=True keeps the datetime column `yearmonth` out of the aggregation; pandas >= 2.0
# raises a TypeError when asked to sum datetimes instead of silently dropping the column.
statistic_per_id_2019 = filtered_dates_2019.groupby('listing_id').sum(numeric_only=True).reset_index()
statistic_per_id_2020 = filtered_dates_2020.groupby('listing_id').sum(numeric_only=True).reset_index()

In [None]:
# Overlaid histograms of the inquiries per accommodation, 2019 vs. 2020.
inquiries_2019 = statistic_per_id_2019['inquiry_count']
inquiries_2020 = statistic_per_id_2020['inquiry_count']
# Both histograms use the same value range, so their 500 bins share identical edges and
# the bar heights of the two years can be compared directly.
shared_range = (
    min(inquiries_2019.min(), inquiries_2020.min()),
    max(inquiries_2019.max(), inquiries_2020.max()),
)
fig, ax = plt.subplots(figsize=(15,5))
inquiries_2019.hist(bins=500, range=shared_range, alpha=0.5, ax=ax)
inquiries_2020.hist(bins=500, range=shared_range, alpha=0.5, ax=ax)
ax.legend(['2019', '2020'])

In [None]:
# Summary statistics of the inquiries per accommodation: 2019 first, then 2020, separated by a rule
summary_2019 = statistic_per_id_2019['inquiry_count'].describe()
summary_2020 = statistic_per_id_2020['inquiry_count'].describe()
print(summary_2019, '--------------------------------------------', summary_2020, sep='\n')