# This notebook will serve for the initial EDA for the statistics data for the TFW project

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw statistics export into a DataFrame
statistics_csv_path = '../data/statistics_20210707.csv'
df_statistics = pd.read_csv(statistics_csv_path)

In [None]:
# Report the size of the dataset.
# The row count is the number of records, not the number of accommodations:
# the table has a `yearmonth` column next to `listing_id`, so one accommodation
# can appear in several rows. Distinct accommodations are counted via `listing_id`.
n_rows, n_features = df_statistics.shape
n_accommodations = df_statistics['listing_id'].nunique()
print(
    'The dataset contains %s rows for %s different accommodations and %s features'
    % (n_rows, n_accommodations, n_features)
)

In [None]:
# Preview the first five rows to get a feel for the columns and their values
df_statistics.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
df_statistics.info()

This dataset contains no categorical features.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numerical features
df_statistics.describe()

## Feature conversion

First, convert the date feature `yearmonth` to datetime.

In [None]:
# Parse the `yearmonth` column into pandas datetimes and store it back in the same column.
# NOTE(review): if `yearmonth` is stored as an integer such as 201901, pd.to_datetime
# would read it as nanoseconds since the epoch -- confirm it is a date string, or pass `format=`.
yearmonth_as_datetime = pd.to_datetime(df_statistics['yearmonth'])
df_statistics['yearmonth'] = yearmonth_as_datetime

## Looking for correlations

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric/boolean columns are correlated: `yearmonth` is a datetime at this
# point, and newer pandas versions no longer drop non-numeric columns silently.
corr = df_statistics.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so that long feature names stay readable
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
ax.set_title('Correlation between the numeric features');

In [None]:
# Show the same correlation matrix as a colour-graded table
corr_table = corr.style.background_gradient(cmap='coolwarm')
corr_table

## Grouping the statistics

### 1. Grouping by `yearmonth`

In [None]:
# Aggregate the statistics per month by summing every metric over all accommodations.
# `listing_id` is an identifier, so it is dropped first: a sum of IDs is meaningless.
# numeric_only=True restricts the sum to numeric columns.
statistic_per_month = (
    df_statistics
    .drop(columns='listing_id')
    .groupby('yearmonth')
    .sum(numeric_only=True)
    .reset_index()
)
statistic_per_month

### 2. Grouping by accommodation ID

In [None]:
# Aggregate the statistics per accommodation by summing every metric over all months.
# numeric_only=True keeps the datetime column `yearmonth` out of the sum; newer pandas
# versions no longer drop such columns silently.
statistic_per_id = (
    df_statistics
    .groupby('listing_id')
    .sum(numeric_only=True)
    .reset_index()
)
statistic_per_id

## Plotting distribution of the features

#### First, plotting the distribution of `inquiry_count` per month.

In [None]:
# Line plot of the total number of inquiries per month.
# The axes are passed explicitly so the plot does not depend on matplotlib's
# implicit "current axes"; the trailing `;` hides the Axes repr in the output.
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(data=statistic_per_month, x="yearmonth", y="inquiry_count", ax=ax)
ax.set(title="Inquiries per month", xlabel="Month", ylabel="Number of inquiries");

The corona pandemic appears to have influenced the inquiries. In 2019 the inquiries stay at nearly the same level, varying around 60,000. From January 2020 to April 2020 (lockdown) they decrease to around 20,000; from April 2020 onwards they increase steeply to about 130,000. In September and October 2020 the inquiries drop back to around 20,000. During the corona pandemic many people did not travel at all or, if they did, tried to stay in Germany.

#### Second, plotting the distribution of `direct_booking_count` per month

In [None]:
# Total number of direct bookings over all accommodations and months
direct_bookings_total = df_statistics['direct_booking_count'].sum()
print('Sum of direct bookings:', direct_bookings_total)

In [None]:
# Line plot of the total number of direct bookings per month.
# The axes are passed explicitly so the plot does not depend on matplotlib's
# implicit "current axes"; the trailing `;` hides the Axes repr in the output.
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(data=statistic_per_month, x="yearmonth", y="direct_booking_count", ax=ax)
ax.set(title="Direct bookings per month", xlabel="Month", ylabel="Number of direct bookings");

#### Plot the `inquiry_count` per accommodation 

In [None]:
# Histogram of the total inquiries per accommodation over the whole period.
# The axes are passed explicitly and labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(15, 5))
statistic_per_id['inquiry_count'].hist(bins=500, ax=ax)
ax.set(
    title="Distribution of inquiries per accommodation",
    xlabel="Number of inquiries",
    ylabel="Number of accommodations",
);

In [None]:
# Summary statistics of the per-accommodation inquiry totals
statistic_per_id['inquiry_count'].describe()

In [None]:
# How many accommodations reach at least the 75% quantile of inquiries in the two years?
# The threshold is computed from the data instead of hard-coding the value read off
# `describe()` (58 in the original run), so the cell stays correct if the data changes.
inquiry_threshold = statistic_per_id['inquiry_count'].quantile(0.75)
# Count matching rows once instead of returning a non-null count per column
n_top_accommodations = int((statistic_per_id['inquiry_count'] >= inquiry_threshold).sum())
n_top_accommodations

### Split the dataset into the two years 2019 and 2020

In [None]:
# Split dataset: 2019
# The upper bound is the last day of the year. The previous bound "2019-12-1" would
# drop any December record dated after the 1st and did not match the 2020 split,
# which ends on "2020-12-31".
start_date = "2019-01-01"
end_date = "2019-12-31"

# `between` includes both bounds
between_two_dates = df_statistics["yearmonth"].between(start_date, end_date)
filtered_dates_2019 = df_statistics.loc[between_two_dates]

In [None]:
# Split dataset: 2020
start_date, end_date = "2020-01-01", "2020-12-31"

# Keep the rows whose `yearmonth` lies inside the inclusive [start_date, end_date] window
yearmonth_column = df_statistics["yearmonth"]
between_two_dates = (yearmonth_column >= start_date) & (yearmonth_column <= end_date)
filtered_dates_2020 = df_statistics.loc[between_two_dates]

In [None]:
# Aggregate the statistics per accommodation, separately for each year.
# numeric_only=True keeps the datetime column `yearmonth` out of the sum; newer pandas
# versions no longer drop such columns silently.
statistic_per_id_2019 = (
    filtered_dates_2019.groupby('listing_id').sum(numeric_only=True).reset_index()
)
statistic_per_id_2020 = (
    filtered_dates_2020.groupby('listing_id').sum(numeric_only=True).reset_index()
)

In [None]:
# Overlaid histograms of the inquiries per accommodation for 2019 and 2020.
# Both years share one set of bin edges: with `bins=500` passed to each call separately,
# every series gets its own bin width and the bar heights are not comparable.
inquiries_2019 = statistic_per_id_2019['inquiry_count']
inquiries_2020 = statistic_per_id_2020['inquiry_count']
shared_bins = np.histogram_bin_edges(
    pd.concat([inquiries_2019, inquiries_2020]).to_numpy(), bins=500
)

fig, ax = plt.subplots(figsize=(15, 5))
# Explicit labels tie each legend entry to its series instead of relying on draw order
inquiries_2019.hist(bins=shared_bins, alpha=0.5, ax=ax, label='2019')
inquiries_2020.hist(bins=shared_bins, alpha=0.5, ax=ax, label='2020')
ax.set(
    title="Inquiries per accommodation: 2019 vs. 2020",
    xlabel="Number of inquiries",
    ylabel="Number of accommodations",
)
ax.legend();

In [None]:
# Print the summary statistics of inquiries per accommodation for 2019 and 2020,
# separated by a dashed line
separator = '--------------------------------------------'
print(
    statistic_per_id_2019['inquiry_count'].describe(),
    separator,
    statistic_per_id_2020['inquiry_count'].describe(),
    sep='\n',
)