# Exploratory data analysis (EDA) of the cleaned dataset that we used for modeling

## 1. Libraries and loading CSV

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "talk" plotting context with fonts scaled up by 1.5 for all plots
sns.set_context(context="talk", font_scale=1.5)

In [2]:
# Read the master dataset from the project's data folder
df_master = pd.read_csv(filepath_or_buffer='../data/super_master.csv')

In [None]:
# Preview the first five rows of the dataset
df_master.head(n=5)

## 2. Remove unnecessary columns and show shape of dataframe

In [3]:
# Remove unnecessary unnamed columns.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell no longer raises
# a KeyError.
df_master = df_master.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y'],
    errors='ignore',
)

In [None]:
# Report the number of rows (observations) and columns (features)
n_rows, n_cols = df_master.shape
print('The dataset contains %s oberservations and %s features' % (n_rows, n_cols))

## 3. Convert date features to the right data type and show a first description

In [4]:
# Parse the arrival_date column into pandas datetime values
df_master['arrival_date'] = df_master['arrival_date'].pipe(pd.to_datetime)

In [None]:
# Summary statistics of the numerical features, rounded to three decimals
df_master.describe().round(3)

## 4. Number of properties and filtering for year 2019

In [None]:
# Count the distinct properties and list the years covered by the data
n_properties = df_master['listing_id'].nunique()
years_included = df_master['year'].unique()
print('Number of unique properties:', n_properties)
print('The included years are', years_included)

In [None]:
# Number of observations per year: absolute count, then share of all rows in percent.
# One loop replaces four copy-pasted query() calls; each year is counted with a
# boolean sum instead of materialising a filtered copy of the whole frame twice.
# The printed output (count line, percentage line, per year) is unchanged.
n_total = df_master.shape[0]
for yr in (2019, 2020):
    n_year = int((df_master['year'] == yr).sum())
    print(n_year)
    print(n_year / n_total * 100)

The dataset includes inquiries from the years 2019 and 2020 and covers 17,185 different properties. Of a total of 6,081,983 observations, 1,881,180 observations are from 2019 (30.9%) and 4,200,803 observations are from 2020 (69.1%).

Due to the influence of the COVID-19 pandemic on the inquiries in 2020 (as we could see in the EDA of the inquiries), we are focusing on the year 2019.

In [None]:
# Filter dataset for year 2019.
# .copy() makes the result an independent frame, so adding columns to it later
# (e.g. cat_inquiry_rate) does not trigger pandas' SettingWithCopyWarning.
df_master_2019 = df_master.query('year == 2019').copy()

In [None]:
# Persist the 2019 subset as CSV in the data folder.
# NOTE(review): to_csv writes the row index by default, so reading this file back
# yields an extra 'Unnamed: 0' column; consider index=False once downstream
# readers of this file have been checked.
df_master_2019.to_csv(path_or_buf='../data/super_master_2019.csv')

# To skip the filtering above, the saved file can be loaded instead:
#df_master_2019 = pd.read_csv('../data/super_master_2019.csv')

In [None]:
# Count the distinct properties that appear in the 2019 subset
n_properties_2019 = df_master_2019['listing_id'].nunique()
print('Number of unique properties in the year 2019:', n_properties_2019)

The dataset includes 1,881,180 inquiries for 17,000 different properties in the year 2019.

## 5. Grouping / Clustering features by inquiry rate 

### 1. We will define three categories of inquiry rate: low, middle, high. Inquiry rate was calculated by expose views and inquiry count. Let's see the distribution of inquiry rate.

In [None]:
# Distribution of the inquiry rate in 2019 as a horizontal boxplot
ax = sns.boxplot(data=df_master_2019, x="inquiry_rate")

In [None]:
# Distribution of the inquiry rate in 2019, one box per month
ax = sns.boxplot(data=df_master_2019, x="month", y="inquiry_rate")

We define the category "low" as the lowest 25% inquiry rates, the category "high" as the highest 25% inquiry rates and the category "middle" as the inquiry rates between the lowest and highest group.

In [None]:
# Summary statistics of the inquiry rate, followed by the lower and upper
# quartile values that serve as cut-offs for the categorical groups
rate_2019 = df_master_2019['inquiry_rate']
print(rate_2019.describe())
for quartile in (.25, .75):
    print(rate_2019.quantile(quartile))

In [None]:
# Create new column with the categories for inquiry rate

# Compute both quartile cut-offs once instead of re-scanning the column for
# every condition
rate_2019 = df_master_2019['inquiry_rate']
q25 = rate_2019.quantile(.25)
q75 = rate_2019.quantile(.75)

# Create a list of our conditions (np.select uses the first condition that matches)
conditions = [
    rate_2019 <= q25,
    (rate_2019 > q25) & (rate_2019 < q75),
    rate_2019 >= q75,
]

# Create a list of the values we want to assign for each condition
values = ['low', 'middle', 'high']

# Rows with a missing inquiry_rate match none of the conditions. They get an
# explicit string label instead of np.select's implicit integer default 0, which
# older NumPy coerced to the string '0' and newer NumPy rejects with a TypeError
# because it has no common dtype with the string labels.
# assign() returns a new frame, so the cell can be re-run and does not trigger
# SettingWithCopyWarning on the filtered 2019 frame.
df_master_2019 = df_master_2019.assign(
    cat_inquiry_rate=np.select(conditions, values, default='unknown')
)

### 2. Group features by category inquiry rate

## 6. Correlations between some features

In [None]:
# Generate the correlation heatmap.
# Only numeric/bool columns are passed to corr(): since pandas 2.0, corr() no
# longer drops non-numeric columns silently (numeric_only defaults to False) and
# can raise on columns it cannot convert, such as the datetime 'arrival_date'.
# NOTE(review): this uses df_master (2019 and 2020), not df_master_2019 — confirm
# that both years are intended here.
corr = df_master.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
# Rotate the x tick labels so long feature names stay readable; the trailing
# semicolon suppresses the list repr in the cell output
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Generate table with correlations, shaded with a colour gradient.
# Restricted to numeric/bool columns because corr() in pandas >= 2.0 no longer
# drops non-numeric columns silently and can raise on them.
df_master.select_dtypes(include=['number', 'bool']).corr().style.background_gradient(cmap='coolwarm')