# This notebook contains the EDA for the cleaned dataset that we used for modeling

## 1. Libraries and loading CSV

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Use seaborn's "talk" plotting context with fonts scaled by 1.5 for the plots below
sns.set_context(context="talk", font_scale=1.5)

In [None]:
# Read the dataset from its CSV file
master_csv_path = '../data/tiny_master.csv'
df_master = pd.read_csv(master_csv_path)

In [None]:
# Preview the first five rows of the dataset
df_master.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index saved by an earlier
# `to_csv` call - TODO confirm). `errors='ignore'` keeps this cell re-runnable:
# the previous `del` raised a KeyError on a second run or when the column was absent.
df_master = df_master.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Summary statistics of the numerical features, rounded to three decimals
df_master.describe().round(3)

In [None]:
# Print the frame summary (columns, non-null counts and dtypes)
df_master.info()

In [None]:
# Number of missing values in each column
df_master.isnull().sum()

## 2. Convert features to the right data types

In [None]:
# Drop the redundant date features
redundant_date_columns = ['yearmonth', 'year', 'month', 'date']
df_master = df_master.drop(columns=redundant_date_columns)

In [None]:
# Parse the arrival and departure dates as datetimes
for date_column in ('arrival_date', 'departure_date'):
    df_master[date_column] = pd.to_datetime(df_master[date_column])

# Split the arrival date into separate year and month features
arrival = df_master['arrival_date']
df_master['year_arrival'] = arrival.dt.year
df_master['month_arrival'] = arrival.dt.month

## 3. Add new feature `inquiry_count` - the target 

In [None]:
# Keys that define one group of inquiries.
# NOTE(review): the key contains `month_arrival` but not `year_arrival`, so the
# same calendar month of different years is pooled - confirm this is intended.
inquiry_keys = ['listing_id', 'month_arrival']

# Count the inquiries per group. Selecting `expose_views` before aggregating
# avoids counting every other column only to discard the result.
# `count` skips rows whose `expose_views` is missing.
inquiry_count = (
    df_master
    .groupby(inquiry_keys)['expose_views']
    .count()
    .reset_index(name='inquiry_count')
)

# Merge the counts back onto every inquiry row (`how='right'` keeps all rows of
# `df_master`). Dropping an existing `inquiry_count` first keeps the cell
# re-runnable without producing duplicate columns.
df_master = pd.merge(
    inquiry_count,
    df_master.drop(columns=['inquiry_count'], errors='ignore'),
    on=inquiry_keys,
    how='right',
)

## 4. Add new features `price_per_day`, `mean_price_per_day`, `min_price_per_day`, and `max_price_per_day`


In [None]:
# Calculate the new feature `price_per_day`: inquiry price divided by the length of stay.
# NOTE(review): a `length_stay` of 0 would yield inf/NaN here - confirm the data has none.
df_master['price_per_day'] = df_master['inquiry_price'] / df_master['length_stay']

price_keys = ['listing_id', 'month_arrival']
price_columns = {
    'mean': 'mean_price_per_day',
    'max': 'max_price_per_day',
    'min': 'min_price_per_day',
}

# Generate the features `mean_price_per_day`, `max_price_per_day`, and `min_price_per_day`.
# Only `price_per_day` is aggregated: running mean/max/min over every column
# fails on non-numeric columns in current pandas and wastes work.
price_agg = (
    df_master
    .groupby(price_keys)['price_per_day']
    .agg(['mean', 'max', 'min'])
    .rename(columns=price_columns)
    .reset_index()
)

# Merge the new columns onto every inquiry row. Dropping earlier copies of the
# aggregate columns first keeps the cell re-runnable without duplicates.
df_master = pd.merge(
    price_agg,
    df_master.drop(columns=list(price_columns.values()), errors='ignore'),
    on=price_keys,
    how='right',
)

In [None]:
# Report how many rows (inquiries) and columns (features) the dataset has
n_inquiries, n_features = df_master.shape
print('The dataset contains %s inquiries and %s features' % (n_inquiries, n_features))

## 5. Correlations

In [None]:
# Correlate the numeric and boolean columns only. Since pandas 2.0
# `DataFrame.corr` no longer drops non-numeric columns silently, so the
# datetime columns `arrival_date` / `departure_date` would raise an error.
corr = df_master.select_dtypes(include=['number', 'bool']).corr()

# Generate the heatmap on the explicitly created axes
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,
)
# Rotate the x tick labels so neighbouring feature names do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Show the correlation matrix as a colour-coded table
corr_styler = corr.style
corr_styler.background_gradient(cmap='coolwarm')

## 6. Closer Look: Features `inquiry_count` and `price_per_day`

### Inquiry count

In [None]:
# Mean inquiry count over all rows and separately per arrival year
inquiry_counts = df_master['inquiry_count']
arrival_years = df_master['year_arrival']
print('Mean inquiry count for the years 2019 and 2020:', round(inquiry_counts.mean(), 3))
print('Mean inquiry count for the year 2019:', round(inquiry_counts[arrival_years == 2019].mean(), 3))
print('Mean inquiry count for the year 2020:', round(inquiry_counts[arrival_years == 2020].mean(), 3))


In [None]:
# Histogram of the inquiry count with 100 bins
count_ax = df_master['inquiry_count'].hist(bins=100)
count_ax.set_title('Distribution inquiry count');

### Price per day

In [None]:
# Summary statistics of the price per day, rounded to three decimals
df_master['price_per_day'].describe().round(3)

In [None]:
# Rows whose price per day is exactly zero (counted through non-null `inquiry_price`)
is_zero_price = df_master['price_per_day'] == 0
zero_price_count = df_master.loc[is_zero_price, 'inquiry_price'].count()
print('Number of zero values in price per day:', zero_price_count)

# Share of those rows among all rows, in percent
print(round(zero_price_count / df_master.shape[0] * 100, 1), '% of the prices per day are zero.')

In [None]:
# Rows whose price per day is below 10 € (counted through non-null `inquiry_price`)
is_low_price = df_master['price_per_day'] < 10
low_price_count = df_master.loc[is_low_price, 'inquiry_price'].count()
print('Number of values less than 10 € in price per day:', low_price_count)

# Share of those rows among all rows, in percent
print(round(low_price_count / df_master.shape[0] * 100, 1), '% of the prices per day are less than 10 €.')

In [None]:
# Histogram of the price per day with 100 bins
price_ax = df_master['price_per_day'].hist(bins=100)
price_ax.set_title('Distribution price per day');

## 7. Scatterplots with target inquiry count

In [None]:

# Scatter the target `inquiry_count` against `result_views`
sns.scatterplot(x="inquiry_count", y="result_views", data=df_master);

In [None]:
# Deeper look at the features
# Average inquiry count for the test dataset

In [None]:
# Columns needed for the inquiry-rate check.
# `yearmonth` was dropped from `df_master` in section 2, so selecting it raised
# a KeyError; `year_arrival` and `month_arrival` identify the period instead.
# `.copy()` makes `test` an independent frame that can be extended safely.
test = df_master[
    ['listing_id', 'year_arrival', 'month_arrival', 'result_views', 'expose_views', 'inquiry_count']
].copy()
test.head()

In [None]:
# Inquiries per expose view, rounded to two decimals.
# `assign` returns a new frame instead of writing into a selection of
# `df_master`, which avoids pandas' SettingWithCopyWarning.
# Rows with zero `expose_views` come out as inf (NaN for 0/0); they are left
# in place here and filtered out where needed.
test = test.assign(
    inquiry_rate=(test['inquiry_count'] / test['expose_views']).round(2)
)

In [None]:
# Rows with more inquiries than expose views, leaving out infinite rates
rate_above_one = test['inquiry_rate'] > 1.00
rate_not_inf = test['inquiry_rate'] != np.inf
test[rate_above_one & rate_not_inf]