# This notebook will serve for the initial EDA for the listings data for the TFW project

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Value of the PWD environment variable (None when the variable is not set).
# NOTE(review): ROOT is not referenced by any of the visible cells.
ROOT = os.environ.get('PWD')

In [None]:
# Load the raw listings export.
# The frame has to be bound to `df_listings`: every following cell refers to
# it under that name, so any other name makes "Run All" fail with a NameError.
df_listings = pd.read_csv('../data/listings_20210707.csv')

In [None]:
# Shape of the dataset
print('The dataset contains %s different accommodations and %s features' %(df_listings.shape[0], df_listings.shape[1]))

In [None]:
# Have a first look at the dataset
df_listings.head()

In [None]:
# First look at the info
df_listings.info()

In [None]:
# First description of the numerical features
df_listings.describe()

In [None]:
# Looking for categorical features
df_listings.nunique()

The dataset contains many categorical features that we need to process further.

## First cleaning steps

As Traum-Ferienwohnungen told us, the dataset only contains accommodations located in Germany, which the feature `country_title` confirms. Because this column therefore carries no information, we can drop it.

In [None]:
# Remove the column `country_title` from the frame
df_listings = df_listings.drop(columns='country_title')

The feature `pets` includes only missing values and zeros. In my opinion, this column records the number of pets that are allowed. If pets are allowed or not (or on request) are covered in following columns: `option_holiday_with_your_pet`, `option_holiday_with_your_horse`, `option_holiday_with_your_dog`. For this reason, we decided to drop this column too.

In [None]:
# Print the distinct values of `pets`, then remove the column
print(df_listings['pets'].unique())
df_listings = df_listings.drop(columns='pets')

## Feature converting

First, convert the date feature `contract_end` to datetime.

In [None]:
# Parse the column `contract_end` into datetime values
contract_end_parsed = pd.to_datetime(df_listings['contract_end'])
df_listings['contract_end'] = contract_end_parsed

The feature `living_area` contains some values given as a range. As Traum-Ferienwohnungen recommends, we take the first number of each range as the correct one and then convert the column to a numeric type.

In [None]:
# Replace each range in `living_area` with its first number.
# The mapping is applied to `living_area` only: a frame-wide replace would also
# rewrite any other column that happens to contain one of these strings.
LIVING_AREA_RANGES = {
    '70-280': '70', '50-100': '50', '50-70': '50', '24-49': '24',
    '16 - 26': '16', '18 - 26': '18', '88-100': '88', '46-73': '46',
    '50-80': '50', '52-65': '52', '50-60': '50',
}
df_listings['living_area'] = df_listings['living_area'].replace(LIVING_AREA_RANGES)

In [None]:
# Convert column `living_area` to a numeric dtype (the cast below uses float, not integer)
df_listings['living_area'] = df_listings['living_area'].astype(float)

To use the option features in the model, we convert the booleans / categories to integers as following:

- False / no / Not allowed >> 0
- True / yes / Allowed >> 1
- On request >> 2
- Unset >> 3

In [None]:
# Encode the option values as integers, across the whole frame and in place:
#   'False' / 'no' / 'not allowed' -> 0
#   'True'  / 'yes' / 'allowed'    -> 1
#   'on request'                   -> 2
#   'unset'                        -> 3
option_codes = [
    ('False', 0), ('no', 0), ('not allowed', 0),
    ('True', 1), ('yes', 1), ('allowed', 1),
    ('on request', 2),
    ('unset', 3),
]
old_values = [old for old, _ in option_codes]
new_values = [new for _, new in option_codes]
df_listings.replace(old_values, new_values, inplace=True)

## Looking for correlations

In [None]:
# Correlation heatmap of the numerical features.
# Only numeric and boolean columns are handed to `corr()`: the frame still
# contains text and datetime columns, which recent pandas versions refuse to
# correlate instead of silently skipping them.
corr = df_listings.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,
)
# Rotate the column labels so that long feature names stay readable
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Show the correlation matrix `corr` (computed in the heatmap cell) as a table
# whose cell backgrounds follow the 'coolwarm' colormap
corr.style.background_gradient(cmap='coolwarm')

## Plotting distribution of the features

In [None]:
# Plotting the feature state
df_listings.state.hist()

In [None]:
# Plotting the feature contract end
df_listings.contract_end.hist()

In [None]:
# Plotting histograms of numerical features 
df_listings.hist(bins=50, figsize = (30,30))
plt.show()

## Observations from the feature distributions

In [None]:
# Count values for categories
print('option_non_smoking_only:\n', df_listings['option_non_smoking_only'].value_counts())
print('smoking:\n',df_listings['smoking'].value_counts())

In [None]:
# Create a sub dataset that contains only True / False values for the columns
smoking = df_listings.query("option_non_smoking_only == [0 ,1] & smoking == [1, 0]")

In [None]:
# Check for contrary columns 
comparison_column_smoking = np.where(smoking["option_non_smoking_only"] != smoking["smoking"], True, False)
print(np.all(comparison_column_smoking))

In [None]:
# Drop the column wheelchairaccess
df_listings = df_listings.drop('smoking', axis=1)

3. The histograms of the features `option_close_to_the_beach` and `option_close_to_the_water` look very similar. All accommodations close to the beach are close to the water too. But because the differences between being close to the water and being close to the beach (for example a dike, a harbour or a lake) are important for guests, we keep both features.


4. For the features `option_close_to_the_beach`, `option_close_to_the_water`, `option_close_to_the_ski_lift`, `option_railway_station` and `option_airport` the share of unset values is high:

In [None]:
# Share of rows (in percent, one decimal) whose value is 3 ("unset") for each
# location option.
# The share is taken directly from the boolean mask. The previous
# `.count()[1]` relied on positional indexing of a label-indexed Series and
# counted the non-null entries of whichever column happened to be second.
UNSET = 3
print('Percent of unset values in feature')
for label, column in [
    ('Beach nearby:', 'option_close_to_the_beach'),
    ('Water nearby:', 'option_close_to_the_water'),
    ('Ski lift nearby:', 'option_close_to_the_ski_lift'),
    ('Railway station:', 'option_railway_station'),
    ('Airport:', 'option_airport'),
]:
    print(label, round((df_listings[column] == UNSET).mean() * 100, 1))

Because of the high number of unset values in the features `option_railway_station` (95.2%) and `option_airport` (98.2%), we drop these columns: they give us no useful information. For now we keep the features `option_close_to_the_beach` (67%), `option_close_to_the_water` (63.8%) and `option_close_to_the_ski_lift` (80.7%), because they could be important for the clustering model and because they are important information for guests choosing the right accommodation.

In [None]:
# Remove the columns `option_railway_station` and `option_airport`
mostly_unset_columns = ['option_railway_station', 'option_airport']
df_listings = df_listings.drop(columns=mostly_unset_columns)

5. We decided to keep a single internet feature instead of the two separate features `wifi` and `internet`, because it is important to know whether a property has internet, not which kind of internet. For this reason we change a no / 0 in the feature `internet` to yes / 1 if the feature `wifi` is yes / 1.

In [None]:
# Replace value from column internet with column wifi
df_listings['internet'] = np.where(df_listings['internet'] == 0, df_listings['wifi'], df_listings['internet'])

In [None]:
# Drop the column wifi
df_listings = df_listings.drop('wifi', axis=1)

## Bathrooms

In [None]:
# Description of the feature bathrooms
df_listings.bathrooms.describe()

In [None]:
# Number of accommodation per bathroom number
df_listings.groupby('bathrooms')['listing_id'].count()

We have a few accommodations with a high number of bathrooms and we have to decide how we want to handle this.

In [None]:
# The numerical features are correlated, so compare the median of each of
# them per bathroom count to see whether there is a connection
numerical_columns = ['bathrooms', 'bedrooms', 'max_guests', 'living_area']
numerical_features = df_listings[numerical_columns]
numerical_features.groupby(by='bathrooms').median()

## Bedrooms

In [None]:
# Description of the feature bedrooms
df_listings.bedrooms.describe()

In [None]:
# Number of accommodation per bedroom number
df_listings.groupby('bedrooms')['listing_id'].count()

In [None]:
# Because of a higher correlation between the numerical features, we're looking for the median value for all features per bedroom number to see if there is a connection 
numerical_features.groupby('bedrooms').median()

With a higher number of bedrooms the number of bathrooms, maximum guests and living area also increase.

#### Maximum guests

In [None]:
# Description of the feature maximum guests
df_listings.max_guests.describe()

In [None]:
# Number of accommodation per maximum guest number
df_listings.groupby('max_guests')['listing_id'].count()

In [None]:
# Because of a higher correlation between the numerical features, we're looking for the median value for all features per maximum guests number to see if there is a connection 
numerical_features.groupby('max_guests').median()

With a higher number of maximum guests, the number of bathrooms, the number of bedrooms and the living area do not increase consistently. There is no clear pattern.

#### Living area

In [None]:
# Summary statistics of the feature `living_area`
df_listings['living_area'].describe()

## Regions

In [None]:
# Number and names of region
print(' Number of different regions:', df_listings.region.nunique())
print(df_listings.region.unique())

In [None]:
# Distribution
fig, ax = plt.subplots(figsize=(15,5))
df_listings.region.hist(bins= 29)
plt.xticks(rotation='vertical');

In [None]:
# Sort regions by highest value
df_listings.groupby('region')['listing_id'].count().sort_values(ascending=False)

## Subregions

In [None]:
# Number and names of subregions
print('Number of different subregions:', df_listings.subregion.nunique())
print(df_listings.subregion.unique())

In [None]:
# Distribution
fig, ax = plt.subplots(figsize=(30,5))
df_listings.subregion.hist(bins=196)
plt.xticks(rotation='vertical');

In [None]:
# Sort subregion by highest value
df_listings.groupby('subregion')['listing_id'].count().sort_values(ascending=False)

### 3. Holiday region

In [None]:
# Number and names of holiday regions
print('Number of different holdiday regions:', df_listings.holiday_region.nunique())
print(df_listings.holiday_region.unique())

In [None]:
# Distribution
fig, ax = plt.subplots(figsize=(15,5))
df_listings.holiday_region.hist(bins=29)
plt.xticks(rotation='vertical');

In [None]:
# Sort holiday region by highest value
df_listings.groupby('holiday_region')['listing_id'].count().sort_values(ascending=False)

## Zip codes

In [None]:
# Number and names of zipcodes
print('Number of different zipcodes:', df_listings.zip.nunique())
print(df_listings.zip.unique())

In [None]:
# Distribution
fig, ax = plt.subplots(figsize=(30,5))
df_listings.zip.hist(bins=302)
plt.xticks(rotation='vertical');

In [None]:
# Sort zipcode by highest value
df_listings.groupby('zip')['listing_id'].count().sort_values(ascending=False)

## EDA: property type

In [None]:
# Number and names of property types
print('Number of different property type:', df_listings.property_type.nunique())
print(df_listings.property_type.unique())

In [None]:
# Distribution
fig, ax = plt.subplots(figsize=(15,5))
df_listings.property_type.hist(bins=25)
plt.xticks(rotation='vertical');

In [None]:
# Sort property type by highest value
df_listings.groupby('property_type')['listing_id'].count().sort_values(ascending=False)

# Data Cleaning
## 1. Outliers in numerical values

First, we look at the boxplots to get an overview about the outliers in the numerical features. After this we look for different quantiles to decide the cut off threshold.

In [None]:
# Boxplots of `bathrooms` and `bedrooms`, drawn into one figure
df_listings.boxplot(column=['bathrooms', 'bedrooms'])

In [None]:
# Boxplot of `max_guests`
df_listings.boxplot(column='max_guests')

In [None]:
# Boxplot of `living_area`
df_listings.boxplot(column='living_area')

We decided to go with the statistical procedure and cut off with quantile 0.95. For this, we look for the cut off threshold for all numerical features. With this procedure we drop extreme houses and set the focus to the 'standard' houses. The probability that a new customer has a 'standard' house is much higher than having a special house. So we want to model a good price calculator for 'standard' houses because for special houses the calculator would be inaccurate no matter if we keep them or not. 

In [None]:
# Print the 0.95 quantile of each numerical feature
print('Quantiles 0.95 for:')
for label, column in (
    ('Bathrooms:', 'bathrooms'),
    ('Bedrooms:', 'bedrooms'),
    ('Maximum guests:', 'max_guests'),
    ('Living_area:', 'living_area'),
):
    print(label, df_listings[column].quantile([.95]))

### Cleaning procedure Outliers: we start with the bedrooms, continue with bathrooms, maximum guests and living area.

**A. Outliers bedrooms**

Drop all rows with a number of bedrooms greater than 3.

In [None]:
# How many rows do we drop?
print('Number of dropping rows:', df_listings.query('bedrooms > 3').shape[0])

In [None]:
# Get names of indexes for which column bedrooms has value greater than 3
indexNames_bedrooms = df_listings[df_listings['bedrooms'] > 3].index
# Delete these row indexes from dataset
df_listings.drop(indexNames_bedrooms , inplace=True)

**B. Outliers bathrooms**

Drop all rows with a number of bathrooms greater than 2.

In [None]:
# How many rows do we drop?
print('Number of dropping rows:',df_listings.query('bathrooms > 2').shape[0])

In [None]:
# Get names of indexes for which column bathrooms has value greater than 2
indexNames_bathrooms = df_listings[df_listings['bathrooms'] > 2].index
# Delete these row indexes from dataset
df_listings.drop(indexNames_bathrooms , inplace=True)

**C. Outliers maximum guests**

We drop all rows with a greater number of maximum guests than 8.

In [None]:
# How many rows do we drop?
print('Number of dropping rows:', df_listings.query('max_guests > 8').shape[0])

In [None]:
# Get names of indexes for which column maximum guests has value greater than 8
indexNames_guests = df_listings[df_listings['max_guests'] > 8].index
# Delete these row indexes from dataset
df_listings.drop(indexNames_guests , inplace=True)

**D. Outliers living area**

We drop all rows with a living area greater than 140.

In [None]:
# How many rows do we drop?
print('Number of dropping rows:', df_listings.query('living_area > 140').shape[0])

In [None]:
# Get names of indexes for which column living area has value greater than 140
indexNames_living_area = df_listings[df_listings['living_area'] > 140].index
# Delete these row indexes from dataset
df_listings.drop(indexNames_living_area , inplace=True)

### Description of numerical features after cleaning the dataset for outliers

In [None]:
# Summary statistics of the numerical features after dropping the outliers
# (same columns as the `numerical_features` frame built earlier)
df_listings.loc[:, numerical_features.columns].describe()

## 3. Property type

If we want to model a good price calculator for 'standard' houses, we have to look at the property type again. As we saw in the EDA, the listings are spread over many property types.

In [None]:
# Distribution
fig, ax = plt.subplots(figsize=(15,5))
df_listings.property_type.hist(bins=25)
plt.xticks(rotation='vertical');

In [None]:
# Sort property type by highest value
df_listings.groupby('property_type')['listing_id'].count().sort_values(ascending=False)

We keep the most frequent property types: holiday apartment, holiday house, apartment and bungalow. Traum-Ferienwohnungen mentioned that holiday apartment and apartment are practically the same, so these count as three types. They make up the largest share of the dataset, and the other property types seem to be special houses.

In [None]:
# Keep only the listings whose property type is in the list below.
# `.copy()` makes the result an independent frame: later cells modify
# `df_listings` in place and assign columns to it, which would otherwise
# trigger pandas' SettingWithCopyWarning on the filtered slice.
kept_property_types = ['holiday_apartment', 'holiday_houses', 'apartment', 'bungalow']
df_listings = df_listings[df_listings['property_type'].isin(kept_property_types)].copy()
print('The clean dataset contains:', df_listings.shape[0], 'unique properties.')

## 2. Missing values

In [None]:
# Number of missing values per column
missing_mask = df_listings.isna()
missing_mask.sum()

We have one missing value in `subregion`, 430 missing values in `bathrooms`, `bedrooms`, and `max_guests` and 431 missing values in `living_area`.

First, we drop the 430 rows with missing values for `bathrooms`, `bedrooms` and `max_guests`. 

In [None]:
# Remove the rows in which `bathrooms` is missing.
# NOTE(review): only `bathrooms` is checked here; this presumably also covers
# the missing `bedrooms` / `max_guests` values - confirm they occur in the same rows.
df_listings = df_listings.dropna(subset=['bathrooms'])

Second, we are looking for the two rows with missing values to decide how to replace them.

In [None]:
# Display every row that still has at least one missing value
has_missing_value = df_listings.isnull().any(axis=1)
df_listings[has_missing_value]

We decided to replace the subregion with the mode value for this zipcode (237--).

In [None]:
# Frequency of each subregion among the listings with zip code '237--'
in_zip_237 = df_listings['zip'] == '237--'
df_listings.loc[in_zip_237, 'subregion'].value_counts()

Lübecker Bucht is the mode. So we replace with this value.

In [None]:
# Fill every missing `subregion` with "Lübecker Bucht"
subregion_filled = df_listings["subregion"].fillna("Lübecker Bucht")
df_listings["subregion"] = subregion_filled

We calculate the mean living area for a property with 1 bathroom, 2 bedrooms, and a number of maximum guests of 4.

In [None]:
# Fill missing `living_area` values with the mean living area (rounded to one
# decimal) of the properties with 1 bathroom, 2 bedrooms and 4 maximum guests.
# The column is selected before averaging, so the text and datetime columns of
# the frame are never passed to `mean()`, which recent pandas versions reject.
reference_living_area = round(
    df_listings.query('bathrooms == 1 & bedrooms == 2 & max_guests == 4')['living_area'].mean(),
    1,
)
df_listings["living_area"] = df_listings["living_area"].fillna(reference_living_area)

## 3. Zero values
The dataset also contains zero values for bathrooms and bedrooms. Traum-Ferienwohnungen mentioned that this is possible for different reasons, so we keep these values.

In [None]:
# Number of rows with zero bathrooms and with zero bedrooms
print('Zero values')
for label, condition in (
    ('Bathrooms:', 'bathrooms == 0'),
    ('Bedrooms:', 'bedrooms == 0'),
):
    print(label, len(df_listings.query(condition)))

## New EDA after cleaning

### Distribution of the numerical features

In [None]:
# Plot one histogram per numerical feature of the cleaned dataset
# (50 bins each, on a 30 x 30 inch figure)
df_listings.hist(bins=50, figsize = (30,30))
plt.show()

#### Closer look to some features

In [None]:
# Value counts of three option features, separated by a dashed line
features_to_count = (
    ('Non Smoking:\n', 'option_non_smoking_only'),
    ('Airconditioner:\n', 'airconditioner'),
    ('Family travel:\n', 'option_family_travel'),
)
for position, (label, column) in enumerate(features_to_count):
    if position > 0:
        print('--------------------')
    print(label, df_listings[column].value_counts())

In [None]:
# Share of rows (in percent, one decimal) matching the given value per feature.
# The share is taken directly from the boolean mask. The previous
# `.count()[1]` relied on positional indexing of a label-indexed Series and
# counted the non-null entries of whichever column happened to be second.
# NOTE(review): the header says "false values", but two of the three features
# are compared with 1 - confirm the intended wording.
print('Percent of false values in feature')
for label, column, value in [
    ('Non smoking:', 'option_non_smoking_only', 1),
    ('Airconditioner:', 'airconditioner', 0),
    ('Family travel:', 'option_family_travel', 1),
]:
    print(label, round((df_listings[column] == value).mean() * 100, 1))

Because of the high value for no airconditioner (98.9%) we drop this column because it has no information for us. We still keep the feature non smoking (96.6% true) because it's an important information for the guests.

In [None]:
# Remove the column `airconditioner`
df_listings = df_listings.drop(columns='airconditioner')

### Overview about the Regions

In [None]:
# Number of distinct values of each geographic feature after cleaning
for label, column in (
    (' Number of different regions:', 'region'),
    (' Number of different subregions:', 'subregion'),
    (' Number of different holiday regions:', 'holiday_region'),
    (' Number of different zipcodes:', 'zip'),
):
    print(label, df_listings[column].nunique())

We lost three subregions and eleven zip codes through our data cleaning.

### Shape of the cleaned dataset

In [None]:
# Size of the cleaned dataset and number of distinct customers owning the properties
n_properties, n_features = df_listings.shape
n_customers = df_listings['customer_id'].nunique()
print('The dataset contains %s different properties and %s features.' % (n_properties, n_features))
print('The %s properties are owned by %s customers.' % (n_properties, n_customers))
print('So in average each customer owns %s properties.' % (n_properties / n_customers))

# Merge mean latitude and longitude for zip code and merge with listings

In [None]:
# Load the dataset
df_zipcode = pd.read_csv('../data/plz_geocoord.csv', delimiter=';')

In [None]:
# Rename the unnamed column
df_zipcode.rename(columns={"Unnamed: 0": "zip"}, inplace=True)

In [None]:
# Remove last two digits in the feature postal_code to merge with dataset listings
df_zipcode['zip'] = df_zipcode['zip'].floordiv(100)

In [None]:
# Add leading Zeros to postal code to get a postal code of 5 digits
df_zipcode['zip'] = df_zipcode['zip'].apply(lambda x: '{0:0>3}'.format(x))

In [None]:
# Replace comma with dot
df_zipcode = df_zipcode.apply(lambda x: x.str.replace(',','.'))

In [None]:
# Convert latitude and longitude to numerical values
df_zipcode['lat'] = pd.to_numeric(df_zipcode['lat'])
df_zipcode['lng'] = pd.to_numeric(df_zipcode['lng'])

In [None]:
# groupby zipcode to calculate the mean for latitude and longitude
df_zipcode = df_zipcode.groupby('zip').mean().reset_index()

In [None]:
# Export csv master zipcode
#df_zipcode.to_csv('../data/master_zipcode_20210715.csv')

In [None]:
# Replace comma with dot in dataset listings for merging
df_listings['zip'] = df_listings['zip'].apply(lambda x: x.replace('--',''))

In [None]:
# merge both datasets listings and zipcode
df_master = pd.merge(df_listings, df_zipcode, how='left', on='zip')

In [None]:
# Sanity check of the merge: scatter the mean coordinates of all listings
# Extract the data we're interested in
lat, lon = df_master['lat'], df_master['lng']
properties, zipcode = df_master['listing_id'], df_master['zip']

# Scatter the points in a single colour, without a legend label.
# No `cmap` is passed: without colour data (`c=`) matplotlib ignores the
# colormap and only emits a warning about it.
fig, ax = plt.subplots()
ax.scatter(lon, lat, label=None, linewidth=0, alpha=0.5)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

ax.set_title('Traum-Ferienwohnungen: Properties and zipcode');

# Save cleaned dataset in a new CSV

In [None]:
# Export csv
#df_listings.to_csv('../data/master_listings_20210715.csv')