In [12]:
# Load the "autoreload" extension so that project modules can be edited while the kernel is running
# Mode 2: reload all modules before each cell execution, so changes in src are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import sys
sys.path.append('../') 

from src.utils import datacollector
from src.utils import constants as cst

from src.preprocessing import cleaning

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# REMINDER - OBJECTIVE
As a reminder, **our objective for modeling will be to be able to predict the price for a given listing**.  
We have seen that the price can be rather different depending on the period of the year. Of course, it can also vary with other features, each one with its own importance. We have observed, for example, that the neighbourhood is one of them.  
To build something that better reflects what happens in real life, I should merge the `listings.csv.gz` dataset with the `calendar.csv.gz` one, but that would give me a huge dataset with more than 20 million rows and over a hundred features.  
**For computational reasons, and as the goal of this project is to validate an approach rather than to build a perfect model, I choose to keep only the `listings.csv.gz` dataset and its 64K rows, and my goal will be to try to predict the mean price for each listing**.

In the sections below you will find all the preliminary steps that are required before starting the modeling phase.  
This will be based on what we have discovered during the [Data Understanding](1_Data_Understanding.ipynb) phase (see the 'summary' part for a reminder).

# 1. Load dataset & drop unnecessary features
The easiest thing to start with.

In [8]:
# Read the full listings file into memory.
# low_memory=False: let pandas infer each column dtype from the whole file
# instead of chunk by chunk (avoids mixed-type inference).
df_lst_full = pd.read_csv(
    datacollector.get_data_file(cst.LISTING_FULL_FILE),
    sep=',',
    header=0,
    low_memory=False,
)

In [9]:
def drop_cols(df, cols_to_drop):
    """Return a new DataFrame without the columns listed in `cols_to_drop`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched: `DataFrame.drop` returns a new object.
    cols_to_drop : str or list-like of str
        Column label(s) to remove. A single label may be given as a bare string
        and repeated labels are tolerated.

    Returns
    -------
    pandas.DataFrame
        The frame without the requested columns.

    Raises
    ------
    KeyError
        If a requested label is not a column of `df` (raised by `DataFrame.drop`).
    AssertionError
        If the number of remaining columns is not the expected one.
    """
    # A bare string is ONE label, not a sequence of one-character labels
    if isinstance(cols_to_drop, str):
        cols_to_drop = [cols_to_drop]
    # De-duplicate while keeping order so a repeated label is only requested once
    labels = list(dict.fromkeys(cols_to_drop))

    df_lst_reduced = df.drop(columns=labels)

    # Coherence control: count the columns of `df` that really match a requested label.
    # len(cols_to_drop) would be wrong as soon as a label is repeated in the list
    # (or if `df` itself carries duplicated column names).
    nb_dropped = int(df.columns.isin(labels).sum())
    assert df_lst_reduced.shape[1] == df.shape[1] - nb_dropped
    print("After column dropping, new shape is now {}".format(df_lst_reduced.shape))
    return df_lst_reduced

In [10]:
# Features not kept for modeling (see the Data Understanding notebook summary).
# The grouping comments below only reflect the column names.
cols_to_drop = [
    # scraping metadata, free-text fields and picture links
    'listing_url', 'scrape_id', 'last_scraped', 'experiences_offered',
    'notes', 'transit', 'interaction', 'house_rules',
    'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
    # host_* columns
    'host_id', 'host_name', 'host_about',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_since', 'host_location', 'host_neighbourhood',
    'host_listings_count', 'host_total_listings_count',
    'host_verifications', 'host_url', 'host_thumbnail_url',
    'host_picture_url', 'host_has_profile_pic',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    # address / location columns
    'street', 'neighbourhood', 'neighbourhood_group_cleansed',
    'city', 'state', 'zipcode', 'market', 'smart_location',
    'country_code', 'country', 'latitude', 'longitude',
    # remaining columns
    'property_type', 'square_feet', 'has_availability',
    'calendar_updated', 'calendar_last_scraped',
    'first_review', 'last_review', 'requires_license',
    'is_business_travel_ready', 'require_guest_profile_picture',
    'require_guest_phone_verification',
]
df_lst_reduced = drop_cols(df=df_lst_full, cols_to_drop=cols_to_drop)

After column dropping, new shape is now (64293, 51)


# 2. Features transformation
## Text extraction or drop ?
I said in [Data Understanding](1_Data_Understanding.ipynb) conclusion that depending on the use case, perhaps we could try to extract some keywords from `name`, `summary`, `space`, `description`, `neighborhood_overview`, `access`.  
This is not an NLP problem, and those features contain only free text that is sometimes in French and sometimes in English, so I propose to start with a simple approach and just drop those features as well.

In [11]:
# Free-text features: dropped rather than mined for keywords (simple first approach)
txt_cols_to_drop = [
    'name',
    'summary',
    'space',
    'description',
    'neighborhood_overview',
    'access',
]
df_lst_reduced_notxt = drop_cols(df=df_lst_reduced, cols_to_drop=txt_cols_to_drop)

After column dropping, new shape is now (64293, 45)


## From 't'/'f' to binary 0/1
Here we will deal with the columns that contain the now famous "t/f" categorical nominal values.

In [13]:
# Columns holding 't' / 'f' values that have to become binary
tf_cols = [
    'host_is_superhost',
    'host_identity_verified',
    'is_location_exact',
    'instant_bookable',
]
# One column per call; the frame returned by a call is the input of the next one.
# NOTE(review): re-running this cell feeds already transformed columns back into
# transform_t_f -- confirm the helper tolerates that, or re-run from the previous cell.
for tf_col in tf_cols:
    df_lst_reduced_notxt = cleaning.transform_t_f(df_lst_reduced_notxt, tf_col)
df_lst_reduced_notxt.shape

(64293, 45)

In [14]:
# Coherence control of the 't'/'f' -> 1/0 transformation.
# `df_lst_reduced` was only used as the input of a column drop, so it is expected to
# still hold the raw 't'/'f' values: every 't' must now be a 1 and every 'f' a 0.
# Missing values are counted on neither side.
for checked_col in tf_cols:
    raw_values = df_lst_reduced[checked_col]
    new_values = df_lst_reduced_notxt[checked_col]
    assert (new_values == 1).sum() == (raw_values == 't').sum(), \
        "Number of 1 differs from number of 't' for {}".format(checked_col)
    assert (new_values == 0).sum() == (raw_values == 'f').sum(), \
        "Number of 0 differs from number of 'f' for {}".format(checked_col)

df_lst_reduced_notxt.host_is_superhost.value_counts()

0.0    56276
1.0     7953
Name: host_is_superhost, dtype: int64

We have also the `license` feature to deal with: create a new binary feature which will stand for 'license provided yes/no'.

In [None]:
# TODO

## Handle rows with missing value
* 'bathrooms', 'bedrooms', 'beds': handle missing values (imputation, replace with 0 ?)

## Handle 'currency' columns
* price elements ('price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people'): remove the currency symbol, handle the thousands separator and convert to float

## 1-hot encoding of categorical features
* 'neighbourhood_cleansed': perform 1-hot encoding over the 20 values
* 'room_type': perform 1-hot encoding over the 3 values
* 'bed_type': perform 1-hot encoding over the 5 values. No missing value.
* 'cancellation_policy': perform 1-hot encoding over the 6 values

## Last but not least, specific treatment
* 'amenities': extract useful information and depending on the number of distinct values, perform 1-hot encoding over the values
* 'jurisdiction_names': administrative information about Paris. Try to extract the name then transform as dummies.