# Code Clean-up

In [2]:
import ast
import pandas as pd
import numpy as np 
import xgboost as xgb

import seaborn as sns
import matplotlib.pyplot as plt

# Raise the column display limit to 1000 so wide frames are shown without
# columns being elided.
pd.set_option('display.max_columns', 1000)

In [3]:
def test_shared_bath(x):
    if "shared" in str(x):
        return 1
    else:
        return 0
    
def get_nbr_bathrooms(x):
    """Extract the number of bathrooms from a bathroom description.

    Parameters
    ----------
    x : str or missing value
        Text such as "1 bath", "1.5 shared baths" or "Half-bath", or a
        missing value (``None`` / NaN).

    Returns
    -------
    int or float
        0 for a missing value, 0.5 when the text mentions "half-bath"
        (any letter case), otherwise the leading space-separated token
        converted to ``float``.

    Raises
    ------
    ValueError
        If the leading token of a non-missing description is not numeric.
    """
    # pd.isna catches None and every NaN object. The previous identity test
    # against np.NaN only matched the singleton, and that alias no longer
    # exists in NumPy 2.0.
    if pd.isna(x):
        return 0
    if "half-bath" in x.lower():
        return 0.5
    return float(x.split(" ")[0])
    
def count_verification_methods(x):
    """Count the entries in a stringified list of verification methods.

    Parameters
    ----------
    x : str or missing value
        Python-literal text such as "['email', 'phone']" or "None", or a
        missing value (``None`` / float NaN).

    Returns
    -------
    int
        Number of elements in the parsed literal; 0 when the input is
        missing or the literal evaluates to ``None``.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a string that is not a valid Python literal.
    """
    # A missing cell arrives as None or float NaN (NaN != NaN); passing it to
    # ast.literal_eval would raise, so treat it as "no verifications".
    if x is None or (isinstance(x, float) and x != x):
        return 0
    parsed = ast.literal_eval(x)
    if parsed is None:
        return 0
    return len(parsed)

In [4]:
def preprocess(input_file="data/listings.csv", anchor_date="2022-11-20"):
    """Read the listings CSV and build the modelling frame.

    Steps: parse the price, drop zero-price rows, add the log-price target,
    derive numeric / categorical / date features, and keep only the target
    and feature columns.

    Parameters
    ----------
    input_file : str
        Path of the listings CSV to read.
    anchor_date : str or datetime-like
        Reference date from which the ``days_since_*`` features are
        measured. Defaults to the date that was previously hard-coded.

    Returns
    -------
    tuple
        ``(df, numeric_features, categorical_features, target_variable)``:
        the reduced DataFrame, the two lists of feature column names, and
        the name of the target column.
    """
    # read data
    df = pd.read_csv(input_file, low_memory=False)

    # anchor date
    todays_date = pd.to_datetime(anchor_date)

    # clean price variable: strip "$" and "," before casting.
    # Raw string + character class avoids the invalid "\$" escape sequence.
    df['price'] = df.price.str.replace(r"[$,]", "", regex=True).astype("float32")

    # remove price = 0 rows; copy so the column assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning)
    df = df[df.price != 0].copy()

    # log transform of the target
    df['target'] = np.log(df.price)

    # Feature engineering - numeric variables
    # "%" is a literal character here, so state regex=False explicitly rather
    # than relying on the pandas-version-dependent default.
    df['host_response_rate'] = (
        df.host_response_rate.str.replace("%", "", regex=False).astype("float32")
    )
    df['host_acceptance_rate'] = (
        df.host_acceptance_rate.str.replace("%", "", regex=False).astype("float32")
    )

    lkp_boolean = {"t": 1, "f": 0}
    df['host_is_superhost'] = df.host_is_superhost.map(lkp_boolean)
    df['instant_bookable'] = df.instant_bookable.map(lkp_boolean)
    df['shared_bathrooms'] = df.bathrooms_text.apply(test_shared_bath)
    df['nbr_bathrooms'] = df.bathrooms_text.apply(get_nbr_bathrooms)
    # 1 when the host's neighbourhood equals the listing's neighbourhood
    df['host_lives_nbh'] = (df.host_neighbourhood == df.neighbourhood_cleansed).astype("int8")

    # Feature engineering - categorical variables
    # values not present in the lookup become NaN after .map
    host_response_time_lkp = {'within an hour': "hour",
                              'within a day': "one_day",
                              'within a few hours': "few_hours",
                              'a few days or more': "few_days"}
    df['host_response_time'] = df.host_response_time.map(host_response_time_lkp)

    df['nbr_host_verifications'] = df.host_verifications.apply(count_verification_methods)

    # handle date features: whole days elapsed between each date and the anchor
    df['days_since_host'] = (todays_date - pd.to_datetime(df.host_since)).dt.days
    df['days_since_first_review'] = (todays_date - pd.to_datetime(df.first_review)).dt.days
    df['days_since_last_review'] = (todays_date - pd.to_datetime(df.last_review)).dt.days

    # Feature list
    numeric_features = ["host_response_rate", "host_acceptance_rate", "host_is_superhost",
                        "host_listings_count", "instant_bookable",
                        "latitude", "longitude", "accommodates", "bedrooms", "beds",
                        "nbr_bathrooms", "shared_bathrooms", "host_lives_nbh",
                        "nbr_host_verifications", "days_since_host",
                        "days_since_first_review", "days_since_last_review"]

    categorical_features = ["host_response_time", "neighbourhood_group_cleansed",
                            "neighbourhood_cleansed", "property_type", "room_type"]

    target_variable = 'target'
    # keep only the modelling columns, target first
    all_columns = [target_variable] + numeric_features + categorical_features
    df = df[all_columns]

    return df, numeric_features, categorical_features, target_variable

In [5]:
# Run the preprocessing pipeline on the listings file; it returns the reduced
# frame, the numeric and categorical feature-name lists, and the target name.
df, numeric_features, categorical_features, target_var = preprocess(input_file="data/listings.csv")

In [6]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,target,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,instant_bookable,latitude,longitude,accommodates,bedrooms,beds,nbr_bathrooms,shared_bathrooms,host_lives_nbh,nbr_host_verifications,days_since_host,days_since_first_review,days_since_last_review,host_response_time,neighbourhood_group_cleansed,neighbourhood_cleansed,property_type,room_type
0,5.700444,100.0,90.0,0,9.0,0,40.64529,-73.97238,2,1.0,1.0,1.0,1,0,2,5187.0,2543.0,1493.0,hour,Brooklyn,Kensington,Private room in rental unit,Private room
1,5.164786,75.0,23.0,0,6.0,0,40.75356,-73.98559,1,,1.0,1.0,0,1,3,5185.0,4747.0,152.0,one_day,Manhattan,Midtown,Entire rental unit,Entire home/apt
2,4.094345,100.0,100.0,1,2.0,0,40.68535,-73.95512,2,1.0,1.0,0.0,0,1,2,5038.0,4924.0,1084.0,hour,Brooklyn,Bedford-Stuyvesant,Private room in rental unit,Private room
3,6.052089,100.0,19.0,0,7.0,0,40.70309,-73.89963,16,5.0,10.0,2.5,0,1,2,4478.0,3974.0,1104.0,hour,Queens,Ridgewood,Entire townhouse,Entire home/apt
4,5.616771,,33.0,0,1.0,0,40.66265,-73.99454,4,2.0,2.0,1.5,0,0,2,5038.0,3244.0,102.0,,Brooklyn,Sunset Park,Entire rental unit,Entire home/apt


In [7]:
# Show per-column non-null counts and dtypes of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39851 entries, 0 to 39880
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   target                        39851 non-null  float32
 1   host_response_rate            26314 non-null  float32
 2   host_acceptance_rate          27999 non-null  float32
 3   host_is_superhost             39851 non-null  int64  
 4   host_listings_count           39801 non-null  float64
 5   instant_bookable              39851 non-null  int64  
 6   latitude                      39851 non-null  float64
 7   longitude                     39851 non-null  float64
 8   accommodates                  39851 non-null  int64  
 9   bedrooms                      36098 non-null  float64
 10  beds                          38997 non-null  float64
 11  nbr_bathrooms                 39851 non-null  float64
 12  shared_bathrooms              39851 non-null  int64  
 13  h