# Appendix B

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

In [None]:
# Load the raw train and test CSVs; the three date-like columns are parsed as datetimes.
date_columns = ['host_since', 'first_review', 'last_review']
raw_train = pd.read_csv("data/train.csv", parse_dates=date_columns)
raw_test = pd.read_csv("data/test.csv", parse_dates=date_columns)

## Model 1 Feature Engineering
### Training Dataset

In [None]:
# Columns copied into the feature matrix as-is (no transformation applied here).
good_to_go_columns = [
    'host_total_listings_count', 'calculated_host_listings_count',
    'accommodates',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'minimum_nights', 'maximum_nights',
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
]
good_to_go_train = raw_train[good_to_go_columns].copy()

# Boolean flags, converted to 0/1 integer indicator columns.
bool_columns = ['host_is_superhost', "host_has_profile_pic", 'host_identity_verified', 'has_availability', 'instant_bookable']
bools_train = raw_train[bool_columns].copy()

# Only these two flags are imputed: a missing value is treated as False.
for flag in ('host_is_superhost', 'has_availability'):
    bools_train.loc[bools_train[flag].isna(), flag] = False

bools_train = bools_train.astype(int)

# Location features: the raw (longitude, latitude) pair plus a rotated copy of it.
# The rotation is intended to make Manhattan run vertically, so that tree-based
# models can split roughly along streets and avenues.
coords_train = raw_train[['longitude', 'latitude']].copy()

# Standard 2-D rotation matrix for an angle of 32 degrees.
theta = np.radians(32)
cos_theta, sin_theta = np.cos(theta), np.sin(theta)
rotation_matrix = np.array([
    [cos_theta, -sin_theta],
    [sin_theta, cos_theta],
])

# Each row of `coords` is one (longitude, latitude) point; multiplying by the
# transposed matrix rotates every row at once.
coords = coords_train[['longitude', 'latitude']].values
rotated_coords = np.matmul(coords, rotation_matrix.T)
coords_train['Rotated Longitude'] = rotated_coords[:, 0]
coords_train['Rotate Latitude'] = rotated_coords[:, 1]

# Date feature: whole days between each row's `host_since` and the most recent
# `host_since` value found in the training data.
latest_host_since = raw_train["host_since"].max()
dates_train = (latest_host_since - raw_train["host_since"]).dt.days

# One-hot encode the categorical variables. A NaN indicator is generated for each
# of the three columns, but only the "Response Time" one is kept; the other two
# NaN indicators are dropped.
categorical_train = raw_train[['neighbourhood_group_cleansed', 'host_response_time', 'room_type']].copy()
categorical_train = pd.get_dummies(
    categorical_train,
    prefix=["Neighborhood Group", "Response Time", "Room Type"],
    prefix_sep=" : ",
    dummy_na=True,
    drop_first=False,
)
categorical_train = categorical_train.drop(columns=["Neighborhood Group : nan", "Room Type : nan"])
categorical_train = categorical_train.astype(int)

# Create indicators for amenities that appear at least AMENITY_MIN_COUNT times in the
# training set (maybe add more for more complex models).
AMENITY_MIN_COUNT = 500

# Turn each raw amenities string into a list of names by stripping the brackets and
# double quotes and splitting on ", ".
# NOTE(review): an empty value such as '[]' parses to [''] here, so it is counted as
# one (blank) amenity below -- confirm whether that is intended.
amenities_as_lists = raw_train['amenities'].apply(
    lambda x: x.replace('[', '').replace(']', '').replace('"', '').split(', ')
)

# Tally how many listings mention each amenity. A plain dict keeps first-seen order,
# which fixes the order of the indicator columns created below.
# The loop variables deliberately avoid the name `list`: rebinding the builtin at
# notebook scope would break every later `list(...)` call in the kernel.
unique_amenities = {}
for listing_amenities in amenities_as_lists:
    for amenity_name in listing_amenities:
        unique_amenities[amenity_name] = unique_amenities.get(amenity_name, 0) + 1

amenitities_to_dummy = [
    amenity for amenity, count in unique_amenities.items() if count >= AMENITY_MIN_COUNT
]

# One 0/1 column per retained amenity, plus the number of parsed amenities per listing.
amenity_train = pd.DataFrame({
    f'Amenity : {amenity}': amenities_as_lists.apply(lambda x: amenity in x)
    for amenity in amenitities_to_dummy
}).astype(int)
amenity_train["Total Amenities"] = amenities_as_lists.apply(len)

# Features that need imputation or other wrangling (imputation should be revisited
# for more complex models).
wrangle_train = raw_train[["host_response_rate", "host_acceptance_rate", "bathrooms", "bedrooms", "beds"]].copy()

# Missing host rates get -1 so that "missing" is its own value; missing
# bathroom/bedroom/bed counts are set to 1.
missing_value_fill = {
    "host_response_rate": -1,
    "host_acceptance_rate": -1,
    "bathrooms": 1,
    "bedrooms": 1,
    "beds": 1,
}
for column, fill_value in missing_value_fill.items():
    wrangle_train.loc[wrangle_train[column].isna(), column] = fill_value

# 1 when the bathroom description mentions "shared" (case-insensitive), else 0.
wrangle_train["Shared Baths"] = raw_train["bathrooms_text"].apply(
    lambda text: "shared" in str(text).lower()
).astype(int)

# Share of the host's calculated listing count that falls in each room category.
host_listing_total = raw_train['calculated_host_listings_count']
proportion_sources = {
    "Calculated Host Proportion : Entire Homes/Apts": 'calculated_host_listings_count_entire_homes',
    "Calculated Host Proportion : Private Rooms": 'calculated_host_listings_count_private_rooms',
    "Calculated Host Proportion : Shared Rooms": 'calculated_host_listings_count_shared_rooms',
}
for proportion_column, source_column in proportion_sources.items():
    wrangle_train[proportion_column] = raw_train[source_column] / host_listing_total

# Assemble the final Model 1 training frame by placing all feature groups side by side.
feature_blocks = [
    good_to_go_train,
    bools_train,
    coords_train,
    dates_train,
    categorical_train,
    amenity_train,
    wrangle_train,
]
model1_train = pd.concat(feature_blocks, axis=1)