First, we want to download the data and split it into a train and validation split. Before I do the splitting, I will remove any rows with missing data. 

In [2]:
import os
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer

train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))

for col in train_data.columns: 
    print(col, train_data[col].isna().sum())
    
for col in test_data.columns: 
    print(col, test_data[col].isna().sum())

id 0
scrape_id 0
last_scraped 0
name 0
description 225
picture_url 0
host_id 0
host_name 0
host_since 0
host_is_superhost 428
host_listings_count 0
host_total_listings_count 0
host_verifications 0
host_has_profile_pic 0
host_identity_verified 0
neighbourhood_cleansed 0
neighbourhood_group_cleansed 0
latitude 0
longitude 0
property_type 0
room_type 0
accommodates 0
bathrooms_text 8
beds 119
amenities 0
price 0
minimum_nights 0
maximum_nights 0
minimum_minimum_nights 0
maximum_minimum_nights 0
minimum_maximum_nights 0
maximum_maximum_nights 0
minimum_nights_avg_ntm 0
maximum_nights_avg_ntm 0
has_availability 0
availability_30 0
availability_60 0
availability_90 0
availability_365 0
calendar_last_scraped 0
number_of_reviews 0
number_of_reviews_ltm 0
number_of_reviews_l30d 0
instant_bookable 0
calculated_host_listings_count 0
calculated_host_listings_count_entire_homes 0
calculated_host_listings_count_private_rooms 0
calculated_host_listings_count_shared_rooms 0
id 0
scrape_id 0
last_scrap

In [3]:
# drop all nan values from train set before splitting, there are no nan values in test set 
train_data.dropna(inplace=True)

X, y = train_data.drop(["price"], axis=1), train_data["price"]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)
X_test = test_data

# Assert that we're looking at same feature for train and test datasets 
assert X_train.columns.all() == X_test.columns.all()

# Display shapes 
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape)

(11740, 47) (11740,)
(2935, 47) (2935,)
(6291, 47)


We would like to preprocess some features. To prevent data leakage between the train and validation splits, this is a series of preprocessing steps that is based only on the training data. 
1. Delete the `id` since that is just an identification number for each row, `scrape_id` since it is the same for all rows, `last_scraped` (and equivalently, `calendar_last_scraped`) since it is also the same. 
2. One hot encode all the True (t)/False (f) values into $0$ or $1$ to make them numeric. 
3. Convert the values of `bathrooms_text` to floating point values (`3.5 baths -> 3.5`)
4. Delete `host_name` since `host_id` contains all relevant information and more (since there can be multiple hosts with the same name, but only one with the same id)
5. Take `host_since` and count the number of days until `last_scraped`. 
6. Take `host_verifications` and `amenities` and one-hot encode them. 
7. (?) `neighbourhood_cleansed` and `neighbourhood_group_cleansed` 
8. (?) `property_type` and `room_type` 
9. Take `description` and compute readability scores on it. 
10. Take `picture_url`, train a sentiment classifier CNN. 

In [36]:
def contains(x): 
    return int("shared" in str(x).lower())

def numBath(x): 
    if "half" in x.lower(): 
        return 0.5 
    else: 
        return float(x.split()[0]) 
    
def wordCount(x): 
    tokenizer = RegexpTokenizer(r'\w+') 
    tokens = tokenizer.tokenize(x)
    return len(tokens) 
    

def preprocess(df:pd.DataFrame): 
    
    df['host_since'] = pd.to_datetime(df['host_since'])
    df["last_scraped"] = pd.to_datetime(df['last_scraped'])
    
    df["num_days_hosted"] = (df["last_scraped"] - df['host_since']).dt.days
    
    df = df.drop(["id", "scrape_id", "last_scraped", "calendar_last_scraped", "host_since"], axis=1)
    
    df.loc[df["host_is_superhost"] == "f", "host_is_superhost"] = -1 
    df.loc[df["host_is_superhost"] == "t", "host_is_superhost"] = 1 
    df["host_is_superhost"] = df["host_is_superhost"].fillna(0) 

    for col in ["host_has_profile_pic", "host_identity_verified", "has_availability", "instant_bookable"]: 
        df.loc[df[col] == "f", col] = 0
        df.loc[df[col] == "t", col] = 1
    
    df["shared_bath"] = df["bathrooms_text"].apply(contains)
    df["num_baths"] = df["bathrooms_text"].apply(numBath)
    df = df.drop(["bathrooms_text"], axis=1)
    
    df = df.drop(["host_name"], axis=1)
    
    # one hot encode 
    dummies = df['host_verifications'].apply(lambda x: pd.Series({veri: 1 for veri in ast.literal_eval(x)}))
    dummies = dummies.fillna(0).astype(int)
    df = df.join(dummies)
    
    # dummies = df["amenities"].apply(lambda x : pd.Series({amen: 1 for amen in ast.literal_eval(x)}))
    # dummies = dummies.fillna(0).astype(int)
    # df = df.join(dummies)
    df["amenities"] = df["amenities"].apply(lambda x : len(ast.literal_eval(x))) 
    df = df.drop(["host_verifications", "amenities"], axis=1)
    
    # property_type and room_type 
    
    dummies = pd.get_dummies(df["room_type"]).astype(int)
    df = df.join(dummies) 
    df = df.drop(["property_type", "room_type"], axis=1)
    
    # description 
    df["description"] = df["description"].apply(wordCount)
    
    # image 
    df = df.drop(["picture_url", "name", "neighbourhood_cleansed", "neighbourhood_group_cleansed"], axis=1)
    
    return df.to_numpy().astype(float)

In [38]:
x = preprocess(X_train)

In [39]:
x.iloc[1]

description                                           167
host_id                                           1376881
host_is_superhost                                       1
host_listings_count                                  13.0
host_total_listings_count                            21.0
host_has_profile_pic                                    1
host_identity_verified                                  1
latitude                                           34.184
longitude                                      -118.37728
accommodates                                            4
beds                                                  2.0
minimum_nights                                         31
maximum_nights                                        300
minimum_minimum_nights                                 31
maximum_minimum_nights                                 31
minimum_maximum_nights                                300
maximum_maximum_nights                                300
minimum_nights

In [41]:
x.to_numpy().astype(float)

array([[ 1.50000000e+01,  2.92884962e+08, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.67000000e+02,  1.37688100e+06,  1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.76000000e+02,  1.19514030e+07, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 8.00000000e+01,  4.68884755e+08, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.20000000e+01,  2.26891300e+07, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.88000000e+02,  5.42613350e+07, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

Unnamed: 0,property_type,room_type
14365,Entire condo,Entire home/apt
14345,Entire home,Entire home/apt
12836,Entire home,Entire home/apt
3739,Entire home,Entire home/apt
9270,Entire villa,Entire home/apt
804,Entire home,Entire home/apt
9365,Entire home,Entire home/apt
8036,Entire home,Entire home/apt
10252,Entire home,Entire home/apt
11251,Private room in rental unit,Private room


In [18]:
set([k for k in x["room_type"]])

{'Entire home/apt', 'Hotel room', 'Private room', 'Shared room'}

In [24]:
res = {} 

for k in x["property_type"]: 
    if k not in res: 
        res[k] = 0 
    res[k] += 1 

res

{'Entire condo': 609,
 'Entire home': 4427,
 'Entire villa': 335,
 'Private room in rental unit': 318,
 'Entire rental unit': 2858,
 'Entire guesthouse': 498,
 'Private room in home': 739,
 'Entire townhouse': 280,
 'Room in hotel': 168,
 'Entire guest suite': 233,
 'Entire cottage': 53,
 'Room in boutique hotel': 92,
 'Tiny home': 24,
 'Entire bungalow': 176,
 'Room in aparthotel': 25,
 'Private room in guest suite': 33,
 'Private room in treehouse': 1,
 'Private room in condo': 89,
 'Shared room in rental unit': 24,
 'Private room in camper/rv': 3,
 'Shared room in hotel': 3,
 'Castle': 4,
 'Private room in townhouse': 67,
 'Entire serviced apartment': 210,
 'Private room in bungalow': 26,
 'Entire loft': 99,
 'Private room in bed and breakfast': 36,
 'Private room in hostel': 9,
 'Entire vacation home': 44,
 'Private room in loft': 9,
 'Private room in nature lodge': 3,
 'Shared room in home': 33,
 'Camper/RV': 35,
 'Shared room in camper/rv': 6,
 'Private room in casa particular': 

In [26]:
res = {} 

for k in X_test["room_type"]: 
    if k not in res: 
        res[k] = 0 
    res[k] += 1 

res

{'Entire home/apt': 5308,
 'Private room': 924,
 'Shared room': 49,
 'Hotel room': 10}