In [4]:
from pathlib import Path
from typing import List, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

plt.style.use('dark_background')

In [5]:
# Load the train / test listings, indexed by the "id" column.
df_train = pd.read_csv('data/train.csv', low_memory=False, index_col="id")

# Most of the RMSE comes from underprediction, so the zero-price rows (low
# outliers) are removed. The explicit .copy() makes the filtered frame an
# independent object, so later cells can add columns to it without pandas
# raising SettingWithCopyWarning.
df_train = df_train[df_train["price"] != 0].copy()
# Alternative, stricter threshold that was tried:
# df_train = df_train[df_train["price"] >= 25]

df_test = pd.read_csv('data/test.csv', low_memory=False, index_col="id")
df_train.shape, df_test.shape

((33522, 64), (17337, 63))

In [6]:
class Transformer(BaseEstimator, TransformerMixin):
    """Feature-engineering step that derives model inputs from raw listing columns.

    ``fit`` learns nothing; every derived column is computed in ``transform``
    from the frame that is passed in.  ``transform`` works on a copy, so the
    caller's frame is left untouched and the engineered columns are only
    present on the returned frame.
    """

    # "t" / "f" flag strings -> 1 / 0 (any other value maps to NaN).
    _FLAG_MAP = {"t": 1, "f": 0}

    # State names and abbreviations searched for in ``host_location``.
    _US_STATES = [
        'Alabama', 'AL', 'Alaska', 'AK', 'Arizona', 'AZ', 'Arkansas', 'AR', 'California', 'CA', 'Colorado', 'CO',
        'Connecticut', 'CT', 'Delaware', 'DE', 'Florida', 'FL', 'Georgia', 'GA', 'Hawaii', 'HI', 'Idaho', 'ID',
        'Illinois', 'IL', 'Indiana', 'IN', 'Iowa', 'IA', 'Kansas', 'KS', 'Kentucky', 'KY', 'Louisiana', 'LA',
        'Maine', 'ME', 'Maryland', 'MD', 'Massachusetts', 'MA', 'Michigan', 'MI', 'Minnesota', 'MN',
        'Mississippi', 'MS', 'Missouri', 'MO', 'Montana', 'MT', 'Nebraska', 'NE', 'Nevada', 'NV',
        'New Hampshire', 'NH', 'New Jersey', 'NJ', 'New Mexico', 'NM', 'New York', 'NY',
        'North Carolina', 'NC', 'North Dakota', 'ND', 'Ohio', 'OH', 'Oklahoma', 'OK', 'Oregon', 'OR',
        'Pennsylvania', 'PA', 'Rhode Island', 'RI', 'South Carolina', 'SC', 'South Dakota', 'SD',
        'Tennessee', 'TN', 'Texas', 'TX', 'Utah', 'UT', 'Vermont', 'VT', 'Virginia', 'VA', 'Washington', 'WA',
        'West Virginia', 'WV', 'Wisconsin', 'WI', 'Wyoming', 'WY',
    ]

    # Output column -> substring looked up in the normalised ``amenities`` text.
    # Matching is by substring, so e.g. "tv" is also true for "cable tv".
    _AMENITY_KEYWORDS = {
        "amenities_elevator": "elevator",
        "amenities_free_street_parking": "free street parking",
        "amenities_washer": "washer",
        "amenities_cable_tv": "cable tv",
        "amenities_essentials": "essentials",
        "amenities_shampoo": "shampoo",
        "amenities_tv": "tv",
        "amenities_wifi": "wifi",
        "amenities_family/kid_friendly": "family/kid friendly",
        "amenities_buzzer/wireless_intercom": "buzzer/wireless intercom",
        "amenities_smoke_detector": "smoke detector",
        "amenities_ac": "air conditioning",
        "amenities_fire_extinguisher": "fire extinguisher",
        "amenities_internet": "internet",
        "amenities_first_aid_kit": "first aid kit",
        "amenities_hangers": "hangers",
        "amenities_hot_water": "hot water",
        "amenities_hair_dryer": "hair dryer",
        "amenities_heating": "heating",
    }

    # Source column -> ordinal encoding; values missing from a map become 0.
    _ORDINAL_MAPS = {
        "bed_type": {"Real Bed": 5, "Futon": 4, "Pull-out Sofa": 3, "Airbed": 2, "Couch": 1},
        "room_type": {"Entire home/apt": 3, "Private room": 2, "Shared room": 1},
        "host_response_time": {
            "within an hour": 1, "within a few hours": 2, "within a day": 3, "a few days or more": 4,
        },
        "cancellation_policy": {
            "flexible": 1, "moderate": 2, "strict_14_with_grace_period": 3,
            "super_strict_30": 4, "super_strict_60": 5, "long_term": 6, "strict": 7,
        },
    }

    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted (and ignored) so the step also works when a
        pipeline is fitted with a target, e.g. ``Pipeline.fit(X, y)``.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the engineered columns appended.

        Columns read: zipcode, extra_people, host_response_rate,
        host_is_superhost, host_has_profile_pic, host_identity_verified,
        first_review, last_review, host_since, host_location, amenities,
        host_neighbourhood, host_verifications, instant_bookable,
        is_business_travel_ready, require_guest_profile_picture,
        require_guest_phone_verification, bed_type, room_type,
        host_response_time, cancellation_policy.

        Raises:
            KeyError: if one of the columns listed above is missing from ``X``.
        """
        # Work on a copy so the caller's frame is not silently widened.
        X = X.copy()

        # Strip every non-digit, keep the first five digits; empty -> NaN.
        X["zipcode_trans"] = pd.to_numeric(
            X["zipcode"]
                .astype(str)
                .str.replace(r"\D", "", regex=True)
                .str.slice(0, 5),
            errors="coerce"
        )
        X["extra_people_trans"] = X["extra_people"].str.replace("$", "", regex=False).astype(float)
        X["host_response_rate_trans"] = X["host_response_rate"].str.replace("%", "", regex=False).astype(float)

        for flag in ("host_is_superhost", "host_has_profile_pic", "host_identity_verified"):
            X[f"{flag}_trans"] = X[flag].map(self._FLAG_MAP)

        # Dates: drop the dashes and store the remaining digits as a float.
        # Each output is built from its own source column (last_review_trans
        # used to be computed from first_review by mistake).
        for date_col in ("first_review", "last_review", "host_since"):
            X[f"{date_col}_trans"] = X[date_col].str.replace("-", "", regex=False).astype(float)

        # Host location with spaces removed, split on commas. Missing values
        # stay NaN and stringify to "nan", which matches none of the tokens.
        host_loc_strings = X["host_location"].str.replace(" ", "", regex=False).str.split(",")

        def mentions(*needles):
            # True where any needle is a substring of the stringified parts.
            return host_loc_strings.apply(
                lambda parts: any(needle in str(parts) for needle in needles)
            )

        X["host_in_us"] = mentions("US", "UnitedStates", "USA")

        pattern = '|'.join(self._US_STATES)
        X["host_location_state"] = X['host_location'].str.extract('(' + pattern + ')', expand=False)

        X["host_in_cal"] = mentions("California", "CA")
        X["host_in_ny"] = mentions("NewYork", "NY")
        X["host_in_nj"] = mentions("NewJersey", "NJ")

        # Normalise the amenities text once (lower-case, braces and double
        # quotes removed) and reuse it for every keyword lookup.
        amenities_text = (
            X["amenities"]
                .str.lower()
                .str.strip("{}")
                .str.replace('"', "", regex=False)
        )
        for column, keyword in self._AMENITY_KEYWORDS.items():
            X[column] = amenities_text.str.contains(keyword, regex=False)

        # Blank out neighbourhoods that occur exactly once. The counts come
        # from the frame being transformed, not from a previous fit.
        neighbourhood_counts = X.groupby("host_neighbourhood")["host_neighbourhood"].transform("count")
        X["host_neighbourhood_trans"] = X["host_neighbourhood"].mask(neighbourhood_counts == 1)

        # Read the verifications from X itself. Reading them from the global
        # training frame misaligned the values whenever another frame (e.g.
        # the test set) was transformed.
        verifications_text = X["host_verifications"]
        for char in "[]' ":
            verifications_text = verifications_text.str.replace(char, "", regex=False)
        X["host_gov_id"] = verifications_text.str.contains("government_id", regex=False, na=False)

        for flag in (
            "instant_bookable",
            "is_business_travel_ready",
            "require_guest_profile_picture",
            "require_guest_phone_verification",
        ):
            X[f"{flag}_trans"] = X[flag].map(self._FLAG_MAP)

        for source_col, ordinal_map in self._ORDINAL_MAPS.items():
            X[f"{source_col}_trans"] = X[source_col].map(ordinal_map).fillna(0)

        return X

# Single-step pipeline that wraps the feature-engineering transformer.
preprocessor = Pipeline(steps=[("trans", Transformer())])

In [7]:
# Candidate model inputs, in the column order handed to the model.
# Names mentioned only in comments were considered but are not used.
features = [
    # listing location
    "neighbourhood_cleansed",
    "neighbourhood_group_cleansed",
    "zipcode_trans",
    # size and layout
    "bathrooms",
    "bedrooms",
    "beds",
    "room_type",
    "bed_type",
    "guests_included",
    "extra_people_trans",
    "minimum_nights",
    "maximum_nights",
    "accommodates",
    "property_type",
    "city",
    "state",
    "market",
    # host (raw host_id, host_name, host_since, host_location, host_about are not used)
    "host_since_trans",
    "host_location_state",
    "host_in_us",
    "host_in_cal",
    "host_in_ny",
    "host_in_nj",
    "host_response_time",
    "host_response_rate_trans",
    "host_acceptance_rate",
    "host_is_superhost_trans",
    "host_listings_count",
    # raw host_verifications would need separate one-hot processing
    "host_gov_id",
    # derived from host_neighbourhood
    "host_neighbourhood_trans",
    "host_has_profile_pic_trans",
    "host_identity_verified_trans",
    # amenities
    "amenities_elevator",
    "amenities_free_street_parking",
    "amenities_washer",
    "amenities_cable_tv",
    "amenities_essentials",
    "amenities_shampoo",
    "amenities_tv",
    "amenities_wifi",
    # not used: amenities_family/kid_friendly, amenities_buzzer/wireless_intercom,
    # amenities_smoke_detector, amenities_ac, amenities_fire_extinguisher,
    # amenities_internet
    "amenities_first_aid_kit",  # why does this work?
    # not used: amenities_hangers, amenities_hot_water, amenities_hair_dryer,
    # amenities_heating
    # reviews (number_of_reviews is not used)
    "reviews_per_month",
    "first_review_trans",
    "last_review_trans",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    # booking rules
    "instant_bookable_trans",
    "is_business_travel_ready_trans",
    "cancellation_policy",
    "require_guest_profile_picture_trans",
    "require_guest_phone_verification_trans",
    "calculated_host_listings_count",
]

# Columns removed from both lists below.
# Considered but kept in the model: beds, host_in_us, host_in_ny, host_gov_id,
# host_has_profile_pic (too generic), host_response_time (doesn't seem to mean much).
trash_features = [
    "neighbourhood_group_cleansed",
    "bed_type",
    "city",
    "state",
    "market",
    "host_in_cal",
    "host_in_nj",
    "host_location_state",
    "host_acceptance_rate",  # all null
    "is_business_travel_ready_trans",  # all same value
]

# Columns the model is told to treat as categorical.
categorical_features = [
    "neighbourhood_cleansed",
    "neighbourhood_group_cleansed",
    "room_type",
    "bed_type",
    "property_type",
    "host_location_state",
    "host_neighbourhood_trans",
    "city",
    "state",
    "market",
    "host_response_time",
    "cancellation_policy",
]

# Drop the trash columns while keeping the original ordering.
features = [name for name in features if name not in trash_features]
categorical_features = [name for name in categorical_features if name not in trash_features]

# target = "price"
target = "log_price"

df_train["log_price"] = np.log(df_train["price"])

# fit_transform is used for both frames; the transformer's fit does not learn
# anything from the data it is given.
dfs_train = preprocessor.fit_transform(df_train)[features + [target]]
dfs_test = preprocessor.fit_transform(df_test)[features]
dfs_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33522 entries, 22267382 to 17414910
Data columns (total 48 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   neighbourhood_cleansed                  33522 non-null  object 
 1   zipcode_trans                           33034 non-null  float64
 2   bathrooms                               33462 non-null  float64
 3   bedrooms                                33489 non-null  float64
 4   beds                                    33491 non-null  float64
 5   room_type                               33522 non-null  object 
 6   guests_included                         33522 non-null  int64  
 7   extra_people_trans                      33522 non-null  float64
 8   minimum_nights                          33522 non-null  int64  
 9   maximum_nights                          33522 non-null  int64  
 10  accommodates                            33522 non-nul

In [8]:
# Model inputs plus both target variants (log price and raw price).
y_log = dfs_train[target]
X = dfs_train.drop(columns=target)
y = df_train["price"]

In [9]:
# Histogram gradient boosting model. It is fitted on the raw price `y`;
# `y_log` is not used here.
ensemble = HistGradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.1,
    max_iter=150,
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.1,
    max_features=1.0,
    max_bins=255,
    categorical_features=categorical_features,
    monotonic_cst=None,  # very interesting, worth exploring
    interaction_cst=None,  # also worth looking into
    validation_fraction=0.1,
    early_stopping=True,
    # verbose=1,
    random_state=42,  # 23 was also tried
).fit(X, y)

# Report how many boosting iterations were run.
print(ensemble.n_iter_)

134


In [10]:
# Write the test-set predictions as a two-column CSV ("Id", "Predicted").
out_path = Path("out/preds.csv")
# to_csv does not create missing folders, so make sure "out/" exists first;
# otherwise the export fails on a fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)

out_df = pd.DataFrame({
    "Id": df_test.index,
    "Predicted": ensemble.predict(dfs_test),
})
out_df.to_csv(out_path, index=False)