# Train Model

In [2]:
from utils import *

In [2]:
# Load the listings CSV through `preprocess` (presumably provided by the star
# import from `utils`). The call yields the modelling frame plus the names of
# the numeric feature columns, the categorical feature columns and the target.
# NOTE(review): the target looks log-transformed (later cells apply np.exp to
# get back to prices) -- confirm inside utils.preprocess.
df, numeric_features, categorical_features, target_var = preprocess("data/listings.csv")

In [3]:
# Preview the first rows to sanity-check the columns returned by preprocess.
df.head()

Unnamed: 0,target,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,instant_bookable,latitude,longitude,accommodates,bedrooms,beds,nbr_bathrooms,shared_bathrooms,host_lives_nbh,nbr_host_verifications,days_since_host,days_since_first_review,days_since_last_review,host_response_time,neighbourhood_group_cleansed,neighbourhood_cleansed,property_type,room_type
0,5.700444,100.0,90.0,0,9.0,0,40.64529,-73.97238,2,1.0,1.0,1.0,1,0,2,5187.0,2543.0,1493.0,hour,Brooklyn,Kensington,Private room in rental unit,Private room
1,5.164786,75.0,23.0,0,6.0,0,40.75356,-73.98559,1,,1.0,1.0,0,1,3,5185.0,4747.0,152.0,one_day,Manhattan,Midtown,Entire rental unit,Entire home/apt
2,4.094345,100.0,100.0,1,2.0,0,40.68535,-73.95512,2,1.0,1.0,0.0,0,1,2,5038.0,4924.0,1084.0,hour,Brooklyn,Bedford-Stuyvesant,Private room in rental unit,Private room
3,6.052089,100.0,19.0,0,7.0,0,40.70309,-73.89963,16,5.0,10.0,2.5,0,1,2,4478.0,3974.0,1104.0,hour,Queens,Ridgewood,Entire townhouse,Entire home/apt
4,5.616771,,33.0,0,1.0,0,40.66265,-73.99454,4,2.0,2.0,1.5,0,0,2,5038.0,3244.0,102.0,,Brooklyn,Sunset Park,Entire rental unit,Entire home/apt


In [4]:
# Count the distinct levels of each categorical column; the totals determine
# how many columns the one-hot encoding below will generate.
df[categorical_features].nunique()

host_response_time                4
neighbourhood_group_cleansed      5
neighbourhood_cleansed          244
property_type                    79
room_type                         4
dtype: int64

# Define train/test split

In [5]:
from sklearn.model_selection import train_test_split

# Positional row indices are split instead of the frame itself, so one
# train/test partition can be applied to several row-aligned tables via .iloc.
n = len(df)
inx_all = np.arange(n)

# Keep the target as a 1-D vector of shape (n,). Selecting with a list of
# columns (df[[target_var]]) yields an (n, 1) column vector instead, which
# broadcasts against 1-D predictions into an (n, n) matrix in arithmetic such
# as `pred - y`.
y = df[target_var].to_numpy()

In [6]:
# Hold out 30% of the rows for testing. Row positions and targets are split
# together, and the fixed random_state keeps the partition repeatable.
inx_train, inx_test, y_train, y_test = train_test_split(
    inx_all,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Work on independent copies of the numeric and categorical column groups so
# the imputation steps that follow never write back into `df`.
df_num, df_cat = (
    df[numeric_features].copy(),
    df[categorical_features].copy(),
)

In [8]:
# Treat a missing response time as its own category so the one-hot encoder
# receives a plain string level instead of NaN. `fillna` is used rather than
# `replace(np.NaN, ...)` because the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
df_cat['host_response_time'] = df_cat['host_response_time'].fillna("missing")

In [9]:
# Mark missing numeric values with the sentinel -1. `fillna` is used rather
# than `replace(np.NaN, ...)` because the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df_num = df_num.fillna(-1)

In [10]:
# Confirm that no numeric column still contains missing values.
df_num.isna().sum()

host_response_rate         0
host_acceptance_rate       0
host_is_superhost          0
host_listings_count        0
instant_bookable           0
latitude                   0
longitude                  0
accommodates               0
bedrooms                   0
beds                       0
nbr_bathrooms              0
shared_bathrooms           0
host_lives_nbh             0
nbr_host_verifications     0
days_since_host            0
days_since_first_review    0
days_since_last_review     0
dtype: int64

In [11]:
# Confirm that no categorical column still contains missing values.
df_cat.isna().sum()

host_response_time              0
neighbourhood_group_cleansed    0
neighbourhood_cleansed          0
property_type                   0
room_type                       0
dtype: int64

# One-Hot Encoding of categorical variables

In [12]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

In [13]:
# Learn the category vocabulary from the training rows only, keeping the test
# rows out of the fit. With handle_unknown="ignore", a category that was not
# seen during fit is encoded by transform as an all-zero group instead of
# raising an error.
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(df_cat.iloc[inx_train])

OneHotEncoder(handle_unknown='ignore')

In [14]:
# Encode both partitions with the encoder fitted on the training rows.
X_train_cat, X_test_cat = (
    ohe.transform(df_cat.iloc[inx_train]),
    ohe.transform(df_cat.iloc[inx_test]),
)

In [15]:
# Select the numeric features for the same row positions as the encoded
# categorical blocks.
X_train_num, X_test_num = (
    df_num.iloc[inx_train],
    df_num.iloc[inx_test],
)

In [16]:
# Join the one-hot block and the numeric block column-wise (categorical columns
# first, numeric columns last). scipy's hstack returns a COO matrix unless told
# otherwise, and COO supports neither indexing nor row slicing; requesting CSR
# keeps the result usable for row-wise inspection and is a layout XGBoost reads
# directly.
X_train = hstack((X_train_cat, X_train_num), format="csr")
X_test = hstack((X_test_cat, X_test_num), format="csr")

# Train model

In [17]:
# Wrap the stacked sparse feature matrices and their targets in XGBoost's
# DMatrix container, the input type consumed by xgb.train.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [18]:
# Datasets whose metrics are reported while boosting; each entry is a
# (DMatrix, display name) pair.
# NOTE(review): 'eval' is the held-out test split itself, so settings tuned
# against it will look optimistic -- consider a separate validation split.
watchlist = [
    (dtrain, 'train'),
    (dtest, 'eval'),
]

In [19]:
def custom_mae(preds, dtrain):
    """Mean absolute error measured after exponentiating predictions and labels.

    Both the predictions and the labels are passed through ``np.exp`` before
    the error is computed, so the reported value lives on the exponentiated
    scale rather than on the scale the model is trained on.

    Args:
        preds: Array-like of model predictions.
        dtrain: Object exposing ``get_label()`` that returns the matching
            labels (an ``xgb.DMatrix`` when invoked by XGBoost).

    Returns:
        Tuple ``("PriceMAE", value)`` holding the metric name and its value.
    """
    predicted_price = np.exp(np.array(preds))
    actual_price = np.exp(np.array(dtrain.get_label()))

    abs_error = np.abs(predicted_price - actual_price)
    return "PriceMAE", np.mean(abs_error)

In [20]:
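# Tuning log -- presumably the final eval PriceMAE for each setting tried
# (lambda is only recorded for the last two runs):
# max_depth=6  -> 64.81
# max_depth=12 -> 62.72
# max_depth=12 -> 63.61164 | lambda=100
# max_depth=15 -> 62.43914 | lambda=50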

In [21]:
# Booster settings. max_depth=15 with lambda=50 is the lowest-error combination
# recorded in the tuning notes of the preceding cell.
param = {
    "max_depth": 15,
    "objective": "reg:squarederror",
    "subsample": 0.6667,
    "colsample_bytree": 0.5,
    "lambda": 50,
    "min_child_weight": 1,
    "tree_method": "hist",
    "learning_rate": 0.05,
    "eval_metric": "mae",
}

num_round = 500

# `custom_metric` supersedes the `feval` argument, which has been deprecated
# since XGBoost 1.6. With a built-in objective and no custom `obj`, the metric
# receives the same predictions either way, so the logged PriceMAE is
# unchanged. The built-in "mae" from `eval_metric` is logged alongside it.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    custom_metric=custom_mae,
    maximize=False,
    verbose_eval=20,  # print the watchlist metrics every 20 boosting rounds
)

[0]	train-mae:4.20262	train-PriceMAE:196.84172	eval-mae:4.18692	eval-PriceMAE:192.84190
[20]	train-mae:1.52212	train-PriceMAE:167.83194	eval-mae:1.51138	eval-PriceMAE:163.98329
[40]	train-mae:0.59660	train-PriceMAE:115.30377	eval-mae:0.59237	eval-PriceMAE:112.14600
[60]	train-mae:0.35655	train-PriceMAE:85.23225	eval-mae:0.36219	eval-PriceMAE:83.81362
[80]	train-mae:0.29729	train-PriceMAE:73.25015	eval-mae:0.31295	eval-PriceMAE:74.13115
[100]	train-mae:0.27159	train-PriceMAE:66.86298	eval-mae:0.29695	eval-PriceMAE:70.12521
[120]	train-mae:0.25613	train-PriceMAE:62.83045	eval-mae:0.28912	eval-PriceMAE:67.99676
[140]	train-mae:0.24467	train-PriceMAE:59.90991	eval-mae:0.28466	eval-PriceMAE:66.76593
[160]	train-mae:0.23403	train-PriceMAE:57.33881	eval-mae:0.28136	eval-PriceMAE:65.87837
[180]	train-mae:0.22561	train-PriceMAE:55.26722	eval-mae:0.27897	eval-PriceMAE:65.25221
[200]	train-mae:0.21876	train-PriceMAE:53.62469	eval-mae:0.27748	eval-PriceMAE:64.86243
[220]	train-mae:0.21229	train-Pr

In [22]:
# Naive baseline: one constant prediction for every listing, namely the median
# of the exponentiated training targets (the same np.exp back-transform that
# custom_mae applies before measuring error).
yhat = np.median(np.exp(y_train))
yhat

130.00003

In [23]:
# Mean absolute error of the constant median prediction on the test rows,
# computed on the exponentiated scale so it is directly comparable with the
# model's eval-PriceMAE.
baseline_mae = np.mean(np.abs(yhat - np.exp(y_test)))
baseline_mae

115.79675