# Kinetica Machine Learning: Predict Airbnb Listing Prices in Real Time

In [21]:
import os

import pandas as pd
import seaborn as sns

from connector.kcudf import GPUdb
from connector.kmaputil import KMap
# Display settings: show up to 100 columns and allow very wide (10000-char) output.
pd.options.display.max_columns = 100
pd.set_option('display.width', 10000)

# Init Connections
# Host, port and credentials are read from environment variables so that real
# values never have to be typed into (and saved with) the notebook. The original
# placeholder values remain as fallbacks when a variable is not set.
TABLE = 'airbnb_historical_listings'
IPADDRESS = os.environ.get('KINETICA_IPADDRESS', '<ipaddress>')
PORT = os.environ.get('KINETICA_PORT', '9191')
USER = os.environ.get('KINETICA_USER', '<username>')
PASSWORD = os.environ.get('KINETICA_PASSWORD', '<password>')

# Server-Side WMS in Your Notebooks

In [22]:
# Init Connection to DB, using the host/port defined in the configuration cell
obj = GPUdb(f'http://{IPADDRESS}:{PORT}')

# Sample SQL query -> table to df (loads every row and column of TABLE)
df = obj.to_df(f'SELECT * FROM {TABLE};')

# Render the table as a WMS heatmap widget keyed on longitude/latitude.
# NOTE(review): the name `map` shadows Python's built-in map() for the rest of
# the kernel session; it is kept unchanged here because later cells may refer
# to it. Consider renaming once all usages are known.
# NOTE(review): port 8080 and the /gadmin path are hard-coded, unlike PORT above.
map = KMap(url=f'http://{IPADDRESS}:8080/gadmin',username=USER,password=PASSWORD)
map.renderWMS({
    'layers': TABLE,  # reuse TABLE so the map always shows the table loaded into df
    'style': 'heatmap',
    'x_attr': 'longitude',
    'y_attr': 'latitude',
    'size': 2,       # Point/Blue/Line size  (NOTE(review): "Blue" may be a typo for "Blur" -- confirm)
    'height': 350,   # Map Widget Height
    'width': 940     # Map Widget Width
})

# Model Selection and Hyper-Parameter Tuning using GridSearchCV

In [20]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Candidate values for each XGBoost hyper-parameter explored by the search
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

# Base estimator, created with the library's default settings
xb = XGBRegressor()

# Build the search object: every combination in `params`, 3-fold CV,
# n_jobs=-1, verbose progress output. The search is only constructed here;
# nothing is fitted in this cell.
grid_search = GridSearchCV(
    estimator=xb,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Model Training + Validation + Serialization to Disk (Pickle)

In [29]:
import pickle
from xgboost import XGBRegressor as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Train an XGBoost regressor on a fixed subset of the listing columns:
#   1. split the data into train/test sets,
#   2. standard-scale the features (scaler fitted on the training split only),
#   3. fit the model,
#   4. pickle the fitted scaler and the fitted model to disk,
#   5. print the test-set predictions as a sanity check.

FILENAME='model'   # file the pickled model is written to
SCALE='scaler'     # file the pickled scaler is written to
SEED=4             # one seed for both the split and the model so re-runs are reproducible

# Create df for model training
y = df[['price']]
x = df[['accommodates','bedrooms','bathrooms','cleaning_fee','distance','size']]

# Create training/test set for training (80/20 split, seeded)
sc = StandardScaler()
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=SEED)
x_train=sc.fit_transform(x_train)

# Open the output file only at the moment of writing, and let `with` close it,
# so a failure earlier in the cell cannot leave an open handle or an empty file.
with open(SCALE, 'wb') as SCALER:
    pickle.dump(sc, SCALER)

# Apply the training-set scaling to the held-out rows
x_test=sc.transform(x_test)

# Xgboost parameters hard coded after grid-search cross validation
# NOTE(review): gamma=0.2, max_depth=6 and colsample_bytree=0.7 are not among the
# values listed in the grid-search cell's `params` -- confirm where they came from.
booster=xgb(n_estimators=200,random_state=SEED,gamma=0.2,max_depth=6,learning_rate=0.1,
            colsample_bytree=0.7)

# Fit the model and serialize it to disk
booster.fit(x_train,y_train)
with open(FILENAME, 'wb') as OUTFILE:
    pickle.dump(booster,OUTFILE)

# Validate model is functioning: print each test-set prediction
# (no error metric is computed here)
y_preds = booster.predict(x_test)
for i in y_preds:
    print("[INFO] $", round(i, 2))

[INFO] $ 38.63
[INFO] $ 21.64
[INFO] $ 58.37
[INFO] $ 47.22
[INFO] $ 79.41
[INFO] $ 42.41
[INFO] $ 36.96
[INFO] $ 62.94
[INFO] $ 36.89
[INFO] $ 57.02
[INFO] $ 118.87
[INFO] $ 32.96
[INFO] $ 62.69
[INFO] $ 40.59
[INFO] $ 74.06
[INFO] $ 55.63
[INFO] $ 34.84
[INFO] $ 40.31
[INFO] $ 28.57
[INFO] $ 38.6
[INFO] $ 33.85
[INFO] $ 58.72
[INFO] $ 40.24
[INFO] $ 40.29
[INFO] $ 34.01
[INFO] $ 65.15
[INFO] $ 70.96
[INFO] $ 27.67
[INFO] $ 28.11
[INFO] $ 37.33
[INFO] $ 48.14
[INFO] $ 46.58
[INFO] $ 32.17
[INFO] $ 61.83
[INFO] $ 50.58
[INFO] $ 42.07
[INFO] $ 28.03
[INFO] $ 37.88
[INFO] $ 171.01
[INFO] $ 68.39
[INFO] $ 79.1
[INFO] $ 26.44
[INFO] $ 37.33
[INFO] $ 76.06
[INFO] $ 48.9
[INFO] $ 24.8
[INFO] $ 55.75
[INFO] $ 34.85
[INFO] $ 79.27
[INFO] $ 253.16
[INFO] $ 41.31
[INFO] $ 138.98
[INFO] $ 165.6
[INFO] $ 42.3
[INFO] $ 67.08
[INFO] $ 40.55
[INFO] $ 42.34
[INFO] $ 48.9
[INFO] $ 38.29
[INFO] $ 42.26
[INFO] $ 40.24
[INFO] $ 47.1
[INFO] $ 44.04
[INFO] $ 48.56
[INFO] $ 79.56
[INFO] $ 57.5
[INFO] $ 63.87