# This notebook prepares the data and trains a LightGBM regression model for TFW, tracking the run with MLflow

In [None]:
import os
import sys
import warnings

# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import mlflow
import numpy as np
import pandas as pd
# from pandas_profiling import ProfileReport

from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the dot-file one folder up. The context
# manager closes the file handle as soon as the text has been read instead of
# leaving that to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# Value of the PWD environment variable; None when the variable is not set.
ROOT = os.environ.get('PWD')

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')






# Data cleaning and feature engineering

In [None]:
# Lift the row cap on displayed output so that per-column summaries
# (one output row per dataframe column) are shown in full, not truncated.
pd.options.display.max_rows = None

In [None]:
# Load the master data set from the project's data folder.
master_csv_path = '../data/excellent_master.csv'
master = pd.read_csv(master_csv_path)

In [None]:
# Sanity check: number of rows in the raw master set.
master.shape[0]

In [None]:
# All column names of the raw master set, as a plain list.
master.columns.tolist()

In [None]:
# Drop columns that are not needed for modelling. The 'Unnamed: ...' entries
# are presumably index columns left over from earlier CSV round-trips (TODO confirm).
drop_list = ['Unnamed: 0','Unnamed: 0.1','Unnamed: 0_x','Unnamed: 0_y','arrival_date']

# One drop() call is all-or-nothing: a missing label raises KeyError before
# anything is removed, whereas a per-column in-place loop could leave the
# frame partially stripped.
master = master.drop(columns=drop_list)

In [None]:
# Number of distinct (non-missing) values in every column.
master.nunique(axis=0, dropna=True)

In [None]:
# Choose the data range used for this run.
# NOTE: the dataframe name does not encode the range, so record the range in
# the MLflow parameters as well.
#
# The master set holds both years, 2019 and 2020. reset_index() keeps the old
# row labels in an 'index' column, which a later cell drops again.
#
# both years:  master_filter = master
# 2020 only:   master_filter = master.query('year == 2020')

# 2019 only
is_2019 = master['year'] == 2019
master_filter = master[is_2019].reset_index()

In [None]:
# Count of missing values per column in the filtered set.
master_filter.isnull().sum()

## the data set has no missing values

In [None]:
# generate the feature mean_price_per_day: the mean of filled_in_price_per_day
# for every (listing_id, month) pair.
#
# The price column is selected *before* aggregating, so only that column is
# averaged. Averaging every column first is wasted work and raises a TypeError
# on string columns in pandas 2.x. The result keeps the same layout as before:
# columns 'listing_id', 'month' and 'mean'.
master_filter_price_agg = (
    master_filter
    .groupby(['listing_id','month'])['filled_in_price_per_day']
    .agg(['mean'])
    .reset_index()
)

In [None]:

# Attach the monthly mean price to the filtered set. how='right' keeps every
# row of master_filter; the aggregate columns come first in the result.
join_keys = ['listing_id', 'month']
master_filter_1 = master_filter_price_agg.merge(master_filter, on=join_keys, how='right')

In [None]:
# Give the aggregated price column a descriptive name.
master_filter_1 = master_filter_1.rename(columns={'mean': 'mean_price_per_day'})

In [None]:
# Check the merged set: number of distinct (non-missing) values per column.
master_filter_1.nunique(axis=0, dropna=True)

In [None]:
# Row count after the merge.
master_filter_1.shape[0]

In [None]:
# Column names after the merge, as a plain list.
master_filter_1.columns.tolist()

In [None]:
# drop more things that need not be in the data set
drop_list = ['year',
 'option_holiday_with_your_pet','option_holiday_with_your_horse']

# One drop() call is all-or-nothing: a missing label raises KeyError before
# anything is removed, whereas a per-column in-place loop could leave the
# frame partially stripped.
master_filter_1 = master_filter_1.drop(columns=drop_list)

In [None]:
# Remove the 'index' column that reset_index() added when the year filter was applied.
master_filter_1 = master_filter_1.drop(columns=['index'])

In [None]:
# save to csv if desired: this is the normed, filtered set with agg price data for the years in question
#
# index=False: with the default, the row index is written as an extra unnamed
# column. The file is read back with a plain read_csv afterwards, which would
# surface that column as 'Unnamed: 0' and carry it into the feature matrix.
master_filter_1.to_csv('../data/master_filter_price_agg.csv', index=False)

In [None]:
# Reload the saved, filtered set so the notebook can continue from the CSV.
price_agg_csv_path = '../data/master_filter_price_agg.csv'
master_filter_1 = pd.read_csv(price_agg_csv_path)

In [None]:
# Distinct property types present in the data.
pd.unique(master_filter_1['property_type'])

In [None]:
# Binary encoding of the property type: holiday apartment = 0, everything
# else = 1. Start with 1 for every row; apartments are set to 0 separately.
master_filter_1 = master_filter_1.assign(prop_bin=1)


In [None]:
# Flag holiday apartments with 0 in the binary property column.
is_apartment = master_filter_1['property_type'] == 'holiday_apartment'
master_filter_1.loc[is_apartment, 'prop_bin'] = 0

In [None]:
# Peek at the first five rows.
master_filter_1.head(n=5)

In [None]:
# Remove columns that are no longer needed: the raw daily price (replaced by
# mean_price_per_day), the property type (replaced by prop_bin) and
# option_technicians.
no_longer_needed = ['filled_in_price_per_day', 'property_type', 'option_technicians']
master_filter_1 = master_filter_1.drop(columns=no_longer_needed)

In [None]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per distinct value.
categorical_columns = [
    'holiday_region',
    'title',
    'option_allergic',
    'option_non_smoking_only',
    'option_holiday_with_your_dog',
    'option_close_to_the_beach',
    'option_wheelchair_accessible',
    'option_family_travel',
    'option_close_to_the_water',
    'option_holiday_with_your_baby',
    'option_long_term_holiday',
    'option_fully_accessible',
    'option_close_to_the_ski_lift',
]
master_dummies = pd.get_dummies(master_filter_1, columns=categorical_columns)

In [None]:
# Column names after one-hot encoding. A DataFrame has no `columns_values`
# attribute; the names live in `.columns`.
list(master_dummies.columns.values)

In [None]:
# LGBM needs to have no unusual characters in the column names.
#
# Strip every underscore; later cells address the columns as 'listingid' and
# 'inquirycount'. The replacement is spelled as a literal with regex=False so
# it does not depend on the pandas version: "[_]" only works as a character
# class while str.replace defaults to regex=True, and pandas 2.0 changed that
# default to False, which would leave all names untouched.
# NOTE(review): other special characters that get_dummies may produce from
# category values are not removed here — confirm whether that is needed.
master_dummies.columns = master_dummies.columns.str.replace("_", "", regex=False)

In [None]:
# First two rows of the encoded set.
master_dummies.iloc[:2]

# Next: the train/test split, then the model

In [None]:
# Build the feature matrix X and the target Y for the train/test split.
# A hold-out set matters for LGBM in particular: it can overfit, and the fit
# has to be verified against data the model has not seen.
target_column = 'inquirycount'
Y = master_dummies[target_column]
X = master_dummies.drop(columns=['listingid', target_column])

In [None]:
# Split into 70 % training and 30 % hold-out data with a fixed seed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

HOLDOUT_FRACTION = 0.30
SPLIT_SEED = 42

split_parts = train_test_split(X, Y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

## Training the model and tracking with MLFlow

In [None]:
# Point MLflow at the tracking server and experiment, then open a run.
# start_run() hands back the run it has just made active.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
run = mlflow.start_run()

In [None]:
# Show the id of the run that was just started.
print(f"Active run_id: {run.info.run_id}")

In [None]:
#training the model
# !pip install --upgrade pip
# !pip install lightgbm

from lightgbm import LGBMRegressor


In [None]:
# set LGBM parameters and fit the regressor
#
# Logging and early stopping are configured through callbacks: LightGBM 4.x
# removed the `verbose` and `early_stopping_rounds` keyword arguments from
# fit(), so passing them raises a TypeError there.
from lightgbm import early_stopping, log_evaluation

clf = LGBMRegressor(
    n_estimators=400,
    learning_rate=0.01,
    colsample_bytree=.8,
    max_depth=10,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    # verbose=-1 already silences LightGBM's own messages, so the former
    # `silent=-1` argument is not needed.
    verbose=-1,
)

# NOTE(review): the hold-out set is also used for early stopping here, so the
# test RMSE is not a fully independent estimate — confirm this is intended.
clf.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='rmse',
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=100),           # print the metrics every 100 rounds
    ],
)

In [None]:
# install and use SHAP to check feature importance

# !pip install shap

import shap
import matplotlib.pyplot as pl

# Explain at most the first 10,000 training rows.
X_explain = X_train.iloc[:10000, :]
shap_values = shap.TreeExplainer(clf.booster_).shap_values(X_explain)

# Global importance = mean absolute SHAP value per feature.
# Keep exactly one column per feature: the former `[:-1]` removed the last
# entry unconditionally, which drops a real feature whenever shap_values has
# one column per feature. Slicing to the feature count still discards a
# trailing bias column should the installed SHAP version return one
# (TODO confirm against the pinned version).
n_features = X_explain.shape[1]
global_importances = np.abs(shap_values)[:, :n_features].mean(0)

# make a bar chart that shows the global importance of the top 10 features
# (fewer if the data set has fewer than 10 features)
top_n = min(10, n_features)
inds = np.argsort(-global_importances)
inds2 = np.flip(inds[:top_n], 0)  # reversed so the most important bar is on top
y_pos = np.arange(top_n)

f, ax = pl.subplots(figsize=(5, 10))
ax.barh(y_pos, global_importances[inds2], align='center', color="#1E88E5")
ax.set_yticks(y_pos)
ax.set_yticklabels(X_train.columns[inds2], fontsize=13)
ax.set_xlabel('mean abs. SHAP value (impact on model output)', fontsize=13)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('none')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

In [None]:
# plot residuals
#
# Everything this cell needs is created here: the pyplot alias, the hold-out
# predictions and the residuals. Only matplotlib and numpy are used.
import matplotlib.pyplot as plt

y_test_pred = clf.predict(X_test)
actual = np.asarray(y_test, dtype=float)
y_residual = actual - y_test_pred  # assumed definition: actual minus predicted

# Residuals of a straight-line fit of the predictions on the actual values
# (the quantity a regression residual plot of x=actual, y=predicted shows).
slope, intercept = np.polyfit(actual, y_test_pred, 1)
fit_residual = y_test_pred - (slope * actual + intercept)

fig, ax = plt.subplots()
ax.axhline(0, c=(.5, .5, .5), ls='--')
ax.scatter(x=actual, y=fit_residual)
ax.set_xlabel('Actual')
ax.set_ylabel('Residual of linear fit (predicted ~ actual)')
plt.show()

# Actual vs. predicted, with the identity line as reference.
plt.scatter(y=actual, x=y_test_pred)
plt.plot(y_test_pred, y_test_pred, color='orange')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Actual VS Predicted Inquiries')
plt.show()

# Residuals vs. predicted values.
plt.axhline(0, c=(.5, .5, .5), ls='--')
plt.axvline(0, c=(.5, .5, .5), ls='--')
plt.scatter(x=y_test_pred, y=y_residual)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.show()



In [None]:
# Parameters logged to MLflow to describe this run.
params = dict([
    ("model", "LightGBMBoost"),
    ("year", '2019'),              # data range selected by the year filter
    ("train_test_split", 30),      # hold-out share in percent
    ("normalized data", 'yes'),
    ("metric", 'rmse'),
])

In [None]:
# Compute the RMSE on both splits before logging; the metrics were logged
# without ever being calculated.
def _rmse(actual, predicted):
    """Return the root mean squared error between two equally long sequences."""
    diff = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)
    return float(np.sqrt(np.mean(diff ** 2)))

rmse_train = _rmse(y_train, clf.predict(X_train))
rmse_test = _rmse(y_test, clf.predict(X_test))

# Log parameters, tag and metrics, then close the run.
mlflow.log_params(params)
mlflow.set_tag("running_from_jupyter", "baseline model 2019")
mlflow.log_metric("train -" + "RMSE", rmse_train)
mlflow.log_metric("test -" + "RMSE", rmse_test)
# mlflow.log_artifact("../models")
# mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
# Fetch the stored record of the run that was just logged.
finished_run_id = run.info.run_id
mlflow.get_run(run_id=finished_run_id)

## Checking the experiments

While the next cell is running, you will not be able to run other cells in the notebook (interrupt the kernel to stop it).

In [None]:
!mlflow ui