# This notebook serves to create a model using RandomForest Regressor for our TFW data

In [None]:
import os
import sys
import warnings

# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
# from pandas_profiling import ProfileReport
import seaborn as sns
from sklearn.svm import SVR

from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the file one folder above the notebook.
# The context manager closes the file handle immediately instead of leaving
# it open for the lifetime of the kernel.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# Value of the PWD environment variable (None when the variable is not set)
ROOT = os.environ.get('PWD')

# Suppress all warnings for the rest of the notebook
warnings.filterwarnings('ignore')

## Data cleaning and feature engineering

In [None]:
# Load the master table from its CSV file in the data folder
MASTER_CSV = '../data/excellent_master.csv'
master = pd.read_csv(MASTER_CSV)

Because the data for 2019 and 2020 differ substantially (corona pandemic), we decided to use only the year 2019.

In [None]:
# Keep only the rows whose 'year' is 2019
is_2019 = master['year'].eq(2019)
master = master.loc[is_2019]

In [None]:
# Drop the 'Unnamed: *' columns (presumably index columns left over from
# earlier CSV exports - confirm) together with 'year' and 'arrival_date'
obsolete_columns = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0_x',
    'Unnamed: 0_y',
    'year',
    'arrival_date',
]
master = master.drop(columns=obsolete_columns)

Feature engineering: compute the mean price per day for each property (listing) and month

In [None]:

# generate the feature mean_price_per_day:
# average 'filled_in_price_per_day' for every (listing_id, month) pair.
# The price column is selected *before* aggregating so that only this one
# column is averaged. Aggregating the whole frame first and selecting the
# column afterwards also tries to average every other column, including text
# columns such as 'property_type' - wasted work that fails on recent pandas
# versions. The result is the same frame: listing_id, month, mean.
master_filter_price_agg = (
    master.groupby(['listing_id', 'month'])['filled_in_price_per_day']
    .agg(['mean'])
    .reset_index()
)

In [None]:
# Attach the per-listing/month mean to every row of master; the right join
# keeps all rows of master
join_keys = ['listing_id', 'month']
master = master_filter_price_agg.merge(
    master,
    how='right',
    left_on=join_keys,
    right_on=join_keys,
)

In [None]:
# Rename the aggregated 'mean' column to mean_price_per_day
master = master.rename(columns={'mean': 'mean_price_per_day'})

In [None]:
# Columns that should not be part of the modelling data set
drop_list = [
    'option_holiday_with_your_pet',
    'option_holiday_with_your_horse',
    'filled_in_price_per_day',
    'option_technicians',
]

# Remove them all in a single call
master = master.drop(columns=drop_list)

In [None]:
# Encode the property type as a binary flag:
# 'holiday_apartment' -> 0, every other value -> 1
is_not_apartment = master['property_type'] != 'holiday_apartment'
master['prop_bin'] = is_not_apartment.astype('int64')

# The original text column is no longer needed once the flag exists
master = master.drop(columns=['property_type'])

In [None]:
# One-hot encode the categorical features
categorical_columns = [
    'holiday_region',
    'title',
    'option_allergic',
    'option_non_smoking_only',
    'option_holiday_with_your_dog',
    'option_close_to_the_beach',
    'option_wheelchair_accessible',
    'option_family_travel',
    'option_close_to_the_water',
    'option_holiday_with_your_baby',
    'option_long_term_holiday',
    'option_fully_accessible',
    'option_close_to_the_ski_lift',
]
master_dummies = pd.get_dummies(master, columns=categorical_columns)

## Split dataset in train and test

In [None]:
# Split the encoded table into the target (inquiry_count) and the features;
# listing_id is excluded from the features as well
target_column = 'inquiry_count'
Y = master_dummies[target_column]
X = master_dummies.drop(columns=['listing_id', target_column])

In [None]:
# Hold out 30% of the rows as test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

## Training the model and tracking with MLFlow

In [None]:
# Point MLflow at the tracking server and the experiment, then open a run.
# start_run() returns the run it just made active, so the handle is kept directly.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
run = mlflow.start_run()

In [None]:
# Show the id of the run that is currently active
print(f"Active run_id: {run.info.run_id}")

In [None]:
# import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest configuration: 100 trees, tree depth capped at 10, at least
# 10 samples required to split a node, fixed seed, all CPU cores, progress output
forest_config = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
forest = RandomForestRegressor(**forest_config)

# Train the forest on the training split
forest.fit(X_train, y_train)

In [None]:
# Calculate the metrics: R squared (on the training set) and RMSE (train and test)
from sklearn.metrics import mean_squared_error

# R squared of the fitted forest on the data it was trained on
score = forest.score(X_train, y_train)
y_pred_train = forest.predict(X_train)
y_pred_test = forest.predict(X_test)

# RMSE is computed as the square root of the MSE instead of passing
# squared=False: that keyword is deprecated in newer scikit-learn releases and
# removed in later ones, while this form works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("R-squared:", score)
print("RMSE train:", rmse_train)
print("RMSE test:", rmse_test)

## Most important features

In [None]:
# Looking for the most important features
import shap
import matplotlib.pyplot as pl

# SHAP values for (at most) the first 10,000 rows of the training set
shap_values = shap.TreeExplainer(forest).shap_values(X_train.iloc[:10000,:])
shap_values.shape


# %%
# Global importance of a feature = mean absolute SHAP value over the explained rows.
# The result is cut to the number of feature columns rather than with [:-1]:
# current SHAP releases return exactly one column per feature for a regressor,
# so [:-1] silently removed the last feature from the ranking. Slicing by the
# feature count keeps every feature and still discards a trailing bias column
# if an old SHAP version appends one.
global_importances = np.abs(shap_values).mean(0)[:X_train.shape[1]]


# %%
# make a bar chart that shows the global importance of the top 10 features
# (fewer bars when the model has fewer than 10 features)
top_n = min(10, len(global_importances))
inds = np.argsort(-global_importances)
f = pl.figure(figsize=(5,10))
y_pos = np.arange(top_n)
# reversed so that the most important feature ends up at the top of the chart
inds2 = np.flip(inds[:top_n], 0)
pl.barh(y_pos, global_importances[inds2], align='center', color="#1E88E5")
pl.yticks(y_pos, fontsize=13)
pl.gca().set_yticklabels(X_train.columns[inds2])
pl.xlabel('mean abs. SHAP value (impact on model output)', fontsize=13)
pl.gca().xaxis.set_ticks_position('bottom')
pl.gca().yaxis.set_ticks_position('none')
pl.gca().spines['right'].set_visible(False)
pl.gca().spines['top'].set_visible(False)

## Residual plots

In [None]:
# Residual plot: model residuals (actual - predicted) against the predictions.
# sns.residplot is deliberately not used: it fits its own regression of y on x
# and plots the residuals of *that* fit, so feeding it the model residuals
# removes any linear trend from them and hides systematic over-/under-prediction.
residuals = y_test - y_pred_test
plt.scatter(x=y_pred_test, y=residuals)
# zero line = perfect prediction
plt.axhline(0, color='orange')
plt.xlabel('Predicted')
plt.ylabel('Residual (actual - predicted)')
plt.title('Residuals VS Predicted Inquiries')
plt.show()

In [None]:
# Scatterplot actual vs predicted values
plt.scatter(x=y_test, y=y_pred_test)

# Reference line y = x (perfect prediction). It spans the combined range of
# actual and predicted values; a line drawn from the predictions alone stops
# at the largest prediction and does not cover the full range of actual values.
axis_min = min(y_test.min(), y_pred_test.min())
axis_max = max(y_test.max(), y_pred_test.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max], color='orange')

plt.xlabel('Actual')
plt.ylabel('predicted')
plt.title('Actual VS Predicted Inquiries')
plt.show()

In [None]:
# Parameters logged to MLflow for this run. They describe the model this
# notebook actually trains (a RandomForestRegressor); the previous values
# described an SVR and mislabelled the run.
# Keep "RandomForest params" in sync with the RandomForestRegressor(...) call.
params = {
      "model":"RandomForestRegressor",
      "RandomForest params":"n_estimators=100, max_depth=10, min_samples_split=10, random_state=42",
      "year":'2019',
      "train_test_split": 30,
      # no scaling step is applied anywhere in this notebook
      # NOTE(review): confirm the input CSV is not already normalized upstream
      "normalized data": 'no',
      "metric": 'rmse and R2'
  }

In [None]:
# Log parameters, tag and metrics to the active MLflow run, then close the run
mlflow.log_params(params)
# the tag names the model trained in this notebook (random forest, not SVR)
mlflow.set_tag("running_from_jupyter", "RandomForest model 2019")
mlflow.log_metric("train -" + "RMSE", rmse_train)
mlflow.log_metric("test -" + "RMSE", rmse_test)
# the training R squared is printed with the metrics, so it is logged as well
mlflow.log_metric("train -" + "R2", score)
# mlflow.log_artifact("../models")
# mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
# Fetch the finished run back from the tracking server to inspect what was logged
finished_run_id = run.info.run_id
mlflow.get_run(finished_run_id)