# This notebook implements the feature engineering and the basic code needed to develop the baseline model and to track it with MLflow

In [None]:
import os
import sys
import warnings

# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import mlflow
import numpy as np
import pandas as pd
# from pandas_profiling import ProfileReport

from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the `.mlflow_uri` file one folder up.
# The context manager closes the file handle as soon as the content is read.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# Value of the PWD environment variable; None when that variable is not set.
ROOT = os.environ.get('PWD')

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')






# Data cleaning and feature engineering

In [None]:
# Remove pandas' cap on the number of displayed rows. Series with one entry per
# column (such as the per-column summaries below) are then printed in full,
# which is what makes every column visible.

pd.options.display.max_rows = None

In [None]:
# Load the master data set from CSV (relative path, one folder up from here).

master = pd.read_csv('../data/super_master.csv')

In [None]:
# Show every column name of the freshly loaded frame.
master.columns.tolist()

In [None]:
# Columns that are not needed downstream: the 'Unnamed: ...' columns
# (presumably index artifacts of earlier CSV exports - TODO confirm) and the arrival date.
drop_list = ['Unnamed: 0','Unnamed: 0.1','Unnamed: 0_x','Unnamed: 0_y','arrival_date']

# Remove them all at once, directly on `master`.
master.drop(columns=drop_list, inplace=True)

In [None]:
# Sanity check after the drop: number of distinct values in each remaining column.

master.nunique()

In [None]:
# Select the filter that is appropriate for the data range.
# NOTE: the range is not automatically reflected in the name of the dataframe --
# but it should be put into the parameters for MLflow.

# By default, the master set has both years, 2019 and 2020, so it needs no filter.

# master_filter = master

# To keep only 2019 data. `.copy()` turns the subset into an independent frame, so
# the columns added to it further down do not trigger pandas' SettingWithCopy warning.
master_filter = master.query('year == 2019').copy()

# To keep only 2020 data
# master_filter = master.query('year == 2020').copy()

In [None]:
# Inspect the last rows of the filtered frame.
master_filter.tail()

In [None]:
# Count the missing values in each column of the filtered frame.
master_filter.isna().sum()

## the data set has no missing values

# We now encode the month as a cyclical feature (sine and cosine) so that the model can account for seasonality



In [None]:
import math

# Map the month onto the 0-2π cycle and store its cosine and sine.
# The period is fixed at 12 months rather than taken from the largest month present
# in the data, so the encoding stays correct when the subset stops before December.
# Assumes `month` holds calendar months 1-12 - TODO confirm against the data.
MONTHS_PER_YEAR = 12

month_angle = 2 * math.pi * master_filter["month"] / MONTHS_PER_YEAR

# `assign` returns a new frame, which also avoids writing into a filtered view.
master_filter = master_filter.assign(
    month_norm=month_angle,
    cos_month=np.cos(month_angle),
    sin_month=np.sin(month_angle),
)

In [None]:
# Check the encoding with one row per distinct month instead of printing the whole
# column (the display row limit is switched off, so the full column would be dumped).
master_filter[["month", "month_norm"]].drop_duplicates().sort_values("month")

In [None]:
# Save the filtered frame, including the month encoding, to CSV.
# NOTE: the row index is written as well (no `index=False`).

master_filter.to_csv('../data/master_filter.csv')

In [None]:
# Read the filtered frame with the month encoding back from CSV.
# NOTE(review): the index saved with the file presumably comes back as an extra 'Unnamed: 0' column - confirm.
master_filter = pd.read_csv('../data/master_filter.csv')

In [None]:
# Generate the feature mean_price_per_day: the average daily price per listing and month.
#
# The price column is selected before aggregating, so only that column is averaged.
# Aggregating the whole frame first would average every column, which is wasted work
# and fails on text columns in recent pandas versions.
# The result has the columns listing_id, month and mean.

master_filter_price_agg = (
    master_filter
    .groupby(['listing_id', 'month'])['filled_in_price_per_day']
    .agg(['mean'])
    .reset_index()
)

In [None]:
# Attach the aggregated price to every row of the filtered set.
# A right join keeps all rows of `master_filter`.

merge_keys = ['listing_id', 'month']
master_filter_1 = master_filter_price_agg.merge(master_filter, how='right', on=merge_keys)

In [None]:
# Give the aggregated column a name that is easier to recognize.

master_filter_1 = master_filter_1.rename(columns={'mean': 'mean_price_per_day'})

In [None]:
# Verify the rename by listing all column names.

master_filter_1.columns.tolist()

In [None]:
# Columns the baseline model does not use: the raw daily price (replaced by its
# mean), the min/max prices, the raw time columns and a number of option flags.

drop_list = [
    'filled_in_price_per_day',
    'option_wheelchair_accessible',
    'option_family_travel',
    'option_close_to_the_water',
    'option_holiday_with_your_baby',
    'option_long_term_holiday',
    'option_fully_accessible',
    'option_technicians',
    'option_close_to_the_ski_lift',
    'max_price_per_day',
    'min_price_per_day',
    'month_norm',
    'year',
    'month',
    'option_holiday_with_your_dog',
    'option_holiday_with_your_horse',
]

# Remove them all at once, directly on `master_filter_1`.
master_filter_1.drop(columns=drop_list, inplace=True)

In [None]:
# Save to CSV if desired: this is the filtered set with the month encoding and the
# aggregated price for the years in question.
# NOTE: the row index is written as well (no `index=False`).

master_filter_1.to_csv('../data/master_filter_price_agg.csv')

In [None]:
# Read the saved set back from CSV, e.g. to continue from here after a kernel
# restart in case memory proved to be an issue.

master_filter_1 = pd.read_csv('../data/master_filter_price_agg.csv')

In [None]:
# Convert the property type into the binary flag `prop_bin`.
# Every row starts at 1 here; rows of type 'holiday_apartment' are set to 0 in the
# following cell, so any other property type ends up as 1.

# NOTE: this result is not displayed, because it is not the cell's last expression.
master_filter_1['property_type'].unique()
master_filter_1['prop_bin'] = 1

In [None]:
# Holiday apartments get the flag value 0.
is_apartment = master_filter_1['property_type'] == 'holiday_apartment'
master_filter_1.loc[is_apartment, 'prop_bin'] = 0

In [None]:
# Inspect the first rows to check the new `prop_bin` column.
master_filter_1.head()

In [None]:
# Drop the raw property type (now encoded in `prop_bin`) and every index artifact.
# Each CSV save/load round-trip that includes the index adds another 'Unnamed: ...'
# column ('Unnamed: 0', 'Unnamed: 0.1', ...). Dropping only 'Unnamed: 0' would leave
# the others in the data, where they would end up as model features.
drop_list = ['property_type']
drop_list += [col for col in master_filter_1.columns if str(col).startswith('Unnamed:')]

master_filter_1 = master_filter_1.drop(columns=drop_list)

In [None]:
# List the distinct values of `title` to confirm it needs dummy encoding.

master_filter_1['title'].unique()

In [None]:
# One-hot encode the categorical columns; all other columns pass through unchanged.

dummy_columns = [
    'holiday_region',
    'title',
    'option_allergic',
    'option_non_smoking_only',
    'option_holiday_with_your_pet',
    'option_close_to_the_beach',
]
master_dummies = pd.get_dummies(master_filter_1, columns=dummy_columns)

In [None]:
# Save to CSV if desired: this is the set after dummy encoding.
# NOTE: the row index is written as well (no `index=False`).

master_dummies.to_csv('../data/master_with_dummies.csv')

In [None]:
# Read the dummy-encoded set back from the file written by the save step above.
# NOTE(review): this previously read 'master_with_dummies_1.csv', which no cell in
# this notebook writes - confirm that no separately prepared file was intended.
# `index_col=0` restores the saved row index instead of importing it as an extra
# 'Unnamed: 0' column that would end up among the features.

master_dummies_1 = pd.read_csv('../data/master_with_dummies.csv', index_col=0)


In [None]:
# Continue with the frame that was read back from CSV under the name `master_dummies`.
master_dummies = master_dummies_1

In [None]:
# Free up memory: remove the name of the intermediate frame that is no longer
# needed and ask the garbage collector to run.
# NOTE: `del` raises a NameError if this cell is run a second time.

import gc
del master_filter_1
gc.collect()

# move now to do the train test split and then the model

In [None]:
# Target: the inquiry count. Features: everything except the target and the listing identifier.

Y = master_dummies['inquiry_count']
X = master_dummies.drop(columns=['listing_id', 'inquiry_count'])

In [None]:
# train test split

from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# we love to normalize our data!!

from sklearn.preprocessing import MinMaxScaler

# Numeric columns that are rescaled with the MinMaxScaler.
the_list = [
    'mean_price_per_day',
    'adult_count',
    'children_count',
    'pets_count',
    'length_stay',
    'bathrooms',
    'bedrooms',
    'max_guests',
    'living_area',
]


In [None]:
# Min-max scaling. The scaler is fitted on the training rows only and then applied
# to both sets, so that no information from the test set leaks into the scaling.
scaler_norm = MinMaxScaler().fit(X_train[the_list])
X_train_scaled_norm = scaler_norm.transform(X_train[the_list])
X_test_scaled_norm = scaler_norm.transform(X_test[the_list])

In [None]:
# Put the scaled columns back together with the columns that were not scaled.
# The result is a plain array whose first columns are the scaled ones.
X_train_unscaled = X_train.drop(columns=the_list)
X_test_unscaled = X_test.drop(columns=the_list)

X_train_preprocessed_norm = np.concatenate((X_train_scaled_norm, X_train_unscaled), axis=1)
X_test_preprocessed_norm = np.concatenate((X_test_scaled_norm, X_test_unscaled), axis=1)

In [None]:
# Confirm things look as they should: both sets must have the same number of columns.
# Only a cell's last expression is displayed, so both shapes are returned as one tuple.

X_train_preprocessed_norm.shape, X_test_preprocessed_norm.shape

## Training the model and tracking with MLflow

In [None]:
# Point MLflow at the tracking server, select the experiment and open a run.
# The run stays active until `mlflow.end_run()` is called further down.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
run = mlflow.start_run()

In [None]:
# Show the id of the run that was just opened.
print(f"Active run_id: {run.info.run_id}")

In [None]:
#training the model
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares fitted on the preprocessed training data.
reg = LinearRegression()
reg = reg.fit(X_train_preprocessed_norm, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE on the training set.
# The square root is taken explicitly: the `squared=False` argument of
# `mean_squared_error` is deprecated and no longer available in recent
# scikit-learn releases.
y_train_pred = reg.predict(X_train_preprocessed_norm)
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_train_pred)))
print(rmse_train)

In [None]:
# RMSE on the held-out test set.
# The square root is taken explicitly: the `squared=False` argument of
# `mean_squared_error` is deprecated and no longer available in recent
# scikit-learn releases.
y_test_pred = reg.predict(X_test_preprocessed_norm)
rmse_test = float(np.sqrt(mean_squared_error(y_test, y_test_pred)))
print(rmse_test)

In [None]:
# Parameters logged with the MLflow run.
# NOTE: these values are entered by hand; keep them in line with the year filter
# and the split size used above.

params = dict([
    ("model", "linear regression"),
    ("year", '2019'),
    ("train_test_split", 30),
    ("normalized data", 'yes'),
    ("metric", 'rmse'),
])

In [None]:
# Log parameters, a tag and both RMSE values to the active run, then close it.
mlflow.log_params(params)
mlflow.set_tag("running_from_jupyter", "baseline model 2019")
for metric_name, metric_value in (("train -RMSE", rmse_train), ("test -RMSE", rmse_test)):
    mlflow.log_metric(metric_name, metric_value)
# Artifact and model logging are currently switched off:
# mlflow.log_artifact("../models")
# mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
# Fetch the finished run from the tracking server to check what was logged.
mlflow.get_run(run_id=run.info.run_id)

## Checking the experiments

While the next cell is running, you will not be able to run other cells in the notebook.

In [None]:
# !mlflow ui