In [None]:
import os
import sys
import warnings

# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import mlflow
import numpy as np
import pandas as pd
# from pandas_profiling import ProfileReport

from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the local config file. The context
# manager closes the file handle instead of leaving it open.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# Working directory as exported by the shell; None when PWD is not set.
ROOT = os.environ.get('PWD')

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')






# Data cleaning and feature engineering

In [None]:
# Load the tiny_master table from its CSV export.

tiny_master = pd.read_csv('../data/tiny_master.csv')

In [None]:
# Remove the stored index column and the two view counters from tiny_master (in place).
tiny_master.drop(
    columns=['Unnamed: 0', 'result_views', 'expose_views'],
    inplace=True,
)

In [None]:
# Columns parsed as datetimes; adjust this list to match the columns a
# particular model keeps.
dates = ['arrival_date', 'departure_date', 'date']

for date_column in dates:
    tiny_master[date_column] = pd.to_datetime(tiny_master[date_column])

In [None]:
# Derive the calendar year and month of arrival from a single DatetimeIndex.
arrival_index = pd.DatetimeIndex(tiny_master['arrival_date'])
tiny_master['year'] = arrival_index.year
tiny_master['month'] = arrival_index.month

In [None]:
# Load the corrected result views and expose views, aggregated by month, which are merged into tiny_master further down

df_stats_extract = pd.read_csv('../data/statistic_id_month_20210721.csv')

In [None]:
# Count missing values per column in the monthly statistics extract.
df_stats_extract.isna().sum()

In [None]:
# Remove the stored index column from the statistics extract (in place).
df_stats_extract.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Attach the monthly statistics to every tiny_master row. The right join keeps
# all tiny_master rows, including those without a statistics match.
tiny_master_plus = pd.merge(
    df_stats_extract,
    tiny_master,
    on=['listing_id', 'month', 'year'],
    how='right',
)

In [None]:
# Keep only the rows whose arrival year is 2019.
is_2019 = tiny_master_plus['year'] == 2019
tiny_master_2019 = tiny_master_plus[is_2019]

In [None]:
# Keep only the rows whose arrival year is 2019 or 2020 (any other year is dropped).
is_2019_or_2020 = (tiny_master_plus['year'] == 2019) | (tiny_master_plus['year'] == 2020)
tiny_master_not_2021 = tiny_master_plus[is_2019_or_2020]

In [None]:
# Show the last rows of the 2019-2020 subset as a quick visual check.
tiny_master_not_2021.tail()

In [None]:
# Count missing values per column in the 2019-2020 subset.
tiny_master_not_2021.isna().sum()

In [None]:
# Save the 2019-2020 subset as the new tiny master.
# The DataFrame index is written too (index=False is not passed), so it comes
# back as an 'Unnamed: 0' column when the file is read again.

tiny_master_not_2021.to_csv('../data/new_tiny_master.csv')

In [None]:
# Read the new tiny master back from disk.
# NOTE(review): the path contains a doubled slash ('data//'); it resolves on
# POSIX systems but should probably be cleaned up.

new_tiny_master = pd.read_csv('../data//new_tiny_master.csv')

In [None]:
# Extract the rows whose arrival year is 2019.
# An explicit copy is taken because later cells add columns to this frame;
# writing into a filtered selection would otherwise trigger pandas'
# SettingWithCopyWarning and may not behave as intended.

tiny_master_19 = new_tiny_master.query("year == 2019").copy()

In [None]:
import math

# Months repeat with a fixed period of 12, so the angle is scaled by 12 and
# not by the largest month present in the data. With the data-dependent
# maximum, a dataset that lacks December would get a distorted encoding that
# is inconsistent with other datasets.
MONTHS_PER_YEAR = 12

# Map the month number onto the 0-2π cycle ...
tiny_master_19["month_norm"] = 2 * math.pi * tiny_master_19["month"] / MONTHS_PER_YEAR

# ... and represent it by its cosine and sine so that December and January
# end up close to each other.
tiny_master_19["cos_month"] = np.cos(tiny_master_19["month_norm"])
tiny_master_19["sin_month"] = np.sin(tiny_master_19["month_norm"])

In [None]:
# Inspect the month angle column.
tiny_master_19["month_norm"]

In [None]:
# Price of the inquiry divided by the number of days of the stay.
# NOTE(review): a length_stay of 0 would produce inf here - confirm it cannot occur.
inquiry_price = tiny_master_19['inquiry_price']
length_stay = tiny_master_19['length_stay']
tiny_master_19['price_per_day'] = inquiry_price / length_stay

In [None]:
# Mean, max and min of price_per_day for every (listing_id, month) pair.
# The column is selected BEFORE aggregating: aggregating the whole frame first
# computes mean/max/min for every column, which is wasted work and raises a
# TypeError on non-numeric columns in pandas >= 2.0.
# The result has the columns listing_id, month, mean, max, min.

medium_master_price_agg = (
    tiny_master_19
    .groupby(['listing_id', 'month'])['price_per_day']
    .agg(['mean', 'max', 'min'])
    .reset_index()
)

In [None]:
# Inspect the per-listing, per-month price aggregates.
medium_master_price_agg

In [None]:
# Number of non-null price_per_day values (i.e. inquiries) for every
# (listing_id, month) pair. The column is selected before aggregating so that
# only this one column is counted instead of every column in the frame.
# The result has the columns listing_id, month, count.
medium_master_inquiry_count = (
    tiny_master_19
    .groupby(['listing_id', 'month'])['price_per_day']
    .agg(['count'])
    .reset_index()
)

In [None]:
# Inspect the per-listing, per-month inquiry counts.
medium_master_inquiry_count

In [None]:
# Attach the monthly price aggregates to every 2019 row (right join keeps all rows of tiny_master_19).
medium_master_1 = pd.merge(
    medium_master_price_agg,
    tiny_master_19,
    on=['listing_id', 'month'],
    how='right',
)

In [None]:
# Remove the index column that was picked up when the CSV was read back (in place).
medium_master_1.drop(columns='Unnamed: 0', inplace=True)

In [None]:
# Attach the monthly inquiry counts (right join keeps all rows of medium_master_1).
medium_master_2 = pd.merge(
    medium_master_inquiry_count,
    medium_master_1,
    on=['listing_id', 'month'],
    how='right',
)

In [None]:
# Remove the raw price and the calendar helper columns (in place).
medium_master_2.drop(
    columns=['inquiry_price', 'month', 'month_norm', 'year', 'yearmonth'],
    inplace=True,
)

In [None]:
# Remove the per-row price_per_day; only its monthly aggregates stay in the table (in place).
medium_master_2.drop(columns=['price_per_day'], inplace=True)

In [None]:
# Give the generic aggregate columns descriptive names (in place).
aggregate_column_names = {
    'count': 'inquiry_count',
    'mean': 'mean_price_per_day',
    'max': 'max_price_per_day',
    'min': 'min_price_per_day',
}
medium_master_2.rename(columns=aggregate_column_names, inplace=True)

In [None]:
# List the column names that are left at this point.
list(medium_master_2.columns)

In [None]:
# Remove the date columns and the two holiday-option columns (in place).
medium_master_2.drop(
    columns=[
        'date',
        'arrival_date',
        'departure_date',
        'option_holiday_with_your_dog',
        'option_holiday_with_your_horse',
    ],
    inplace=True,
)

In [None]:
# One-hot encode the two categorical columns.

categorical_columns = ['subregion', 'property_type']
master_dummies = pd.get_dummies(medium_master_2, columns=categorical_columns)

In [None]:
# Save the encoded table; the index is written too because index=False is not passed.
master_dummies.to_csv('../data/master_with_dummies.csv')

In [None]:
# Target: number of inquiries. Features: every other column except the listing identifier.
Y = master_dummies['inquiry_count']
X = master_dummies.drop(columns=['listing_id', 'inquiry_count'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# we love to normalize our data!!

from sklearn.preprocessing import MinMaxScaler


In [None]:
# Numeric feature columns that are rescaled with MinMaxScaler.
the_list = [
    'result_views',
    'expose_views',
    'adult_count',
    'children_count',
    'pets_count',
    'length_stay',
    'bathrooms',
    'bedrooms',
    'max_guests',
    'living_area',
]

In [None]:
# Min-max scaling: the scaler is fitted on the training rows only and then
# applied unchanged to both the training and the test rows.
scaler_norm = MinMaxScaler()
scaler_norm.fit(X_train[the_list])
X_train_scaled_norm = scaler_norm.transform(X_train[the_list])
X_test_scaled_norm = scaler_norm.transform(X_test[the_list])

In [None]:
# Put the scaled columns first and append the remaining, unscaled columns to their right.
X_train_remaining = X_train.drop(columns=the_list)
X_test_remaining = X_test.drop(columns=the_list)
X_train_preprocessed_norm = np.concatenate((X_train_scaled_norm, X_train_remaining), axis=1)
X_test_preprocessed_norm = np.concatenate((X_test_scaled_norm, X_test_remaining), axis=1)

In [None]:
# Sanity check: (rows, columns) of the training matrix.
X_train_preprocessed_norm.shape

In [None]:
# Sanity check: (rows, columns) of the test matrix.
X_test_preprocessed_norm.shape

## Training the model and tracking with MLflow

In [None]:
# Point MLflow at the tracking server, select the experiment and open a run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
# start_run() returns the run that has just become the active one.
run = mlflow.start_run()

In [None]:
# Show the id of the run opened above.
print(f"Active run_id: {run.info.run_id}")

In [None]:
# Count missing values per column in the encoded table.
# NOTE(review): LinearRegression does not accept NaN, so any non-zero count in a feature column needs handling before fitting.
master_dummies.isna().sum()

In [None]:
# Fit an ordinary least-squares baseline on the preprocessed training data.
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(
    X=X_train_preprocessed_norm,
    y=y_train,
)

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE on the training data.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) works on every version.
y_train_pred = reg.predict(X_train_preprocessed_norm)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(rmse_train)

In [None]:
# RMSE on the held-out test data.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) works on every version.
y_test_pred = reg.predict(X_test_preprocessed_norm)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(rmse_test)

In [None]:
# Run configuration that is logged to MLflow alongside the metrics.
params = {
    # share of the rows held out for testing, in percent
    "train_test_split": 30,
    # the numeric features were min-max scaled
    "normalized data": 'yes',
    # evaluation metric reported for this run
    "metric": 'rmse',
}

In [None]:
# Log the run configuration, a descriptive tag and both RMSE values, then close the run.
mlflow.log_params(params)
mlflow.set_tag("running_from_jupyter", "baseline model 2019")
# The metric names evaluate to "train -RMSE" and "test -RMSE".
mlflow.log_metric("train -" + "RMSE", rmse_train)
mlflow.log_metric("test -" + "RMSE", rmse_test)
# Artifact and model logging are currently switched off:
# mlflow.log_artifact("../models")
# mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
# Fetch the run that was just closed to inspect what has been logged for it.
mlflow.get_run(run_id=run.info.run_id)

## Checking the experiments

While the next cell is running, you will not be able to run other cells in the notebook.

In [None]:
# Start the MLflow web UI from the notebook; the command keeps running, so the kernel stays busy until it is interrupted.
!mlflow ui