## Import necessary libraries 


In [None]:

import gc
import os
import sys
import warnings

# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
# from pandas_profiling import ProfileReport
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the local dotfile. The context manager
# closes the file handle deterministically instead of leaving it open until
# garbage collection.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# Value of the PWD environment variable, or None when it is not set.
ROOT = os.environ.get('PWD')

# Silence all warnings for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')

## Importing Data

In [None]:
# Load the modelling table from CSV: per the original note, this is the normed,
# filtered set with aggregated price data for the years in question.
master_dummies = pd.read_csv(filepath_or_buffer='../data/master_with_dummies.csv')

## Modelling

In [None]:
# free up memory
# `master_filter_1` is never created in this notebook, so an unconditional
# `del` raises NameError. Drop the name only if it happens to exist (e.g. left
# over from an earlier session), then force a collection pass.
globals().pop('master_filter_1', None)
gc.collect()

In [None]:
# Build the modelling inputs: Y is the inquiry_count column, X is every other
# column except the listing_id identifier.
Y = master_dummies['inquiry_count']
X = master_dummies.drop(columns=['listing_id', 'inquiry_count'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Columns that are min-max scaled before modelling; every other column is
# passed through unchanged.
the_list = [
    'mean_price_per_day',
    'adult_count',
    'children_count',
    'pets_count',
    'length_stay',
    'bathrooms',
    'bedrooms',
    'max_guests',
    'living_area',
]

In [None]:
# Min-max scale the selected columns. The scaler is fitted on the training
# split only, and the same fitted scaler then transforms both splits.
scaler_norm = MinMaxScaler()
scaler_norm.fit(X_train[the_list])
X_train_scaled_norm = scaler_norm.transform(X_train[the_list])
X_test_scaled_norm = scaler_norm.transform(X_test[the_list])

In [None]:
# Rebuild the full feature matrices column-wise: the scaled columns come first,
# followed by all remaining (unscaled) columns of the same split.
X_train_preprocessed_norm = np.concatenate(
    (X_train_scaled_norm, X_train.drop(columns=the_list)),
    axis=1,
)
X_test_preprocessed_norm = np.concatenate(
    (X_test_scaled_norm, X_test.drop(columns=the_list)),
    axis=1,
)

In [None]:
# Display the (rows, columns) shape of the preprocessed training matrix
X_train_preprocessed_norm.shape

In [None]:
# Display the (rows, columns) shape of the preprocessed test matrix
X_test_preprocessed_norm.shape


## Training the model and tracking with MLflow


In [None]:
# Point MLflow at the tracking server, select the experiment, and open a run.
# start_run() hands back the run it has just made active, so that return value
# is kept as `run` directly.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
run = mlflow.start_run()

In [None]:
# Show the ID of the active MLflow run held in `run`
print("Active run_id: {}".format(run.info.run_id))

In [None]:
# Fit a linear regression on the preprocessed training features
reg1 = LinearRegression()
reg1.fit(X_train_preprocessed_norm, y_train)

In [None]:
# Evaluate the fitted model on the training split.
y_train_pred = reg1.predict(X_train_preprocessed_norm)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)
print(rmse_train)
print(r2_train)

In [None]:
# Evaluate the fitted model on the held-out test split.
y_test_pred = reg1.predict(X_test_preprocessed_norm)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)
print(rmse_test)
print(r2_test)

## Validating model and visualizing model and residuals

In [None]:
# Mean absolute error on the test split. It is printed explicitly because a
# notebook only echoes the last expression of a cell, so the bare call was
# computed and then silently discarded.
print(mean_absolute_error(y_test, y_test_pred))
# Mean R2 over a 10-fold cross-validation of the estimator on the full X / Y.
print(cross_val_score(reg1, X, Y, cv=10, scoring='r2').mean())

In [None]:
# Test-set residuals: actual values minus model predictions
y_residual = y_test - y_test_pred

In [None]:
# Training split: actual vs. predicted target values.
# NOTE(review): the original call referenced an undefined `y_rep` and passed the
# whole X_train frame as `x`, so it could not run. Actual-vs-predicted on the
# training split is the assumed intent; confirm.
sns.scatterplot(x=y_train, y=y_train_pred)


In [None]:
# Set matplotlib's Agg path chunk size to 10000 for the plots in this notebook
plt.rcParams.update({'agg.path.chunksize': 10000})

In [None]:
# NOTE(review): residplot fits its own regression of y on x and plots that
# fit's residuals, so this shows the residuals of predicted-vs-actual rather
# than the model residuals stored in y_residual. Confirm this is the intended plot.
sns.residplot(x=y_test, y=y_test_pred)

In [None]:
# Actual (x axis) against predicted (y axis) values on the test split. The data
# are assigned to match the axis labels; previously the predictions were drawn
# on the axis labelled 'Actual' and vice versa.
plt.scatter(x=y_test, y=y_test_pred)
# Orange identity line spanning both series: points on it are perfect predictions.
identity_range = [
    min(y_test.min(), y_test_pred.min()),
    max(y_test.max(), y_test_pred.max()),
]
plt.plot(identity_range, identity_range, color='orange')
plt.xlabel('Actual')
plt.ylabel('predicted')
plt.title('Actual VS Predicted Inquiries')
plt.show()

In [None]:
# Residuals against predictions on the test split, with dashed grey reference
# lines through zero on both axes.
plt.axhline(0, color=(.5, .5, .5), linestyle='--')
plt.axvline(0, color=(.5, .5, .5), linestyle='--')
plt.scatter(x=y_test_pred, y=y_residual)

In [None]:
# Distribution of the test-set residuals
sns.displot(data=y_residual)

In [None]:
# Run parameters recorded in MLflow alongside the metrics.
# Fixes over the previous version: the dict literal did not parse (a missing
# comma and two entries with stray extra values), and it described a Lasso
# model with 5 folds although the notebook fits LinearRegression and
# cross-validates with cv=10.
params = {
    "Model": "LinearRegression",
    "Folds this run": 10,
    "train_test_split": 30,  # test share in percent (test_size=0.30)
    "normalized data": "yes",
    "2019 and 2020 data": "2019",
    "metric": "rmse, r2",
    "cross_val_score": "yes",
}

In [None]:
# Record parameters, a tag and the evaluation metrics on the active run, then
# close it. (`lflow` was a typo for `mlflow` and raised NameError before
# anything was logged.)
mlflow.log_params(params)
# The tag names the estimator this notebook actually trains (LinearRegression).
mlflow.set_tag("running_from_jupyter", "LinearRegression model 2019")
mlflow.log_metric("train -" + "RMSE", rmse_train)
mlflow.log_metric("test -" + "RMSE", rmse_test)
# R2 is computed next to RMSE for both splits, so it is logged as well.
mlflow.log_metric("train -" + "R2", r2_train)
mlflow.log_metric("test -" + "R2", r2_test)
# mlflow.log_artifact("../models")
# mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
# Fetch the stored record of the run that was just logged, to inspect it
mlflow.get_run(run.info.run_id)

## Checking the experiments

In [None]:
# Start the MLflow UI through a shell escape. The command blocks, so while this
# cell is running you will not be able to run other cells in the notebook.
!mlflow ui