# This notebook serves to create a model using polynomial regression for our TFW data

In [None]:
import os
import sys
import warnings

# Make the parent folder importable (local to this process, system variables are untouched).
# This must run before the `modeling` import below.
sys.path.append("..")

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
# from pandas_profiling import ProfileReport
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from modeling.config import EXPERIMENT_NAME

# Read the MLflow tracking URI; the context manager closes the file handle right away
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# `os` was used here without being imported, which raised a NameError
ROOT = os.environ.get('PWD')

warnings.filterwarnings('ignore')

# Data cleaning and feature engineering have already been done for the baseline model; the resulting data set is used as-is for the polynomial model

In [None]:
# Load the CSV holding the time-normed, filtered set with aggregated price data
# for the years in question

master_dummies = pd.read_csv(filepath_or_buffer='../data/master_with_dummies.csv')

In [None]:
# Separate the target (Y) from the predictors (X) ahead of the train/test split;
# the listing id is dropped from the predictors as well

Y = master_dummies['inquiry_count']
X = master_dummies.drop(columns=['listing_id', 'inquiry_count'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Columns that get polynomial expansion and scaling further down
the_list = [
    'mean_price_per_day',
    'length_stay',
    'adult_count',
    'children_count',
    'pets_count',
    'bathrooms',
    'bedrooms',
    'max_guests',
    'living_area',
    'lat',
    'lng',
]

In [None]:
# Creating the polynomial features for a given degree.

poly_features = PolynomialFeatures(degree=2)

# Expand the columns in `the_list` to higher-degree features.
# Fit on the training split only; the test split is passed through the
# already-fitted transformer (transform, not fit_transform) so nothing is
# ever learned from test data.
X_train_poly = poly_features.fit_transform(X_train[the_list])
X_test_poly = poly_features.transform(X_test[the_list])

In [None]:
# Min-max normalisation of the polynomial features
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training features only and then applied to both splits
scaler_norm = MinMaxScaler()
scaler_norm.fit(X_train_poly)
X_train_scaled_norm = scaler_norm.transform(X_train_poly)
X_test_scaled_norm = scaler_norm.transform(X_test_poly)

In [None]:
# Put the scaled polynomial features side by side with the remaining columns
# (everything that is not in `the_list`)
X_train_preprocessed_norm = np.concatenate(
    (X_train_scaled_norm, X_train.drop(columns=the_list)),
    axis=1,
)
X_test_preprocessed_norm = np.concatenate(
    (X_test_scaled_norm, X_test.drop(columns=the_list)),
    axis=1,
)

In [None]:
# Sanity check: print the shape of each feature matrix and target vector
for dataset in (X_train_preprocessed_norm, X_test_preprocessed_norm, y_train, y_test):
    print(dataset.shape)

## Training the model and tracking with MLflow

In [None]:
# Point MLflow at the tracking server, select the experiment and open a run;
# start_run() hands back the run that is now active
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
run = mlflow.start_run()

In [None]:
# Show the id of the run opened above
print(f"Active run_id: {run.info.run_id}")

In [None]:
# Ordinary least squares on the preprocessed (polynomial + scaled + remaining) training features
poly_model = LinearRegression()
poly_model.fit(X=X_train_preprocessed_norm, y=y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# R-squared of the fitted model on the training data
score = poly_model.score(X_train_preprocessed_norm, y_train)

# Predictions for both splits
y_pred_train = poly_model.predict(X_train_preprocessed_norm)
y_pred_test = poly_model.predict(X_test_preprocessed_norm)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and has been removed from recent
# scikit-learn releases, while this form works on every version.
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_pred_train)))
rmse_test = float(np.sqrt(mean_squared_error(y_test, y_pred_test)))
r2_test = r2_score(y_test, y_pred_test)

print("R-squared:", score)
print("RMSE train:", rmse_train)
print("RMSE test:", rmse_test)
print("R-squard test:", r2_test)

### Residual plots without optimization: Degree = 2

In [None]:
import seaborn as sns
# NOTE(review): per the seaborn documentation, residplot regresses y on x and plots the
# residuals of that fit, so this is presumably not the plain y_test - y_pred_test model
# residual -- confirm this is the intended diagnostic.
sns.residplot(x=y_test, y=y_pred_test)

In [None]:
# Predicted vs. actual inquiry counts on the test split; the orange line is the
# identity line drawn over the range of the predictions
plt.scatter(y_test, y_pred_test)
plt.plot(y_pred_test, y_pred_test, color='orange')
plt.title('Actual VS Predicted Inquiries')
plt.xlabel('Actual')
plt.ylabel('predicted')
plt.show()

In [None]:
# Put the test features and the true target side by side in one frame (joined on the index)
test = X_test.join(pd.DataFrame(y_test))

In [None]:
# Wrap the test predictions in a single-column frame so they can be joined later
y = pd.DataFrame({'pred_inquiry_count': y_pred_test})

In [None]:
# Give both frames a fresh 0..n-1 index (old index discarded) so they line up row by row
test = test.reset_index(drop=True)
y = y.reset_index(drop=True)

In [None]:
# Attach the prediction column to the test frame; join matches rows on the index
test = test.join(y)

In [None]:
# Keep only the test rows whose predicted inquiry count is zero or negative
test_neg = test.query('pred_inquiry_count <= 0')

In [None]:
# set global defaults to be able to see all columns (and all rows) of displayed frames

# `display.max_columns` is the option that stops pandas from eliding columns;
# the original only lifted the row limit, which is kept here as well.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Summary statistics of the rows with non-positive predictions, rounded to 3 decimals
test_neg.describe().round(3)

In [None]:
# Dimensions of the test feature frame
X_test.shape

In [None]:
# Print the smallest value found in every test feature column
for column, values in X_test.items():
    print(column, values.min())

### Residual plots without optimization: degree = 3

In [None]:
# NOTE(review): y_pred_test is not recomputed anywhere between the earlier plots and this
# cell, and the only PolynomialFeatures in this file uses degree=2 -- confirm the model was
# actually refit with degree=3 before reading this plot as a degree-3 result.
# NOTE(review): per the seaborn documentation, residplot regresses y on x and plots the
# residuals of that fit -- confirm this is the intended diagnostic.
sns.residplot(x=y_test, y=y_pred_test)

In [None]:
# Predicted vs. actual inquiry counts on the test split.
# The actual values go on the x axis and the predictions on the y axis so the
# data matches the axis labels below (the original call had x and y swapped).
plt.scatter(x=y_test, y=y_pred_test)
# Orange identity line over the range of the predictions
plt.plot(y_pred_test, y_pred_test, color='orange')
plt.xlabel('Actual')
plt.ylabel('predicted')
plt.title('Actual VS Predicted Inquiries')
plt.show()

In [None]:
# Run description logged to MLflow.
# The polynomial degree is read from the fitted transformer instead of being
# hard-coded, so the logged value cannot drift from the model that was
# actually trained (the original logged "degree=3" next to a degree-2 model).
params = {
    "model": "Polynomial regression",
    "Polynomial params": f"degree={poly_features.degree}",
    "year": '2019',
    "train_test_split": 30,
    "normalized data": 'yes',
    "metric": 'rmse and R2',
}

In [None]:
# Log parameters, tag and metrics, then close the run.
# The run is closed even when logging fails (marked FAILED in that case), so a
# broken cell does not leave a dangling active run behind.
try:
    mlflow.log_params(params)
    mlflow.set_tag("running_from_jupyter", "Polynomial model 2019")
    mlflow.log_metric("train -" + "RMSE", rmse_train)
    mlflow.log_metric("test -" + "RMSE", rmse_test)
    # params declares 'rmse and R2' as metrics, so the test R2 is logged as well
    mlflow.log_metric("test -" + "R2", r2_test)
    # mlflow.log_artifact("../models")
    # mlflow.sklearn.log_model(reg, "model")
except Exception:
    mlflow.end_run(status="FAILED")
    raise
else:
    mlflow.end_run()

In [None]:
# Fetch the run by the id of the `run` object created earlier in this notebook
mlflow.get_run(run_id=run.info.run_id)