# Import

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os
import time
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re
import random

from tqdm.notebook import tqdm

import dask.dataframe as dd
from dask.distributed import LocalCluster, Client

sys.path.insert(0, 'tools/')

from tools import * 

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn import neighbors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from keras.callbacks_v1 import TensorBoard
from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input

# Read files

In [None]:
%%time

# Training split: the stored 'index' column is read as the index and then
# replaced by a fresh 0..n-1 RangeIndex.
X_train = (
    pd.read_csv('dades/processed/training_data.csv', index_col='index')
    .reset_index(drop=True)
)
# Independent copy of the target column; ctx0 also remains inside X_train.
y_train = X_train['ctx0'].copy()

# Validation split, loaded exactly the same way.
X_val = (
    pd.read_csv('dades/processed/validation_data.csv', index_col='index')
    .reset_index(drop=True)
)
y_val = X_val['ctx0'].copy()

# X_test = pd.read_csv('dades/processed/testing_data.csv', index_col='index')
# X_test = X_test.reset_index(drop=True)
# y_test = X_test.ctx0.copy()


In [None]:
# Discretise the continuous target into integer class labels: multiply by 100,
# divide by 5 and round, i.e. one class per 0.05 step of the raw value.
# (Arithmetic on a Series already returns a new object, so no explicit copy.)
cat_y_train = (y_train * 100 / 5).round().astype(int)
cat_y_val = (y_val * 100 / 5).round().astype(int)
# cat_y_test = ((y_test.copy()*100)/5).round().astype(int)

In [None]:
# Distinct years present in the training frame (displayed as the cell output).
years = X_train.year.unique().tolist()
years

In [None]:
# Distinct months present in the training frame; this list is filtered further
# below before it is used to select rows.
months = X_train.month.unique().tolist()
months

In [None]:
# Class labels that actually occur in the discretised training target.
cat_y_train.unique()

# Run pipeline

In [None]:
# List the columns available in the training frame before choosing features.
X_train.columns

In [None]:
%%time 

# Absolute Pearson correlation of every candidate feature with the target
# (ctx0), computed on the 2022 rows only and sorted from strongest to weakest.
corr_columns = [
    'station_id', 'year', 'month', 'dayofweek',
    'day', 'dayofyear', 'hour', 'capacity',
    'ctx0', 'ctx1', 'ctx2', 'ctx3', 'ctx4',
    'festius', 'festius_sun', 'festius_sun_sat',
    'weekend',
]
rows_2022 = X_train.year.isin([2022])
corr_with_target = X_train.loc[rows_2022, corr_columns].corr(method='pearson')['ctx0']
correlations = corr_with_target.abs().sort_values(ascending=False)

# Values noted down in an earlier run for columns that are not part of the
# selection above (presumably their correlation with ctx0 -- TODO confirm):
# num_docks_available                     0.880782
# timestamp                               0.024289
# num_bikes_available_types.ebike        -0.393887
# num_bikes_available_types.mechanical   -0.755995
# num_bikes_available                    -0.865541
# is_returning                            0.006442
# is_renting                              0.006442
# status                                  0.006277

correlations.to_dict()

In [None]:
# Weather columns that are appended to the numeric feature group.
columns_meteo = [
    'VALOR_TN_X4',
    'VALOR_TM_X4',
    'VALOR_TX_X4',
    'VALOR_PPT_X4',
]


class Config:
    """Feature groups and training hyper-parameters shared across the notebook."""

    # Numeric features: station capacity, the ctx1..ctx4 columns and the weather columns.
    num_attribs0 = ['capacity', 'ctx1', 'ctx2', 'ctx3', 'ctx4', *columns_meteo]
    # Calendar features handled as categorical.
    cat_attribs0 = ['dayofyear', 'hour', 'month', 'dayofweek']
    # A station-id group (cat_attribs1 = [station_id]) is currently disabled.
    # Features passed through with imputation only.
    gen_attribs0 = ['festius_sun', 'weekend']
    # Name of the target column.
    target_col = ['ctx0']

    # Training hyper-parameters.
    epochs = 50
    batch_size = 128
    seed = 42
    lr = 1e-4

def seed_everything(seed=42):
    """Seed every random number generator used here and request deterministic
    TensorFlow execution.

    Args:
        seed: Integer seed applied to ``random``, NumPy and TensorFlow, and
            exported through the ``PYTHONHASHSEED`` environment variable.

    The function is safe to call more than once: if TensorFlow has already been
    initialised and refuses to change its thread pools, a warning is emitted
    instead of aborting the cell.
    """
    import warnings

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not affected by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    # Single-threaded op execution removes one source of run-to-run variation.
    # TensorFlow only accepts these settings before its runtime is initialised
    # and raises RuntimeError afterwards (e.g. when this cell is re-run).
    thread_setters = (
        tf.config.threading.set_inter_op_parallelism_threads,
        tf.config.threading.set_intra_op_parallelism_threads,
    )
    for set_threads in thread_setters:
        try:
            set_threads(1)
        except RuntimeError as err:
            warnings.warn(
                "TensorFlow is already initialised; thread settings left "
                "unchanged ({})".format(err)
            )
    

In [None]:
# Instantiate the shared configuration and seed the RNGs with its seed value.
config=Config()
seed_everything(config.seed)

In [None]:

def build_preprocessor(config):
    """Assemble the (unfitted) ColumnTransformer that prepares the model inputs.

    Args:
        config: Object exposing the column-name lists ``num_attribs0``,
            ``cat_attribs0`` and ``gen_attribs0``.

    Returns:
        A ColumnTransformer with three branches, in this order: numeric
        ("num0"), generic ("gen1") and categorical ("cat0"). Columns that are
        not listed in any branch are dropped.
    """
    # Numeric branch: fill gaps with the column mean, then standardise.
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="mean")),
        ('std_scaler', StandardScaler()),
    ]

    # Categorical branch: fill gaps with 0, then replace each category by an
    # ordinal code; categories not seen while fitting are encoded as NaN.
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        (
            'ordinal_encoder',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
        ),
    ]

    # Generic branch: only fill gaps with 0, values are otherwise untouched.
    generic_steps = [
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ]

    # A one-hot branch for a second categorical group (cat_attribs1) is
    # currently disabled and therefore not registered below.
    return ColumnTransformer(
        transformers=[
            ("num0", Pipeline(steps=numeric_steps), config.num_attribs0),
            ("gen1", Pipeline(generic_steps), config.gen_attribs0),
            ("cat0", Pipeline(steps=categorical_steps), config.cat_attribs0),
        ],
        remainder="drop",
    )

In [None]:
%%time 

# Build the unfitted preprocessing ColumnTransformer from the shared config.
full_pipeline = build_preprocessor(config)

In [None]:
%%time 

# Fit the imputers, scaler and encoder on the whole training frame, i.e. before
# the year/month filtering that is applied further below.
full_pipeline.fit(X_train)

In [None]:
def apply_pipeline(pipline, X, y, args=None, show=True):
    """Transform ``X`` with an already-fitted pipeline and return it with ``y``.

    Args:
        pipline: Fitted transformer exposing ``transform`` (the parameter name
            is kept as-is so existing keyword callers keep working).
        X: Feature table to transform; must expose ``shape``.
        y: Targets aligned row-by-row with ``X``; returned unchanged.
        args: Optional iterable of extra values appended to the returned tuple.
        show: When true, print the shapes of ``X``, the transformed ``X`` and ``y``.

    Returns:
        ``(X_prepared, y)``, or ``(X_prepared, y, *args)`` when ``args`` is
        non-empty.

    Raises:
        AssertionError: If ``X`` and ``y`` do not have the same number of rows.
    """
    assert X.shape[0] == y.shape[0], "X and y must have the same number of rows"

    # Use the pipeline handed in by the caller rather than a notebook global,
    # so the function works with any fitted transformer.
    X_prepared = pipline.transform(X)

    if show:
        print("X", X.shape,
              "X_prepared:", X_prepared.shape,
              "y: ", y.shape
             )

    if args:
        return (X_prepared, y, *args)
    return X_prepared, y

In [None]:
def prepare_splits(
    pipeline,
    data_train,
    data_val,
    years,
    months,
    show=True
):
    """Filter the train and validation data by year/month and transform them.

    Args:
        pipeline: Fitted preprocessing pipeline forwarded to ``apply_pipeline``.
        data_train: ``(features, target)`` pair for the training split; the
            features must expose ``year`` and ``month`` columns.
        data_val: ``(features, target)`` pair for the validation split.
        years: Year values to keep.
        months: Month values to keep.
        show: When true, print the split name and let ``apply_pipeline`` print
            the shapes.

    Returns:
        ``(Xtr, ytr, Xva, yva)``: transformed features and filtered targets for
        the training and validation splits.
    """
    def _filter_and_transform(label, data):
        # Announce the split, keep only the requested years/months, transform.
        if show:
            print(label)
        features, target = data[0], data[1]
        keep = (features.year.isin(years) & features.month.isin(months))
        return apply_pipeline(
            pipeline,
            features[keep],
            target[keep],
            show=show
        )

    Xtr, ytr = _filter_and_transform("train", data_train)
    Xva, yva = _filter_and_transform("val", data_val)

    # A third (test) split used to be handled here as well; it is currently
    # disabled, so only the train and validation results are returned.
    return Xtr, ytr, Xva, yva

In [None]:
# Drop months 6-9 from the month filter. Building a new list (instead of calling
# list.remove for its side effect) keeps the cell re-runnable and does not fail
# when one of these months is absent from the data.
excluded_months = {6, 7, 8, 9}
months = [m for m in months if m not in excluded_months]

In [None]:
# Show the year and month lists after the month exclusion above.
years, months

In [None]:
# Keep only the 2022 rows (the year is hard-coded here; the `years` list is not
# used) restricted to the remaining months, and transform them.
# NOTE(review): this rebinds X_train / X_val / cat_y_train / cat_y_val to the
# filtered, transformed data. The raw frames are no longer reachable under these
# names afterwards, so re-running this cell presumably fails because the
# transformed matrix has no `year` / `month` columns -- confirm.

X_train, cat_y_train, X_val, cat_y_val = prepare_splits(
    full_pipeline,
    (X_train, cat_y_train),
    (X_val, cat_y_val),
#     (X_test, cat_y_test),
    [2022], months, True)

# X_test, cat_y_test

# Needed for NN

In [None]:
# %%time 

# X_test = X_test.toarray()


In [None]:
import gc

# Force a garbage-collection pass; presumably meant to release the raw frames
# that were rebound by the split cell before the model is trained.
gc.collect()

# Predicción______________________________

a) Regresión lineal: relación lineal entre las variables de entrada y la variable de salida. 

b) Red neuronal densa (feed-forward, no recurrente): puede capturar relaciones no lineales entre las variables de entrada y salida. En este notebook se entrena como clasificador softmax sobre la variable objetivo discretizada en clases.

In [None]:
# Report the Keras and TensorFlow versions and the GPU device name TensorFlow reports.
print(keras.__version__)
print(tf.__version__)
print(tf.test.gpu_device_name())

In [None]:
# List every local device visible to TensorFlow.
# NOTE(review): tensorflow.python.* is not part of the public TensorFlow API.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
# Reset Keras' global state and re-seed NumPy and TensorFlow right before the
# model is built. The seed is taken from the shared config so that it is
# defined in a single place instead of being repeated as a literal.
keras.backend.clear_session()
np.random.seed(config.seed)
tf.random.set_seed(config.seed)

In [None]:
import concurrent.futures
import logging
import sys
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
from keras import backend as K
# Create a TF1-style (compat.v1) session limited to 3 intra-op and 3 inter-op
# threads and register it as the session of the compat.v1 Keras backend.
# NOTE(review): seed_everything() requests 1 thread for each pool through
# tf.config.threading; confirm which of the two settings is meant to apply.
session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=3,
    inter_op_parallelism_threads=3
)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
# Disabled alternative using the `K` backend alias: K.set_session(sess)
tf.compat.v1.keras.backend.set_session(sess)

In [None]:
from keras.callbacks_v1 import TensorBoard
from keras.callbacks import ReduceLROnPlateau

In [None]:
# Number of input features: the column count of X_train, which at this point
# holds the output of the preprocessing pipeline.
input_shape = X_train.shape[1]
print(input_shape)

In [None]:
# Sorted class labels present in the training target; used to size the
# network's output layer.
classes = sorted(cat_y_train.unique())
classes

In [None]:
class ExponentialLearningRate(keras.callbacks.Callback):
    """Multiply the optimizer's learning rate by a constant factor after every
    batch while recording the learning rate and loss seen at each step.

    Attributes:
        factor: Multiplier applied to the learning rate after each batch.
        rates: Learning rate in effect for each processed batch.
        losses: Loss reported in ``logs`` for each processed batch.
    """

    def __init__(self, factor):
        # Let the base Callback set up its own state before adding ours.
        super().__init__()
        self.factor = factor
        self.rates = []
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        """Record the current learning rate and loss, then scale the rate.

        Args:
            batch: Index of the batch that just finished (unused).
            logs: Metric dictionary supplied by Keras; may be ``None``.
        """
        logs = logs or {}
        # Read the rate once as a concrete value so that a plain number, not a
        # tensor expression, is written back to the optimizer.
        current_rate = K.get_value(self.model.optimizer.learning_rate)
        self.rates.append(current_rate)
        self.losses.append(logs.get("loss"))
        K.set_value(self.model.optimizer.learning_rate, current_rate * self.factor)

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predicted values, broadcastable against ``y_true``.

    Returns:
        A scalar tensor: ``sqrt(mean((y_pred - y_true) ** 2))`` computed in float32.
    """
    # Pass the dtype by name: `K.np` is not a public attribute of the Keras
    # backend and is only present when that module happens to import NumPy.
    y = K.cast(y_true, "float32")
    y_hat = K.cast(y_pred, "float32")
    return K.sqrt(K.mean(K.square(y_hat - y)))

In [None]:
# Hidden-layer widths are derived from the number of input features. Floor
# division by a float returns a float (e.g. 13 // 0.15 == 86.0), while Dense
# expects an integer unit count, so the result is cast explicitly.
hidden_units = [int(input_shape // ratio) for ratio in (0.15, 0.20, 0.50)]

# With sparse integer labels every label must be a valid output index, so the
# output layer is sized from the largest label. This equals len(classes) when
# the labels are contiguous from 0 and stays valid when a value is missing.
n_outputs = int(max(classes)) + 1

# Fully connected classifier: three ReLU hidden layers (dropout after the
# first one) followed by a softmax over the target classes.
model = keras.models.Sequential([
    keras.layers.Input(
        shape=(input_shape,)
    ),
    keras.layers.Dense(hidden_units[0], activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(hidden_units[1], activation='relu'),
    keras.layers.Dense(hidden_units[2], activation='relu'),
    keras.layers.Dense(n_outputs, activation='softmax')
])


In [None]:
# The network's last layer already applies softmax, so the loss must treat the
# model output as probabilities (from_logits=False). Declaring it as logits
# would make the loss apply a second softmax on top of the first.
# NOTE(review): SGD() uses its default learning rate; Config.lr is not used
# here -- confirm whether that is intended.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=keras.optimizers.SGD(),
    metrics=["accuracy"]
)


In [None]:
# All callbacks below watch the same validation metric.
# (A TensorBoard callback writing to "Model_log" is currently disabled.)
monitored_metric = 'val_accuracy'

# Save the model to disk whenever the monitored metric improves.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="./checkpoints/softmax.h5",
    monitor=monitored_metric,
    save_best_only=True,
    verbose=2
)

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor=monitored_metric,
    patience=10,
    restore_best_weights=True,
    verbose=2
)

# Learning-rate annealing: when the validation metric has not improved for
# 5 epochs, multiply the learning rate by 0.8, never going below 1e-7. This
# keeps a larger rate early in training and reduces it once progress stalls.
learning_rate_reduction = ReduceLROnPlateau(
    monitor=monitored_metric,
    patience=5,
    factor=0.8,
    min_lr=0.0000001,
    verbose=2
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:

# Render the model graph, including tensor shapes, to softmax.png.
keras.utils.plot_model(model, "softmax.png", show_shapes=True)


In [None]:
# Train on the prepared training matrix and evaluate on the validation split
# after every epoch; checkpointing, early stopping and learning-rate reduction
# are driven by the callbacks defined earlier.
training_callbacks = [checkpoint_cb, early_stopping_cb, learning_rate_reduction]

history = model.fit(
    x=X_train,
    y=cat_y_train,
    validation_data=(X_val, cat_y_val),
    epochs=config.epochs,
    batch_size=config.batch_size,
    callbacks=training_callbacks
)


## Generate Sample

In [None]:
# Load the Kaggle sample rows for which predictions are generated below.
sample_data = pd.read_csv('dades/processed/kaggle_sample_data.csv')

In [None]:
# Display the loaded frame.
sample_data

# apply pipeline

# Prediction 

In [None]:
# Transform the Kaggle sample rows with the pipeline fitted on the training frame.
X_test_prepared = full_pipeline.transform(sample_data)

# Shape summary. X_train / X_val hold the transformed matrices at this point
# (they were rebound by the prepare_splits cell) and are paired with the
# discretised targets; only names that exist in this notebook are referenced.
print("x_train_prepared:", X_train.shape, "y_train: ", cat_y_train.shape)
print("x_val_prepared:", X_val.shape, "y_val: ", cat_y_val.shape)
print("x_test_prepared:", X_test_prepared.shape)

In [None]:
# Predict with the network trained above (no `lin_reg` estimator is defined in
# this notebook). The model returns one probability per class; take the most
# likely class and map it back to the original target scale, i.e. the inverse
# of the `* 100 / 5` binning used to build the class labels.
class_probabilities = model.predict(X_test_prepared)
yhat = class_probabilities.argmax(axis=1) * 5 / 100


In [None]:
# Check the shape of the predictions before attaching them to the sample frame.
yhat.shape

In [None]:
# Store the predictions as a new column of the sample frame.
sample_data['percentage_docks_available'] = yhat

In [None]:
# Write only the prediction column, with the frame index saved under the 'index' header.
# NOTE(review): the file name contains a typo ("predicton") and mentions
# RandomForest although this notebook trains a softmax network -- confirm the
# expected name before renaming, since downstream steps may rely on it.
sample_data['percentage_docks_available'].to_csv('predicton_RandomForest.csv', header=True, index_label='index')