# Forecasting

In [1]:
import boto3
import io
import json
import pandas as pd
import numpy as np
from unidecode import unidecode
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.multioutput import RegressorChain
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import HalvingGridSearchCV

In [2]:
def make_steps(
    df : pd.DataFrame, 
    y : str, 
    steps : int
    ) -> pd.DataFrame:
    """
    Get target feature steps ahead.

    Parameters
    ----------
    df : pandas data frame
        Data frame with target feature.
    
    y : str
        Target feature name.
    
    steps : int
        Steps to forecast.

    Returns
    -------
    df : pandas data frame
        Data frame with target features.
    """

    for i in range(steps):
        df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
    return df


# models
rid = RegressorChain(
    base_estimator=RidgeCV(
        alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        cv=TimeSeriesSplit(n_splits=3, test_size=2000),
    ),
    random_state=123
)

gb = RegressorChain(
    base_estimator=HalvingGridSearchCV(
        estimator=HistGradientBoostingRegressor(random_state=123),
        param_grid = {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_iter': [100, 500, 1000],
            'max_depth': [3, 5, 10],
            'min_samples_leaf': [20, 50, 100],
            'l2_regularization': [0, 0.01, 0.1, 1]
        },
        cv=TimeSeriesSplit(n_splits=3, test_size=2000),
        aggressive_elimination=True,
        scoring="neg_mean_squared_error",
        random_state=123,
        n_jobs=-1
    ),
    random_state=123
)

mlp = RegressorChain(
    base_estimator=HalvingGridSearchCV(
        estimator=MLPRegressor(max_iter=5000, random_state=123),
        param_grid = {
            'hidden_layer_sizes': [(50,), (50, 50), (100,), (100, 100), (100, 75, 25)],
            'activation': ['relu', 'logistic', 'tanh', 'identity'],
            'solver': ['adam', 'sgd', 'lbfgs'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'learning_rate_init': [0.001, 0.01, 0.1]
        },
        cv=TimeSeriesSplit(n_splits=3, test_size=2000),
        aggressive_elimination=True,
        scoring="neg_mean_squared_error",
        random_state=123,
        n_jobs=-1
    ),
    random_state=123
)

models = {
    # "rid": rid,
    "gb": gb,
    # "mlp": mlp,
}

In [3]:
# AWS credentials to read files on S3 bucket
f = open('../credentials.json')
credentials = json.load(f)

s3_client = boto3.client(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

s3_resource = boto3.resource(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

In [4]:
prefix_objs = s3_resource.Bucket("cge").objects.filter(Prefix="processed")
keys = [obj.key for obj in prefix_objs]
keys

['processed/',
 'processed/anhembi.csv',
 'processed/butanta.csv',
 'processed/campo_limpo.csv',
 'processed/capela_do_socorro.csv',
 'processed/cge_clusters.csv',
 'processed/cidade_ademar.csv',
 'processed/cluster_A.csv',
 'processed/cluster_B.csv',
 'processed/cluster_C.csv',
 'processed/cluster_D.csv',
 'processed/freguesia_do_o.csv',
 'processed/ipiranga.csv',
 'processed/itaim_paulista.csv',
 'processed/itaquera.csv',
 'processed/jabaquara.csv',
 'processed/lapa.csv',
 "processed/m'boi_mirim.csv",
 'processed/maua.csv',
 'processed/mooca.csv',
 'processed/parelheiros.csv',
 'processed/penha.csv',
 'processed/perus.csv',
 'processed/pinheiros.csv',
 'processed/pirituba.csv',
 'processed/riacho_grande.csv',
 'processed/santana_do_parnaiba.csv',
 'processed/santo_amaro.csv',
 'processed/sao_mateus.csv',
 'processed/sao_miguel_paulista.csv',
 'processed/se.csv',
 'processed/tremembe.csv',
 'processed/tucuruvi.csv',
 'processed/vila_formosa.csv',
 'processed/vila_maria.csv',
 'process

In [5]:
# getting preprocessed data
# prefix_objs = s3_resource.Bucket("cge").objects.filter(Prefix="processed")
# keys = [obj.key for obj in prefix_objs]
keys = [
    'processed/cluster_A.csv',
    # 'processed/cluster_B.csv',
    # 'processed/cluster_C.csv',
    # 'processed/cluster_D.csv',
    # 'processed/parelheiros.csv'
]

for key in keys:
    obj = s3_client.get_object(Bucket="cge", Key=key)
    df = pd.read_csv(io.BytesIO(obj["Body"].read()))
    df = df.reset_index(drop=True)
    
    df = df.dropna(axis=1, how="all")
    df = df.dropna()
    
    cluster = df.cluster.unique()[0]
    ts = df[["timestamp"]]
    df = df.drop(["timestamp", "cluster", "Unnamed: 0"], axis=1)

    # target
    y = make_steps(df=df[["temperature"]], y="temperature", steps=6).drop("temperature", axis=1)
    y = y.dropna()

    # predictors
    X = df.drop(["temperature"], axis=1)
    X = X.loc[y.index.min():y.index.max()]

    # scaling data
    scaler = StandardScaler()
    scaler.fit(X)
    X_standard = scaler.transform(X)

    # train and test data split
    test_size = 0.30
    X_train_ref, X_test_ref, _, _ = train_test_split(X, y, test_size=0.30, shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=test_size, shuffle=False)

    # model training
    for model in models:
        models[model].fit(X_train, y_train)
        y_pred = pd.DataFrame(models[model].predict(X_test), index=X_test_ref.index, columns=y.columns)
        ts = ts.loc[y_pred.index.min():y_pred.index.max()]
        y_pred["timestamp"] = ts.timestamp 
        y_pred["cluster"] = cluster

        # writing predictions to S3 bucket
        cluster_ = unidecode(key.lower().replace(" ", "_").replace("processed/", "").replace(".csv", ""))
        file_name = cluster_ + "_" + model
        buffer = io.StringIO()
        y_pred.to_csv(buffer)
        s3_resource.Object("cge", f"output_teste_2/{file_name}.csv").put(Body=buffer.getvalue())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

KeyboardInterrupt: 

In [None]:
# # getting preprocessed data
# # prefix_objs = s3_resource.Bucket("cge").objects.filter(Prefix="processed")
# # keys = [obj.key for obj in prefix_objs]
# keys = [
#     'processed/cluster_A.csv',
#     'processed/cluster_B.csv',
#     'processed/cluster_C.csv',
#     'processed/cluster_D.csv'
# ]

# for key in keys:
#     obj = s3_client.get_object(Bucket="cge", Key=key)
#     df = pd.read_csv(io.BytesIO(obj["Body"].read()))
#     df = df.reset_index(drop=True)
    
#     df = df.dropna(axis=1, how="all")
#     df = df.dropna()
    
#     cluster = df.cluster.unique()[0]
#     ts = df[["timestamp"]]
#     df = df.drop(["timestamp", "cluster", "Unnamed: 0"], axis=1)

#     # target
#     y = make_steps(df=df[["temperature"]], y="temperature", steps=6).drop("temperature", axis=1)
#     y = y.dropna()

#     # predictors
#     X = df.drop(["temperature"], axis=1)
#     X = X.loc[y.index.min():y.index.max()]

#     # scaling data
#     scaler = StandardScaler()
#     scaler.fit(X)
#     X_standard = scaler.transform(X)

#     # train and test data split
#     test_size = 0.30
#     X_train_ref, X_test_ref, _, _ = train_test_split(X, y, test_size=0.30, shuffle=False)
#     X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=test_size, shuffle=False)

#     # model training
#     for model in models:
#         models[model].fit(X_train, y_train)
#         y_pred = pd.DataFrame(models[model].predict(X_test), index=X_test_ref.index, columns=y.columns)
#         ts = ts.loc[y_pred.index.min():y_pred.index.max()]
#         y_pred["timestamp"] = ts.timestamp 
#         y_pred["cluster"] = cluster

#         # writing predictions to S3 bucket
#         cluster_ = unidecode(key.lower().replace(" ", "_").replace("processed/", "").replace(".csv", ""))
#         file_name = cluster_ + "_" + model
#         buffer = io.StringIO()
#         y_pred.to_csv(buffer)
#         s3_resource.Object("cge", f"output/{file_name}.csv").put(Body=buffer.getvalue())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y}_step_{i+1}"] = df[y].shift(-i+1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l