# Preprocessing CGE data

In [None]:
# Put the parent directory on the import path so that the custom `src`
# package imported further down in this cell can be resolved.
import os
import sys

# A single-argument os.path.join returns its argument, so this is the
# absolute path of '..' relative to the current working directory.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

import boto3
import io
import json
import pandas as pd
from src.preprocessing.data_preparation_and_cleaning import change_features_names, get_station_name
from src.preprocessing.feature_engineer import get_wind_components, resample_data

In [None]:
# AWS credentials to read files on S3 bucket.
# The JSON file is expected to provide the keys "Access key ID" and
# "Secret access key", which are looked up below.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open for the lifetime of the kernel.
with open('../credentials.json') as f:
    credentials = json.load(f)

# Low-level client, used to download object bodies.
s3_client = boto3.client(
    's3',
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

# Resource interface, used to list the objects under a key prefix.
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

In [None]:
# Download every object stored under raw/<year>/ in the "cge" bucket for the
# years 2009 through 2019 (inclusive) and parse each one as a CSV file.
# Each file becomes its own DataFrame; they are concatenated in a later cell.
df_full = []
for year in range(2009, 2020):
    yearly_objects = s3_resource.Bucket("cge").objects.filter(Prefix=f"raw/{year}/")
    # List all keys for the year first, then fetch the files one by one.
    for key in [item.key for item in yearly_objects]:
        response = s3_client.get_object(Bucket="cge", Key=key)
        raw_bytes = response["Body"].read()
        df_full.append(pd.read_csv(io.BytesIO(raw_bytes)))

In [None]:
# Columns kept from the raw files (original Portuguese headers).
selected_columns = [
    'Posto',
    'Posto Nome',
    'DATA',
    'PLU(mm)',
    'Vel.VT(m/s)',
    'Dir.VT(o)',
    'Temp(oC)',
    'Umid.Rel.(%)',
    'Pressão(mb)',
    'Rajada.VT(m/s)',
    'Sens. Térmica(°C)',
]

# Stations retained for the analysis.
target_stations = ['Sé', 'Lapa', 'Pirituba', 'Penha', 'Jabaquara', 'Parelheiros']

# Stack all downloaded files into one table and discard exact duplicate rows.
cge = pd.concat(df_full, ignore_index=True).drop_duplicates(ignore_index=True)

# Project helper; presumably adds the 'Posto Nome' column used below.
cge = get_station_name(cge)

# NOTE(review): this matches the literal string 'NaN', not missing values
# (np.nan) -- confirm that is the intended behaviour.
cge[['Posto Nome']] = cge[['Posto Nome']].replace('NaN', '1000300')

# Narrow to the columns of interest, then to the stations of interest.
cge = cge[selected_columns]
cge = cge[cge['Posto Nome'].isin(target_stations)]

# Project helper; the later cells rely on the names it produces
# (e.g. station_name, precipitation, wind_velocity).
cge = change_features_names(cge)

In [None]:
# Measurements that must all be present for a row to be kept.
required_measurements = [
    "precipitation",
    "wind_velocity",
    "wind_direction",
    "temperature",
    "relative_humidity",
    "pressure",
    "wind_blow",
]

# Keep only the "Lapa" station, drop rows with any required measurement
# missing, renumber the rows and discard the thermal_sensation column.
lapa = (
    cge[cge["station_name"] == "Lapa"]
    .dropna(subset=required_measurements)
    .reset_index(drop=True)
    .drop(columns=["thermal_sensation"])
)

lapa

In [None]:
# For each wind speed column, ask the project helper for its x/y components
# (named <column>_x and <column>_y) and then drop the original speed column.
# Both calls use the same wind_direction column.
for speed_column in ('wind_velocity', 'wind_blow'):
    lapa = get_wind_components(
        lapa,
        wind_velocity=speed_column,
        wind_direction='wind_direction',
        x_name=f'{speed_column}_x',
        y_name=f'{speed_column}_y',
    )
    lapa = lapa.drop(columns=[speed_column])

# The direction is no longer needed once both decompositions are done.
lapa = lapa.drop(columns=['wind_direction'])

# Resample with the project helper, then make "timestamp" the index.
lapa = resample_data(lapa).reset_index().set_index("timestamp")

lapa

In [None]:
# Input variables for the lag and window transformers defined later:
# four direct measurements followed by the x/y components of
# wind_velocity and wind_blow.
cols = [
    'precipitation',
    'temperature',
    'relative_humidity',
    'pressure',
    *(
        f'{wind_feature}_{axis}'
        for wind_feature in ('wind_velocity', 'wind_blow')
        for axis in ('x', 'y')
    ),
]

In [None]:
from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures, ExpandingWindowFeatures
from sklearn.pipeline import Pipeline

In [None]:
# Lag offsets "1H", "2H", ..., "36H".
lag_offsets = [f"{hours}H" for hours in range(1, 37)]

# Feature-engineering pipeline; the steps run in the order listed.
pipe = Pipeline(
    steps=[
        (
            # Calendar features extracted from the datetime index.
            "datetime_features",
            DatetimeFeatures(
                variables="index",
                features_to_extract=["month", "week", "day_of_month", "hour"],
            ),
        ),
        (
            # One lagged copy of every input variable per offset (1 h to 36 h).
            "lagf",
            LagFeatures(
                variables=cols,
                freq=lag_offsets,
                missing_values="ignore",
            ),
        ),
        (
            # Rolling mean/std/min/max over 3, 6, 12 and 24 hour windows,
            # shifted forward by one hour.
            "winf",
            WindowFeatures(
                variables=cols,
                window=["3H", "6H", "12H", "24H"],
                freq="1H",
                functions=["mean", "std", "min", "max"],
                missing_values="ignore",
            ),
        ),
        (
            # Cyclical transformation of month and hour; originals are kept.
            "Periodic",
            CyclicalFeatures(
                variables=["month", "hour"],
                drop_original=False,
            ),
        ),
    ]
)

lapa = pipe.fit_transform(lapa)

# Station identifiers are dropped from the final feature table.
lapa = lapa.drop(columns=["station", "station_name"])


In [None]:
# Preview the first rows of the transformed table.
lapa.head()