# Preprocessing CGE data

In [1]:
# Put the parent of the current working directory on sys.path so that the
# project-local `src` package imported below can be resolved.
import os
import sys

module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import boto3
import io
import json
import pandas as pd
import numpy as np
from src.preprocessing.data_preparation_and_cleaning import change_features_names, get_station_name
from src.preprocessing.feature_engineer import get_wind_components, resample_data
from statsmodels.tsa.seasonal import STL

In [2]:
# # AWS credentials to read files on S3 bucket
# f = open('../credentials.json')
# credentials = json.load(f)

# s3_client = boto3.client(
#     's3',
#     aws_access_key_id=credentials["Access key ID"],
#     aws_secret_access_key=credentials["Secret access key"]
#     )

# s3_resource = boto3.resource(
#     's3',
#     aws_access_key_id=credentials["Access key ID"],
#     aws_secret_access_key=credentials["Secret access key"]
#     )

In [3]:
# df_full = []
# for year in list(range(2009, 2019+1)):  
#     prefix=f"raw/{year}/"
#     prefix_objs = s3_resource.Bucket("cge").objects.filter(Prefix=prefix)
#     keys = [obj.key for obj in prefix_objs]
#     for key in keys:
#         obj = s3_client.get_object(Bucket="cge", Key=key)
#         df = pd.read_csv(io.BytesIO(obj["Body"].read()))
#         df_full.append(df)

In [4]:
# cge = pd.concat(df_full, ignore_index=True)
# cge = cge.drop_duplicates(ignore_index=True)
# cge = get_station_name(cge)
# cge[['Posto Nome']] = cge[['Posto Nome']].replace('NaN', '1000300')

# cge = cge[[
#     'Posto',
#     'Posto Nome',
#     'DATA',
#     'PLU(mm)',
#     'Vel.VT(m/s)',
#     'Dir.VT(o)',
#     'Temp(oC)',
#     'Umid.Rel.(%)',
#     'Pressão(mb)',
#     'Rajada.VT(m/s)',
#     'Sens. Térmica(°C)'
#     ]]

# cge = cge[cge['Posto Nome'].isin(['Sé', 'Lapa', 'Pirituba', 'Penha', 'Jabaquara', 'Parelheiros'])]
# cge = change_features_names(cge)

In [5]:
# lapa = cge[cge.station_name == "Lapa"] \
#     .dropna(subset=["precipitation", "wind_velocity", "wind_direction", "temperature", "relative_humidity", "pressure", "wind_blow"]) \
#     .reset_index(drop=True) \
#     .drop(["thermal_sensation"], axis=1)

# lapa.to_csv("../data/raw/lapa.csv")

In [6]:
# rural = cge[cge.station_name == "Parelheiros"] \
#     .dropna(subset=["precipitation", "wind_velocity", "wind_direction", "temperature", "relative_humidity", "pressure", "wind_blow"]) \
#     .reset_index(drop=True) \
#     .drop(["thermal_sensation"], axis=1)

# rural = rural.set_index("timestamp")
# rural = rural.drop(["station", "station_name"], axis=1)
# rural.columns = [f"{i}_rural" for i in list(rural.columns)]
# rural = rural.reset_index()

# rural.to_csv("../data/raw/rural.csv")

In [7]:
# Load the cached per-station extracts. "Unnamed: 0" is presumably the row
# index written by the earlier to_csv export, so it is discarded.
lapa = pd.read_csv("../data/raw/lapa.csv").drop(columns=["Unnamed: 0"])
rural = pd.read_csv("../data/raw/rural.csv").drop(columns=["Unnamed: 0"])

In [8]:
# Join the Lapa readings with the rural readings on their shared timestamp
# (default inner join), then derive `uhi` as the temperature difference
# Lapa minus rural (presumably the urban heat island intensity).
lapa = pd.merge(lapa, rural, on="timestamp")
lapa["uhi"] = lapa["temperature"] - lapa["temperature_rural"]
lapa

Unnamed: 0,station,station_name,timestamp,precipitation,wind_velocity,wind_direction,temperature,relative_humidity,pressure,wind_blow,precipitation_rural,wind_velocity_rural,wind_direction_rural,temperature_rural,relative_humidity_rural,pressure_rural,wind_blow_rural,uhi
0,1000848,Lapa,2016-01-21 14:10:00,0.0,5.640,166.041,24.054,62.180,934.53,6.580,0.25,2.444,203.645,23.349,64.585,926.27,6.768,0.705
1,1000848,Lapa,2016-01-21 14:20:00,0.6,2.068,129.338,24.034,62.821,934.42,6.580,0.25,2.632,207.687,23.067,63.846,926.30,6.768,0.967
2,1000848,Lapa,2016-01-21 14:30:00,0.6,1.880,100.178,23.983,59.360,934.36,5.264,0.25,4.512,228.272,22.724,65.348,926.29,6.768,1.259
3,1000848,Lapa,2016-01-21 14:40:00,0.6,2.444,168.234,23.822,60.172,934.72,7.332,0.25,4.136,230.220,22.483,65.647,926.38,6.956,1.339
4,1000848,Lapa,2016-01-21 14:50:00,0.6,2.256,161.165,23.903,59.659,935.08,7.520,0.25,2.820,227.683,22.805,64.908,926.08,5.640,1.098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154890,1000848,Lapa,2019-02-18 22:00:00,0.0,0.000,182.327,24.890,80.888,931.12,3.384,0.00,0.940,219.600,20.942,95.568,923.08,1.128,3.948
154891,1000848,Lapa,2019-02-18 22:10:00,0.0,0.000,159.136,24.527,82.646,931.63,2.068,0.00,0.188,187.969,20.871,95.776,923.25,1.504,3.656
154892,1000848,Lapa,2019-02-18 22:20:00,0.0,0.000,157.964,24.376,83.934,931.18,0.000,0.00,0.000,188.378,20.660,95.971,922.98,0.752,3.716
154893,1000848,Lapa,2019-02-18 22:30:00,0.0,0.000,135.862,24.225,84.697,931.76,2.820,0.00,0.940,187.805,20.609,96.075,922.77,0.940,3.616


In [9]:
# (magnitude column, direction column, x output, y output) for the mean wind
# speed at the Lapa station and at the rural station.
speed_components = [
    ('wind_velocity', 'wind_direction', 'wind_velocity_x', 'wind_velocity_y'),
    ('wind_velocity_rural', 'wind_direction_rural', 'wind_velocity_x_rural', 'wind_velocity_y_rural'),
]
for magnitude_col, direction_col, x_col, y_col in speed_components:
    lapa = get_wind_components(
        lapa,
        wind_velocity=magnitude_col,
        wind_direction=direction_col,
        x_name=x_col,
        y_name=y_col,
    )

# The scalar speeds are no longer needed once their components exist.
lapa = lapa.drop(columns=['wind_velocity', "wind_velocity_rural"])

# Same decomposition for `wind_blow` (presumably the gust speed), reusing the
# direction columns, which is why they are only dropped afterwards.
blow_components = [
    ('wind_blow', 'wind_direction', 'wind_blow_x', 'wind_blow_y'),
    ('wind_blow_rural', 'wind_direction_rural', 'wind_blow_x_rural', 'wind_blow_y_rural'),
]
for magnitude_col, direction_col, x_col, y_col in blow_components:
    lapa = get_wind_components(
        lapa,
        wind_velocity=magnitude_col,
        wind_direction=direction_col,
        x_name=x_col,
        y_name=y_col,
    )

lapa = lapa.drop(columns=['wind_blow', 'wind_blow_rural', 'wind_direction', "wind_direction_rural"])

# resample_data is a project helper; judging by the displayed result it
# aggregates the readings to hourly values. The frame is then re-indexed by
# timestamp.
lapa = resample_data(lapa)
lapa = lapa.reset_index().set_index("timestamp")

lapa

  df = df.resample(rule='60min').mean()


Unnamed: 0_level_0,station,precipitation,temperature,relative_humidity,pressure,precipitation_rural,temperature_rural,relative_humidity_rural,pressure_rural,uhi,wind_velocity_x,wind_velocity_y,wind_velocity_x_rural,wind_velocity_y_rural,wind_blow_x,wind_blow_y,wind_blow_x_rural,wind_blow_y_rural,station_name
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2016-01-21 14:00:00,1000848,0.48,23.959200,60.838400,934.622000,0.25,22.885600,64.866800,926.264000,1.073600,-2.328878,1.207415,-2.423504,-2.166851,-5.156431,3.156087,-4.989077,-4.085259,Lapa
2016-01-21 15:00:00,1000848,0.60,24.260500,59.027167,934.495000,0.25,23.360333,65.310167,925.856667,0.900167,-2.345159,1.383109,-1.503434,-1.626554,-3.926744,3.178061,-3.718405,-4.473877,Lapa
2016-01-21 16:00:00,1000848,0.60,24.631000,56.776833,934.336667,0.25,22.873667,63.563500,925.821667,1.757333,-3.118908,1.691404,-3.378752,-1.953446,-5.939076,3.450454,-5.475402,-3.475989,Lapa
2016-01-21 17:00:00,1000848,0.60,24.642833,55.731833,934.245000,0.25,22.880500,64.743333,925.243333,1.762333,-1.820458,1.629404,-0.091948,-1.772614,-3.763557,4.095237,-0.562312,-4.957440,Lapa
2016-01-21 18:00:00,1000848,0.60,24.625833,55.659833,933.590000,0.25,22.210500,66.918667,925.206667,2.415333,-3.149486,1.046056,0.138602,-3.055047,-5.778626,1.970324,0.244836,-6.892774,Lapa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-02-18 18:00:00,1000848,0.00,27.068333,67.378167,930.448333,0.00,24.683000,77.648167,922.628333,2.385333,0.987694,-1.814244,-1.996099,0.305147,3.193784,-4.762114,-3.437821,0.428088,Lapa
2019-02-18 19:00:00,1000848,0.00,26.657000,69.979500,930.051667,0.00,23.723167,84.699167,921.975000,2.933833,0.917986,-0.961140,-2.063564,0.145425,2.594402,-4.523592,-3.898125,0.106745,Lapa
2019-02-18 20:00:00,1000848,0.00,26.392000,71.525833,930.355000,0.00,22.447333,89.048167,922.263333,3.944667,0.097844,-0.196218,-1.365202,-0.096637,1.376222,-3.055869,-3.354214,-0.101888,Lapa
2019-02-18 21:00:00,1000848,0.00,25.975667,72.498333,930.805000,0.00,21.134667,94.161000,923.043333,4.841000,0.079571,-0.134955,-0.983332,-0.587433,-0.145668,-0.977395,-2.348451,-1.609505,Lapa


In [10]:
# Identifier columns that are excluded from feature generation.
to_drop = ['station', 'station_name']

# Every remaining column is later passed to the lag/window transformers.
cols = lapa.drop(columns=to_drop).columns.tolist()
cols

['precipitation',
 'temperature',
 'relative_humidity',
 'pressure',
 'precipitation_rural',
 'temperature_rural',
 'relative_humidity_rural',
 'pressure_rural',
 'uhi',
 'wind_velocity_x',
 'wind_velocity_y',
 'wind_velocity_x_rural',
 'wind_velocity_y_rural',
 'wind_blow_x',
 'wind_blow_y',
 'wind_blow_x_rural',
 'wind_blow_y_rural']

In [11]:
# def get_outliers(df, features, factor):
#     df_outliers = pd.DataFrame()
#     for feature in features:
#         df_ = df.copy()
#         df_rolling_stats = df_[feature].rolling(window=24, center=True, min_periods=1).agg({"rolling_mean": "mean", "rolling_std": "std"})
#         df_[["rolling_mean", "rolling_std"]] = df_rolling_stats
#         df_outliers[f"{feature}_outlier"] = np.abs(df_[feature] - df_["rolling_mean"]) > factor * df_["rolling_std"]
#         df_outliers[f"{feature}_outlier"] = df_outliers[f"{feature}_outlier"].astype(int)
#     return df_outliers

# df_outliers = get_outliers(df=lapa, features=cols, factor=3)
# df_outliers

In [12]:
# outliers = lapa[["uhi"]].merge(df_outliers[["uhi_outlier"]], left_index=True, right_index=True)
# outliers = outliers[outliers.uhi_outlier == 1].drop("uhi_outlier", axis=1)
# outliers

In [13]:
# import matplotlib.pyplot as plt
# plt.plot(lapa.uhi)
# plt.plot(lapa.uhi.rolling(24).mean())
# plt.plot(outliers.uhi, ls="", marker=".", color="red")
# plt.show()

In [14]:
# cols = list(lapa.drop([
#     "station", 
#     "station_name",
#     'precipitation',
#     'wind_velocity',
#     'temperature',
#     'relative_humidity',
#     'pressure',
#     'precipitation_rural',
#     'wind_velocity_rural',
#     'temperature_rural',
#     'relative_humidity_rural',
#     'pressure_rural',
# ], axis=1).columns)

# cols

In [15]:
from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures
from sklearn.pipeline import Pipeline

In [16]:
# # binary features
# dtf = DatetimeFeatures(
#     variables="index",
#     features_to_extract=[
#         "month",
#         "hour",
#     ],
# )

# lagf = LagFeatures(
#     variables="uhi_outlier", # the input variables
#     freq=[f"{i}H" for i in range(1,3)], # move 1 hr to n hrs forward
#     missing_values="ignore"
# )

# winf = WindowFeatures(
#     variables=["uhi_outlier"], # the input variables
#     window=["3H", "6H", "12H"], 
#     freq="1H", # move 1 hr forward
#     functions=["sum"],
#     missing_values="ignore"
# )

# pipe_binary_features = Pipeline(
#     [
#         ("dtf", dtf),
#         ("lagf", lagf),
#         ("winf", winf)
#     ]
# )

# df_binary_features = pipe_binary_features.fit_transform(df_outliers)

In [17]:
# Continuous features: calendar encodings plus lagged and rolling statistics.

# Offsets "1H" ... "23H", used both as lag distances and as window widths.
hour_offsets = [f"{i}H" for i in range(1, 24)]

# Extract month and hour from the DatetimeIndex.
dtf = DatetimeFeatures(variables="index", features_to_extract=["month", "hour"])

# Cyclical encoding of month and hour; the raw columns are kept here and
# removed once the pipeline has run.
cyclicf = CyclicalFeatures(variables=["month", "hour"], drop_original=False)

# One lagged copy of every input variable per offset.
lagf = LagFeatures(
    variables=cols,
    freq=list(hour_offsets),
    missing_values="ignore",
)

# Rolling mean/std/min/max of every input variable for each window width,
# moved one hour forward.
winf = WindowFeatures(
    variables=cols,
    window=list(hour_offsets),
    freq="1H",
    functions=["mean", "std", "min", "max"],
    missing_values="ignore",
)

pipe_continuous_features = Pipeline(
    steps=[
        ("dtf", dtf),
        ("cyclicf", cyclicf),
        ("lagf", lagf),
        ("winf", winf),
    ]
)

df_continuous_features = pipe_continuous_features.fit_transform(lapa).drop(
    columns=["month", "hour"]
)

In [18]:
# import datetime

# def get_season(x):
#     try:
#         if x >= datetime.datetime(x.year, 12, 21) and x < datetime.datetime(x.year, 1, 1):
#             return "summer"
#         elif x >= datetime.datetime(x.year, 1, 1) and x < datetime.datetime(x.year, 3, 21):
#             return "summer"
#         elif x >= datetime.datetime(x.year, 3, 21) and x < datetime.datetime(x.year, 6, 21):
#             return "autumn"
#         elif x >= datetime.datetime(x.year, 6, 21) and x < datetime.datetime(x.year, 9, 23):
#             return "winter"
#         elif x >= datetime.datetime(x.year, 9, 23) and x < datetime.datetime(x.year, 12, 21):
#             return "spring"
#     except:
#         pass

# lapa["season"] = lapa.reset_index("timestamp")["timestamp"].apply(get_season)
# lapa["season"]

In [19]:
# import datetime

# # def get_season2(x):
# #     try:
# #         if x == 12:
# #             return "summer"
# #         elif x >= 1 and x <= 3:
# #             return "summer"
# #         elif x > 3 and x <= 6:
# #             return "autumn"
# #         elif x > 6 and x <= 9:
# #             return "winter"
# #         elif x > 9 and x < 12:
# #             return "spring"
# #     except:
# #         pass

# def get_season2(x):
#     try:
#         if x == 9 or x == 10 or x == 11 or x == 12 or x == 1 or x == 2:
#             return "spring_summer"
#         else:
#             return "autumn_winter"
#     except:
#         pass

# df_binary_features["season"] = df_binary_features["month"].apply(get_season2)
# df_binary_features["season"].unique()

In [20]:
# dummies = pd.get_dummies(df_binary_features[["season"]])
# df_binary_features = df_binary_features.reset_index().merge(dummies, on="timestamp").set_index("timestamp")

In [23]:
# Persist the engineered features; index=True keeps the frame's index in the file.
output_csv = "../data/processed/df_continuous_features.csv"
df_continuous_features.to_csv(output_csv, index=True)
# df_binary_features.to_csv("../data/processed/df_binary_features.csv", index=True)

In [24]:
# Display the engineered feature frame for a visual check.
df_continuous_features

Unnamed: 0_level_0,station,precipitation,temperature,relative_humidity,pressure,precipitation_rural,temperature_rural,relative_humidity_rural,pressure_rural,uhi,...,wind_blow_y_window_23H_min,wind_blow_y_window_23H_max,wind_blow_x_rural_window_23H_mean,wind_blow_x_rural_window_23H_std,wind_blow_x_rural_window_23H_min,wind_blow_x_rural_window_23H_max,wind_blow_y_rural_window_23H_mean,wind_blow_y_rural_window_23H_std,wind_blow_y_rural_window_23H_min,wind_blow_y_rural_window_23H_max
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-21 14:00:00,1000848,0.48,23.959200,60.838400,934.622000,0.25,22.885600,64.866800,926.264000,1.073600,...,,,,,,,,,,
2016-01-21 15:00:00,1000848,0.60,24.260500,59.027167,934.495000,0.25,23.360333,65.310167,925.856667,0.900167,...,3.156087,3.156087,-4.989077,,-4.989077,-4.989077,-4.085259,,-4.085259,-4.085259
2016-01-21 16:00:00,1000848,0.60,24.631000,56.776833,934.336667,0.25,22.873667,63.563500,925.821667,1.757333,...,3.156087,3.178061,-4.353741,0.898501,-4.989077,-3.718405,-4.279568,0.274794,-4.473877,-4.085259
2016-01-21 17:00:00,1000848,0.60,24.642833,55.731833,934.245000,0.25,22.880500,64.743333,925.243333,1.762333,...,3.156087,3.450454,-4.727628,0.907208,-5.475402,-3.718405,-4.011708,0.502993,-4.473877,-3.475989
2016-01-21 18:00:00,1000848,0.60,24.625833,55.659833,933.590000,0.25,22.210500,66.918667,925.206667,2.415333,...,3.156087,4.095237,-3.686299,2.210464,-5.475402,-0.562312,-4.248141,0.626315,-4.957440,-3.475989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-02-18 18:00:00,1000848,0.00,27.068333,67.378167,930.448333,0.00,24.683000,77.648167,922.628333,2.385333,...,-5.248752,0.000000,0.565764,2.017731,-2.717396,3.856368,0.193443,1.099372,-1.957888,2.821975
2019-02-18 19:00:00,1000848,0.00,26.657000,69.979500,930.051667,0.00,23.723167,84.699167,921.975000,2.933833,...,-5.248752,0.000000,0.434920,2.180896,-3.437821,3.856368,0.301681,0.989719,-1.957888,2.821975
2019-02-18 20:00:00,1000848,0.00,26.392000,71.525833,930.355000,0.00,22.447333,89.048167,922.263333,3.944667,...,-5.248752,0.000000,0.369772,2.290306,-3.898125,3.856368,0.395528,0.853815,-1.300598,2.821975
2019-02-18 21:00:00,1000848,0.00,25.975667,72.498333,930.805000,0.00,21.134667,94.161000,923.043333,4.841000,...,-5.248752,0.000000,0.322314,2.358290,-3.898125,3.856368,0.374345,0.860386,-1.300598,2.821975
