# Preprocessing CGE data

In [16]:
import boto3
import io
import json
import pandas as pd
import sys
from os.path import expanduser

# Make the project's preprocessing helpers importable from the user's home checkout.
home = expanduser("~")
preprocessing_dir = home + "/ml-uhi/src/preprocessing"
sys.path.append(preprocessing_dir)
from preprocessing import change_features_names, get_station_name, feature_engineer_pipeline

In [17]:
# AWS credentials to read files on S3 bucket.
# The context manager closes the credentials file as soon as it has been parsed,
# instead of leaving the handle open for the lifetime of the kernel.
with open('../credentials.json') as f:
    credentials = json.load(f)

# Keyword arguments shared by both boto3 handles, so the key pair is read once.
aws_auth = {
    "aws_access_key_id": credentials["Access key ID"],
    "aws_secret_access_key": credentials["Secret access key"],
}

# Low-level client: used to download object bodies.
s3_client = boto3.client('s3', **aws_auth)

# Resource handle: used to list the objects stored under a prefix.
s3_resource = boto3.resource('s3', **aws_auth)

In [18]:
# Read every object stored under raw/<year>/ in the "cge" bucket (2009-2019 inclusive)
# and collect one DataFrame per object in df_full.
df_full = []
cge_bucket = s3_resource.Bucket("cge")
for year in range(2009, 2019 + 1):
    prefix = f"raw/{year}/"
    # List the year's keys up front, then download them one by one.
    keys = [entry.key for entry in cge_bucket.objects.filter(Prefix=prefix)]
    for key in keys:
        response = s3_client.get_object(Bucket="cge", Key=key)
        df_full.append(pd.read_csv(io.BytesIO(response["Body"].read())))

In [19]:
# Stack the per-file frames, drop exact duplicate rows, then attach station names.
cge = get_station_name(
    pd.concat(df_full, ignore_index=True).drop_duplicates(ignore_index=True)
)
# NOTE(review): this matches the literal string 'NaN', not missing values -
# confirm that is what get_station_name produces for unknown stations.
cge[['Posto Nome']] = cge[['Posto Nome']].replace('NaN', '1000300')

# Raw columns carried forward into the feature pipeline.
selected_columns = [
    'Posto',
    'Posto Nome',
    'DATA',
    'PLU(mm)',
    'Vel.VT(m/s)',
    'Dir.VT(o)',
    'Temp(oC)',
    'Umid.Rel.(%)',
    'Pressão(mb)',
    'Rajada.VT(m/s)',
    'Sens. Térmica(°C)',
]
# Stations kept for the analysis.
selected_stations = ['Sé', 'Lapa', 'Pirituba', 'Penha', 'Jabaquara', 'Parelheiros']

cge = cge[selected_columns]
cge = cge[cge['Posto Nome'].isin(selected_stations)]

# Rename the columns, discard rows without a temperature reading, then run the
# project's feature-engineering pipeline.
cge = change_features_names(cge)
cge = cge.dropna(subset=['temperature'])
cge = feature_engineer_pipeline(cge)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x_name] = wv * np.cos(wd_rad)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[y_name] = wv * np.sin(wd_rad)
  df = df.resample(rule='60min').mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x_name] = wv * np.cos(wd_rad)
A value is trying to be set on a copy of a slice from a DataFrame.

In [20]:
# Sanity check: list the stations that survived preprocessing.
cge["station_name"].unique()

array(['Lapa', 'Sé', 'Parelheiros', 'Pirituba', 'Penha', 'Jabaquara'],
      dtype=object)

In [21]:
# Print the most recent timestamp available for each station.
for station in cge["station_name"].unique():
    latest = cge.loc[cge["station_name"] == station, "timestamp"].max()
    print(station, latest)

Lapa 2019-02-18 22:00:00
Sé 2019-02-18 22:00:00
Parelheiros 2019-02-18 22:00:00
Pirituba 2019-02-18 22:00:00
Penha 2019-02-18 22:00:00
Jabaquara 2019-02-18 22:00:00


In [22]:
# Show the processed frame; the rendered output elides the middle rows.
cge

Unnamed: 0,station,station_name,timestamp,temperature,precipitation,relative_humidity,pressure,wind_velocity_x,wind_velocity_y,wind_blow_x,wind_blow_y,day_sin,day_cos,year_sin,year_cos
0,1000848,Lapa,2012-09-27 15:00:00,13.997500,0.0,62.604000,948.365000,,,,,-0.707107,-7.071068e-01,-0.998196,-0.060034
1,1000848,Lapa,2012-09-27 16:00:00,14.230833,0.0,63.378167,946.660000,,,,,-0.866025,-5.000000e-01,-0.998239,-0.059318
2,1000848,Lapa,2012-09-27 17:00:00,14.303000,0.0,64.452333,946.225000,,,,,-0.965926,-2.588190e-01,-0.998281,-0.058603
3,1000848,Lapa,2012-09-27 18:00:00,14.212167,0.0,65.090333,946.263333,,,,,-1.000000,-6.383261e-12,-0.998323,-0.057887
4,1000848,Lapa,2012-09-27 19:00:00,13.584333,0.0,68.237833,947.458333,,,,,-0.965926,2.588190e-01,-0.998364,-0.057172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32665,634,Jabaquara,2019-02-18 18:00:00,27.590500,0.0,55.360833,924.640000,,,,,-1.000000,-1.011709e-11,0.745081,0.666974
32666,634,Jabaquara,2019-02-18 19:00:00,26.820000,0.0,64.296000,924.540000,,,,,-0.965926,2.588190e-01,0.745559,0.666439
32667,634,Jabaquara,2019-02-18 20:00:00,24.154500,0.0,78.230167,925.338333,,,,,-0.866025,5.000000e-01,0.746037,0.665905
32668,634,Jabaquara,2019-02-18 21:00:00,22.717500,0.0,80.274667,925.993333,,,,,-0.707107,7.071068e-01,0.746514,0.665370
