In [1]:
import boto3
import io
import json
import pandas as pd

In [None]:
def get_timestamp(df):
    df["#HR_int"] = df["#HR"].astype(int).astype(str).str.zfill(2)
    df["date"] = pd.to_datetime(df["Year"] * 1000 + df["YD"], format="%Y%j").astype(str)
    df["datetime"] = df["date"] + " " + df["#HR_int"] + ":00:00"
    df["timestamp"] = pd.to_datetime(df["datetime"])
    df = df.drop(["#HR_int", "date", "datetime"], axis=1)
    return df

In [2]:
# AWS credentials to read files on S3 bucket
f = open('../credentials.json')
credentials = json.load(f)

s3_client = boto3.client(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

s3_resource = boto3.resource(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

In [3]:
# get IAG .dat data from S3 bucket
prefix_objs = s3_resource.Bucket("iag-usp").objects.filter(Prefix=f"raw/")
keys = [obj.key for obj in prefix_objs]
print(keys)

['raw/', 'raw/ITU_2017.dat', 'raw/PR40_IT_17.DAT', 'raw/SFZ_2017.dat', 'raw/SFZe_2017.dat']


In [41]:
obj = s3_client.get_object(Bucket="iag-usp", Key='raw/ITU_2017.dat')
ITU_2017 = pd.read_csv(io.BytesIO(obj["Body"].read()), sep="\s+")
ITU_2017

Unnamed: 0,#HR,YD,Year,T_CS215(C),RH_CS215(%),WS_034B(ms),WD_034B(degree),NetRL1(Wm2),Precip(mm)
0,0.000,1,2017,20.27,100.0,0.859,58.57,18.766,0.0
1,0.083,1,2017,20.33,100.0,1.034,70.04,20.074,0.0
2,0.167,1,2017,20.36,100.0,1.085,70.52,20.942,0.0
3,0.250,1,2017,20.39,100.0,1.084,69.76,21.190,0.0
4,0.333,1,2017,20.43,100.0,1.260,73.69,21.465,0.0
...,...,...,...,...,...,...,...,...,...
98781,23.250,365,2017,19.31,99.0,0.551,334.30,48.107,0.0
98782,23.333,365,2017,19.26,99.2,0.130,287.80,47.749,0.0
98783,23.417,365,2017,19.19,99.2,0.745,322.80,47.680,0.0
98784,23.500,365,2017,19.21,99.2,0.999,355.30,45.325,0.0


In [5]:
ITU_2017_metadata = {
    "#HR": "hour",
    "YD": "Day of the year",
    "Year": "Year",
    "T_CS215(C)": "Air temperature in Celsius",
    "RH_CS215(%)": "Percentual relative humidity",
    "WS_034B(ms)": "Wind speed in m/s",
    "WD_034B(degree)": "Wind direction in degrees",
    "NetRL1(Wm2)": "Net radiation in Watt/m²",
    "Precip(mm)": "Precipitation in mm"
}

In [47]:
ITU_2017 = get_timestamp(df=ITU_2017)
ITU_2017

Unnamed: 0,#HR,YD,Year,T_CS215(C),RH_CS215(%),WS_034B(ms),WD_034B(degree),NetRL1(Wm2),Precip(mm),timestamp
0,0.000,1,2017,20.27,100.0,0.859,58.57,18.766,0.0,2017-01-01 00:00:00
1,0.083,1,2017,20.33,100.0,1.034,70.04,20.074,0.0,2017-01-01 00:00:00
2,0.167,1,2017,20.36,100.0,1.085,70.52,20.942,0.0,2017-01-01 00:00:00
3,0.250,1,2017,20.39,100.0,1.084,69.76,21.190,0.0,2017-01-01 00:00:00
4,0.333,1,2017,20.43,100.0,1.260,73.69,21.465,0.0,2017-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...
98781,23.250,365,2017,19.31,99.0,0.551,334.30,48.107,0.0,2017-12-31 23:00:00
98782,23.333,365,2017,19.26,99.2,0.130,287.80,47.749,0.0,2017-12-31 23:00:00
98783,23.417,365,2017,19.19,99.2,0.745,322.80,47.680,0.0,2017-12-31 23:00:00
98784,23.500,365,2017,19.21,99.2,0.999,355.30,45.325,0.0,2017-12-31 23:00:00


In [6]:
obj = s3_client.get_object(Bucket="iag-usp", Key='raw/PR40_IT_17.DAT')
PR40_IT_17 = pd.read_csv(io.BytesIO(obj["Body"].read()), sep="\s+")
PR40_IT_17.head()

Unnamed: 0,0.441,930.586,10.583,44,2017,9
0,0.444,930.584,10.667,44,2017,9
1,0.448,930.568,10.75,44,2017,9
2,0.455,915.193,10.917,44,2017,9
3,0.458,930.608,11.0,44,2017,9
4,0.462,930.627,11.083,44,2017,9


In [7]:
obj = s3_client.get_object(Bucket="iag-usp", Key='raw/SFZ_2017.dat')
SFZ_2017 = pd.read_csv(io.BytesIO(obj["Body"].read()), sep="\s+")
SFZ_2017.head()

Unnamed: 0,#HR,YD,Year,T_CS215(C),RH_CS215(%),WS_034B(ms),WD_034B(degree),NetRL1(Wm2),Precip(mm),WS_CSAT(m/s),WD_CSAT(degree)
0,0.083,1,2017,23.07,81.0,3.036,12.98,39.439,0.0,3.104,28.009
1,0.167,1,2017,23.07,81.1,2.82,15.94,39.17,0.0,2.69,31.425
2,0.25,1,2017,23.08,81.1,2.382,9.84,39.085,0.0,2.543,23.273
3,0.333,1,2017,23.08,81.1,1.98,9.1,39.528,0.0,0.91,350.112
4,0.417,1,2017,23.06,81.1,1.478,11.21,40.063,0.0,2.021,21.471


In [8]:
SFZ_2017_metadata = {
    "#HR": "hour",
    "YD": "Day of the year",
    "Year": "Year",
    "T_CS215(C)": "Air temperature in Celsius",
    "RH_CS215(%)": "Percentual relative humidity",
    "WS_034B(ms)": "Wind speed in m/s",
    "WD_034B(degree)": "Wind direction in degrees",
    "NetRL1(Wm2)": "Net radiation in Watt/m²",
    "Precip(mm)": "Precipitation in mm",
    "WS_CSAT(m/s)": "Wind speed in m/s",
    "WD_CSAT(degree)": "Wind direction in degrees"
}

In [48]:
SFZ_2017 = get_timestamp(df=SFZ_2017)
SFZ_2017

Unnamed: 0,#HR,YD,Year,T_CS215(C),RH_CS215(%),WS_034B(ms),WD_034B(degree),NetRL1(Wm2),Precip(mm),WS_CSAT(m/s),WD_CSAT(degree),timestamp
0,0.083,1,2017,23.07,81.0,3.036,12.98,39.439,0.0,3.104,28.009,2017-01-01 00:00:00
1,0.167,1,2017,23.07,81.1,2.820,15.94,39.170,0.0,2.690,31.425,2017-01-01 00:00:00
2,0.250,1,2017,23.08,81.1,2.382,9.84,39.085,0.0,2.543,23.273,2017-01-01 00:00:00
3,0.333,1,2017,23.08,81.1,1.980,9.10,39.528,0.0,0.910,350.112,2017-01-01 00:00:00
4,0.417,1,2017,23.06,81.1,1.478,11.21,40.063,0.0,2.021,21.471,2017-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
97339,23.250,365,2017,20.53,92.7,3.859,274.00,10.194,0.0,2.988,295.899,2017-12-31 23:00:00
97340,23.333,365,2017,20.52,92.7,4.224,279.20,16.214,0.0,3.742,334.669,2017-12-31 23:00:00
97341,23.417,365,2017,20.51,92.7,3.650,283.00,16.006,0.0,1.341,346.755,2017-12-31 23:00:00
97342,23.500,365,2017,20.52,92.7,3.132,283.70,14.934,0.0,1.165,311.496,2017-12-31 23:00:00


In [9]:
obj = s3_client.get_object(Bucket="iag-usp", Key='raw/SFZe_2017.dat')
SFZe_2017 = pd.read_csv(io.BytesIO(obj["Body"].read()), sep="\s+")
SFZe_2017.head()

Unnamed: 0,#HR,YD,Year,SW_CMP3(Wm2),T_GMX(C),RH_GMX(%),P_GMX(mb),WD_GMX(degree),WS_GMX(ms),T_CS215_e(C),RH_CS215_e(%)
0,0.0,1,2017,0.0,25.232,67.0,931.053,260.267,0.915,24.599,68.72
1,0.083,1,2017,0.0,25.365,66.47,930.98,233.2,1.017,24.757,68.32
2,0.167,1,2017,0.0,25.623,65.73,930.9,227.467,0.832,25.102,67.09
3,0.25,1,2017,0.0,25.643,65.4,930.88,206.333,0.593,25.092,67.15
4,0.333,1,2017,0.0,25.787,65.07,930.793,152.267,0.867,25.142,67.01


In [10]:
SFZe_2017_metadata = {
    "#HR": "hour",
    "YD": "Day of the year",
    "Year": "Year",
    "SW_CMP3(Wm2)": "Net radiation in Watt/m²",
    "T_GMX(C)": "Air temperature in Celsius",
    "RH_GMX(%)": "Percentual relative humidity",
    "P_GMX(mb)": "Atmospheric pressure in millibars (or hPa)",
    "WD_GMX(degree)": "Wind direction in degrees",
    "WS_GMX(ms)": "Wind speed in m/s",
    "T_CS215_e(C)": "Air temperature in Celsius (same as T_GMX, but the data provider says this one is more reliable)",
    "RH_CS215_e(%)": "Percentual relative humidity (same as RH_GMX(%), but the data provider says this one is more reliable)"
}

In [49]:
SFZe_2017 = get_timestamp(df=SFZe_2017)
SFZe_2017

Unnamed: 0,#HR,YD,Year,SW_CMP3(Wm2),T_GMX(C),RH_GMX(%),P_GMX(mb),WD_GMX(degree),WS_GMX(ms),T_CS215_e(C),RH_CS215_e(%),timestamp
0,0.000,1,2017,0.0,25.232,67.00,931.053,260.267,0.915,24.599,68.72,2017-01-01 00:00:00
1,0.083,1,2017,0.0,25.365,66.47,930.980,233.200,1.017,24.757,68.32,2017-01-01 00:00:00
2,0.167,1,2017,0.0,25.623,65.73,930.900,227.467,0.832,25.102,67.09,2017-01-01 00:00:00
3,0.250,1,2017,0.0,25.643,65.40,930.880,206.333,0.593,25.092,67.15,2017-01-01 00:00:00
4,0.333,1,2017,0.0,25.787,65.07,930.793,152.267,0.867,25.142,67.01,2017-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
116272,23.583,365,2017,0.0,22.730,79.27,929.700,201.467,0.755,22.173,80.50,2017-12-31 23:00:00
116273,23.667,365,2017,0.0,22.844,79.00,929.680,144.867,0.685,22.292,79.92,2017-12-31 23:00:00
116274,23.750,365,2017,0.0,22.874,78.73,929.607,213.200,0.879,22.430,79.25,2017-12-31 23:00:00
116275,23.833,365,2017,0.0,23.173,77.53,929.600,192.800,0.993,22.716,78.30,2017-12-31 23:00:00
