In [1]:
import json
from os import stat
import numpy as np
import pandas as pd
from datetime import date, timedelta
import time
import urllib.request as ur
import os

The `parse_api` function does two things:

1. reads the data from the URL using the API
2. converts that data from JSON into a Python object

Step 1 is done with the `urllib.request` package: the requested date is substituted into the URL, the URL is opened, and its contents are read. Step 2 parses those contents with the JSON reader from the standard-library `json` package. The parsed result, called `data_obj`, is then returned.

In [2]:
def parse_api(url, year):
    """Fetch one API response and decode it from JSON.

    Parameters
    ----------
    url : str
        URL template containing a ``{0}`` placeholder for the query value.
    year : object
        Value substituted into the placeholder after conversion with ``str``.
        Despite the name, the caller in this notebook passes a full
        ``datetime.date`` (rendered as ``YYYY-MM-DD``), not just a year.

    Returns
    -------
    object
        The decoded JSON payload (whatever ``json.loads`` produces).

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    url = url.format(str(year))
    # Use the response as a context manager so the underlying connection is
    # closed even when reading or decoding fails; this function is called once
    # per day in a loop, so leaked handles would otherwise accumulate.
    with ur.urlopen(url) as fileobj:
        readobj = fileobj.read()
    return json.loads(readobj)

The `get_dates` function uses the `datetime` package to obtain all the dates between the start and end date. It takes the start and end date as inputs and returns every date from the start date to the end date (both inclusive) in one-day steps.

*TO-DO*
1. Add a way to set user-defined date ranges

In [3]:
def get_dates(sdate, edate):
    """Return every day from ``sdate`` to ``edate``, both ends included.

    Parameters
    ----------
    sdate, edate : datetime.date
        First and last day of the range.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array of consecutive days. It is empty when
        ``edate`` falls before ``sdate``.
    """
    one_day = timedelta(days=1)
    n_days = (edate - sdate).days + 1   # +1 so the end date itself is included
    span = [sdate + offset * one_day for offset in range(n_days)]
    return np.array(span, dtype=object)

The `get_stations` function takes the parsed `data_obj` and extracts all metadata describing the identity and location of the reporting stations. This metadata is converted into a dataframe, with each station's nested location split into separate `longitude` and `latitude` columns, and returned.

In [4]:
def get_stations(data_obj):
    """Build a station table from the metadata of one API response.

    Parameters
    ----------
    data_obj : dict
        Parsed API response; ``data_obj["metadata"]["stations"]`` is read.
        Each station record must carry a ``location`` mapping with
        ``longitude`` and ``latitude`` entries.

    Returns
    -------
    pandas.DataFrame
        One row per station. The nested ``location`` column is replaced by
        two float columns, ``longitude`` and ``latitude``, appended after the
        remaining station fields.

    Raises
    ------
    KeyError
        If the station list is empty or the records have no ``location``.
    """
    station_records = data_obj["metadata"]["stations"]
    frame = pd.DataFrame.from_dict(station_records)

    # Each entry of this column is a dict holding the two coordinates.
    coords = frame['location'].to_numpy()

    frame['longitude'] = np.array([float(c['longitude']) for c in coords], dtype=float)
    frame['latitude'] = np.array([float(c['latitude']) for c in coords], dtype=float)

    return frame.drop(columns=['location'])

The `get_data_day` function takes the station information and the `data_obj`, and returns a dataframe containing all rainfall data recorded for that day. Data is usually recorded at 5-minute intervals throughout the day by all stations.

The flow of this function is to

1. Obtain all the station ids
2. Obtain the collection of all rainfall readings for that day (`items`) from the parsed JSON
3. Extract the time at which each reading was taken
4. Extract the rainfall data from all stations throughout the day
5. Write them into a dataframe

The dataframe, indexed by reading time, is then returned.

In [5]:
def get_data_day(data_obj, stations):
    """Collect one day's readings into a time-by-station table.

    Parameters
    ----------
    data_obj : dict
        Parsed API response. ``data_obj["items"]`` is iterated; each item must
        have a ``timestamp`` string and a ``readings`` list whose entries
        carry ``station_id`` and ``value``.
    stations : pandas.DataFrame
        Station table; its ``id`` column defines the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per item and one float column per station id, indexed by
        ``Time``. Stations with no reading in an item keep the value 0.0.

    Raises
    ------
    KeyError
        If a reading refers to a station id that is not in ``stations['id']``.
    """
    keys = np.array(stations['id'])
    precipitation_raw = data_obj["items"]
    n = len(precipitation_raw)

    # Accumulate into a plain NumPy array and build the frame once at the end.
    # The previous ``df[col].iloc[i] += value`` was chained assignment, which
    # emits warnings and, with pandas Copy-on-Write, updates a temporary
    # instead of ``df`` so the result stays all zeros.
    column_of = {station_id: j for j, station_id in enumerate(keys)}
    values = np.zeros((n, keys.size))
    t = np.zeros(n, dtype=object)

    for i, d_obj in enumerate(precipitation_raw):
        # Keep only the clock time: drop the date before 'T' and anything
        # from the first '+' onwards.
        t[i] = d_obj['timestamp'].split('+')[0].split('T')[-1]
        for reading in d_obj['readings']:
            values[i, column_of[reading['station_id']]] += reading['value']

    df = pd.DataFrame(values, columns=keys)
    df['Time'] = t
    df = df.set_index('Time')
    return df

In [6]:
# def concat_stations(station_ids):
#     n = station_ids.size
#     if n < 1:
#         return None
#     elif n == 1:
#         print(True)
#         df = station_ids[0]
#     else:
#         for i in range(0, n-1):
#             df = pd.concat([station_ids[i], station_ids[i+1]], ignore_index=True)

#     df.drop_duplicates(subset = 'id', ignore_index = True, inplace = True)
#     return df

# def collate_data(all_stations, sum_preci, dates):
#     row = dates.size
#     column_id = np.array(all_stations['id'])

#     df = pd.DataFrame(np.zeros((row, column_id.size)), columns = column_id)

#     for i, series in enumerate(sum_preci):
#         keys = np.array(series.index)
#         df[keys].iloc[i] = sum_preci[i]

#     df['Date'] = dates
#     df = df.set_index('Date')
#     return df 

In [9]:
# --- Configuration ----------------------------------------------------------
# `{0}` in the URL is filled with the requested day by parse_api.
url = "https://api.data.gov.sg/v1/environment/rainfall?date={0}"
save_folder = 'raw_data'

# Create the output folder next to the notebook; re-running the cell with the
# folder already present is fine and is only reported.
try:
    os.mkdir('./{0}'.format(save_folder))
except FileExistsError:
    print('Folder exists')

sdate = date(2022, 8, 12)   # first day to download
edate = date.today()        # last day to download (inclusive)

dates = get_dates(sdate, edate)

# --- Download loop ----------------------------------------------------------
# One request per day; each day with station metadata yields two CSV files
# (station table and precipitation table) named after that day.
for i, d in enumerate(dates):
    data_obj = parse_api(url, d)
    print(d)
    if len(data_obj['metadata']['stations']) > 0:
        stations = get_stations(data_obj)
        precipitation = get_data_day(data_obj, stations)
        stations.to_csv('./{0}/{1}_stations.csv'.format(save_folder, d))
        precipitation.to_csv('./{0}/{1}_precipitation.csv'.format(save_folder, d))

        # Disabled aggregation step, kept for reference:
        # station_ids[i] = stations
        # sum_precipation_datas[i] = precipitation.sum()
    else:
        print('No valid stations')
        dates[i] = 0   # mark the day as skipped; the disabled code below filters on this

# Disabled aggregation across days, kept for reference:
# dates = dates[np.where(dates != 0)]
# station_ids = station_ids[np.where(station_ids != 0)]
# sum_precipation_datas = sum_precipation_datas[np.where(sum_precipation_datas != 0)]

# all_stations = concat_stations(station_ids)
# df = collate_data(all_stations, sum_precipation_datas, dates)

    #df.to_csv('Daily_rainfall.csv')
    #all_stations.to_csv('station_stats.csv')

Folder exists
2022-08-12
2022-08-13
2022-08-14
2022-08-15
