# Code to extract data for one day's worth of rainfall

### 0. Imports

In [36]:
import pandas as pd
import requests

### 1. Accessing data from data.gov.sg website

In [37]:
# Rainfall endpoint of the data.gov.sg environment API; "date" selects the day to fetch.
url = "https://api.data.gov.sg/v1/environment/rainfall"
params = {"date": "2023-03-01"}  # YYYY-MM-DD

# Without an explicit timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, params=params, timeout=30)
# Fail here on an HTTP error rather than with a confusing KeyError in a later cell.
response.raise_for_status()
data_dict = response.json()

**Open question:** how can we extract data for more than one day?

| idea | concerns |
|--|--|
| generate list of all dates, then extract rainfall values from all dates | time-consuming, may be very computationally expensive, may need a lot of storage |
| access http://www.weather.gov.sg/climate-historical-daily/ and find list of dates with significant rainfall, then extract rainfall values | how to determine a suitable threshold for what is "significant"? (may need to analyse the data from weather.gov.sg's website) |

### 2. Extracting dataframe of stations

In [38]:
# Station metadata is delivered with the response under metadata -> stations.
stations_lst = data_dict["metadata"]["stations"]
stations_base = pd.DataFrame(stations_lst)

# Expand the nested "location" dicts into their own columns, keeping the
# original "location" column alongside them.
location_cols = pd.json_normalize(stations_base["location"])
stations_df = stations_base.join(location_cols)

stations_df

Unnamed: 0,id,device_id,name,location,latitude,longitude
0,S77,S77,Alexandra Road,"{'latitude': 1.2937, 'longitude': 103.8125}",1.29370,103.81250
1,S109,S109,Ang Mo Kio Avenue 5,"{'latitude': 1.3764, 'longitude': 103.8492}",1.37640,103.84920
2,S117,S117,Banyan Road,"{'latitude': 1.256, 'longitude': 103.679}",1.25600,103.67900
3,S64,S64,Bukit Panjang Road,"{'latitude': 1.3824, 'longitude': 103.7603}",1.38240,103.76030
4,S90,S90,Bukit Timah Road,"{'latitude': 1.3191, 'longitude': 103.8191}",1.31910,103.81910
...,...,...,...,...,...,...
64,S08,S08,Upper Thomson Road,"{'latitude': 1.3701, 'longitude': 103.8271}",1.37010,103.82710
65,S116,S116,West Coast Highway,"{'latitude': 1.281, 'longitude': 103.754}",1.28100,103.75400
66,S104,S104,Woodlands Avenue 9,"{'latitude': 1.44387, 'longitude': 103.78538}",1.44387,103.78538
67,S100,S100,Woodlands Road,"{'latitude': 1.4172, 'longitude': 103.74855}",1.41720,103.74855


Note: `stations_df` only needs to be retrieved once, assuming that every day's data includes all stations.

### 3. Extracting dataframe of readings

In [39]:
# Each item pairs a timestamp with the list of per-station readings taken at that time.
readings_lst = data_dict["items"]
readings_df = pd.DataFrame(readings_lst)

readings_df

Unnamed: 0,timestamp,readings
0,2023-03-01T00:05:00+08:00,"[{'station_id': 'S77', 'value': 2}, {'station_..."
1,2023-03-01T00:10:00+08:00,"[{'station_id': 'S77', 'value': 1}, {'station_..."
2,2023-03-01T00:15:00+08:00,"[{'station_id': 'S77', 'value': 0.8}, {'statio..."
3,2023-03-01T00:20:00+08:00,"[{'station_id': 'S77', 'value': 1.2}, {'statio..."
4,2023-03-01T00:25:00+08:00,"[{'station_id': 'S77', 'value': 0.8}, {'statio..."
...,...,...
282,2023-03-01T23:35:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_..."
283,2023-03-01T23:40:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_..."
284,2023-03-01T23:45:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_..."
285,2023-03-01T23:50:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_..."


#### 3.1 Converting the 'readings' column into a more useful format

In [40]:
# Helper for reshaping the "readings" column: each cell holds a list of
# {"station_id": ..., "value": ...} dicts, which is flattened into a single dict.
def spread_column(lst):
    """Collapse a list of station readings into a ``{station_id: value}`` dict.

    Args:
        lst: Iterable of dicts, each containing the keys "station_id" and "value".

    Returns:
        A dict mapping each station_id to its value. If a station_id occurs
        more than once, the last value wins.
    """
    return {reading["station_id"]: reading["value"] for reading in lst}

# One {station_id: value} dict per timestamp row.
loc_val = readings_df["readings"].map(spread_column)

# One column per station_id; stations missing from a row's readings become NaN.
station_values_df = pd.json_normalize(loc_val)

# Re-running this cell used to raise in DataFrame.join because the station columns
# already existed on readings_df. Drop whatever an earlier run added so the cell
# yields the same frame no matter how many times it is executed.
stale_cols = [
    col for col in readings_df.columns
    if col == "loc_val" or col in station_values_df.columns
]
readings_df = (
    readings_df
    .drop(columns=stale_cols)
    .assign(loc_val=loc_val)
    .join(station_values_df)
)

readings_df


Unnamed: 0,timestamp,readings,loc_val,S77,S109,S117,S64,S90,S61,S114,...,S123,S89,S115,S24,S69,S08,S116,S104,S100,S36
0,2023-03-01T00:05:00+08:00,"[{'station_id': 'S77', 'value': 2}, {'station_...","{'S77': 2, 'S109': 1.2, 'S117': 0.6, 'S64': 0,...",2.0,1.2,0.6,0.0,4.2,3.2,0.0,...,1.0,0.2,0.2,0.6,0.0,0.0,2.4,0.0,0.0,
1,2023-03-01T00:10:00+08:00,"[{'station_id': 'S77', 'value': 1}, {'station_...","{'S77': 1, 'S109': 3, 'S117': 1.4, 'S64': 0, '...",1.0,3.0,1.4,0.0,2.4,3.6,0.4,...,1.2,0.8,0.2,3.0,0.2,0.6,1.6,0.2,0.0,
2,2023-03-01T00:15:00+08:00,"[{'station_id': 'S77', 'value': 0.8}, {'statio...","{'S77': 0.8, 'S109': 2.8, 'S117': 1.6, 'S64': ...",0.8,2.8,1.6,0.2,1.6,2.2,0.0,...,1.8,1.0,0.2,2.4,1.0,3.2,2.6,1.0,0.0,
3,2023-03-01T00:20:00+08:00,"[{'station_id': 'S77', 'value': 1.2}, {'statio...","{'S77': 1.2, 'S109': 1.6, 'S117': 0.8, 'S64': ...",1.2,1.6,0.8,1.2,1.8,0.4,0.2,...,3.4,0.6,0.4,3.0,2.6,2.4,2.4,3.2,2.2,
4,2023-03-01T00:25:00+08:00,"[{'station_id': 'S77', 'value': 0.8}, {'statio...","{'S77': 0.8, 'S109': 1.4, 'S117': 0.2, 'S64': ...",0.8,1.4,0.2,2.2,3.6,1.6,0.4,...,2.6,0.2,1.0,5.0,2.2,2.2,1.0,1.2,1.8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,2023-03-01T23:35:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_...","{'S77': 0, 'S109': 0, 'S117': 0, 'S64': 0, 'S9...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
283,2023-03-01T23:40:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_...","{'S77': 0, 'S109': 0, 'S117': 0, 'S64': 0, 'S9...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
284,2023-03-01T23:45:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_...","{'S77': 0, 'S109': 0, 'S117': 0, 'S64': 0, 'S9...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,
285,2023-03-01T23:50:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_...","{'S77': 0, 'S109': 0, 'S117': 0, 'S64': 0, 'S9...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


#### 3.2 Checking whether columns containing rainfall values contain null values

In [41]:
readings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 72 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   timestamp  287 non-null    object 
 1   readings   287 non-null    object 
 2   loc_val    287 non-null    object 
 3   S77        287 non-null    float64
 4   S109       254 non-null    float64
 5   S117       287 non-null    float64
 6   S64        287 non-null    float64
 7   S90        287 non-null    float64
 8   S61        287 non-null    float64
 9   S114       287 non-null    float64
 10  S50        287 non-null    float64
 11  S107       287 non-null    float64
 12  S215       287 non-null    float64
 13  S118       287 non-null    float64
 14  S120       287 non-null    float64
 15  S71        287 non-null    float64
 16  S43        287 non-null    float64
 17  S66        272 non-null    float64
 18  S112       287 non-null    float64
 19  S40        287 non-null    float64
 20  S108      

Station `S16` has many null values (reason not yet known — TODO: investigate).