In [90]:
from datetime import datetime, timedelta
import requests
import pandas as pd
import json
import numpy as np


# 1. get stations near points in routine

# 2. retrieve data from API and format data to fit into model

## Current date and time
dt_now = datetime.now()
dt_now_str = dt_now.strftime("%Y-%m-%d %H:%M:%S")
current_date_str = dt_now_str[:11]
current_time_str = dt_now_str[11:]

In [108]:
## Get most recent 11 timestamps in a list of (date, time) tuples
def get_time_tuples(date_time_tuple, num):
	date_str, time_str = date_time_tuple
	datetime_obj = datetime.strptime(date_str + ' ' + time_str, '%Y-%m-%d %H:%M:%S')
	result = []
	if num == 6:
		for i in range(7,13):
			new_datetime_obj = datetime_obj - timedelta(minutes=i*5)
			new_date_str = new_datetime_obj.strftime('%Y-%m-%d')
			new_time_str = new_datetime_obj.strftime('%H:%M:%S')
			result.append((new_date_str, new_time_str))
	else:
		for i in range(2,13):
			new_datetime_obj = datetime_obj - timedelta(minutes=i*5)
			new_date_str = new_datetime_obj.strftime('%Y-%m-%d')
			new_time_str = new_datetime_obj.strftime('%H:%M:%S')
			result.append((new_date_str, new_time_str))
	return result

timestamps_extracted = get_time_tuples((current_date_str, current_time_str), 11)  

In [109]:
timestamps_extracted

[('2023-04-08', '17:04:28'),
 ('2023-04-08', '16:59:28'),
 ('2023-04-08', '16:54:28'),
 ('2023-04-08', '16:49:28'),
 ('2023-04-08', '16:44:28'),
 ('2023-04-08', '16:39:28'),
 ('2023-04-08', '16:34:28'),
 ('2023-04-08', '16:29:28'),
 ('2023-04-08', '16:24:28'),
 ('2023-04-08', '16:19:28'),
 ('2023-04-08', '16:14:28')]

In [93]:
## retrieve data of most recent 11 rainfall values for all stations and save into dataframe
url = "https://api.data.gov.sg/v1/environment/rainfall"

data_df = []
def extract_data():
    for row in timestamps_extracted:
	    row_dt = row[0] + "T" + row[1]
	    params = {"date_time": row_dt} # YYYY-MM-DD
	    data_dict = requests.get(url, params=params).json()
	    readings_lst = data_dict["items"]
	    readings_df = pd.DataFrame.from_dict(readings_lst)
	    data_df.append(readings_df)
        
extract_data()
data = pd.concat(data_df, ignore_index=True)

In [94]:
data

Unnamed: 0,timestamp,readings
0,2023-04-08T17:00:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_..."
1,2023-04-08T16:55:00+08:00,"[{'station_id': 'S77', 'value': 0.2}, {'statio..."
2,2023-04-08T16:50:00+08:00,"[{'station_id': 'S77', 'value': 0.2}, {'statio..."
3,2023-04-08T16:45:00+08:00,"[{'station_id': 'S77', 'value': 0.2}, {'statio..."
4,2023-04-08T16:40:00+08:00,"[{'station_id': 'S77', 'value': 0.6}, {'statio..."
5,2023-04-08T16:35:00+08:00,"[{'station_id': 'S77', 'value': 0.2}, {'statio..."
6,2023-04-08T16:30:00+08:00,"[{'station_id': 'S77', 'value': 0.2}, {'statio..."
7,2023-04-08T16:25:00+08:00,"[{'station_id': 'S77', 'value': 0}, {'station_..."
8,2023-04-08T16:20:00+08:00,"[{'station_id': 'S77', 'value': 0.2}, {'statio..."
9,2023-04-08T16:15:00+08:00,"[{'station_id': 'S77', 'value': 0.4}, {'statio..."


In [95]:
## expand dataframe such that each row is date+time and columns are each stations
def spread_column(lst):
   	new_dict = dict()
   	for ddict in lst:
   		new_dict[ddict["station_id"]] = ddict["value"]
   	return new_dict
data["loc_val"] = data["readings"].map(lambda entry: spread_column(entry))
data = data.join(pd.json_normalize(data["loc_val"]))
data = data.drop(columns = ["readings", "loc_val"])

In [96]:
## reshape dataframe such that each row is date+time and one station only
data = data.melt(id_vars = ["timestamp"], var_name = "station")

In [97]:
## Drop timestamp column and convert to date & time columns
data[['date_time', 'timezone']] = data['timestamp'].str.split('+', expand=True)
data[['date', 'time']] = data['date_time'].str.split('T', expand=True)
data.drop('date_time', axis=1, inplace=True)
data.drop('timezone', axis=1, inplace=True)
data.drop('timestamp', axis=1, inplace=True)

## Shift date & time columns to the first 2 columns
data.insert(0, 'time', data.pop('time'))
data.insert(0, 'date', data.pop('date'))

In [98]:
data

Unnamed: 0,date,time,station,value
0,2023-04-08,17:00:00,S77,0.0
1,2023-04-08,16:55:00,S77,0.2
2,2023-04-08,16:50:00,S77,0.2
3,2023-04-08,16:45:00,S77,0.2
4,2023-04-08,16:40:00,S77,0.6
...,...,...,...,...
710,2023-04-08,16:30:00,S104,0.0
711,2023-04-08,16:25:00,S104,0.0
712,2023-04-08,16:20:00,S104,0.2
713,2023-04-08,16:15:00,S104,0.0


In [99]:
data_pivot = data.pivot(index=["date","time"], columns="station", values="value")

In [100]:
data_pivot

Unnamed: 0_level_0,station,S08,S104,S106,S107,S108,S109,S111,S112,S113,S114,...,S77,S78,S79,S81,S84,S88,S89,S90,S900,S94
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2023-04-08,16:10:00,0.4,0.0,0.0,0.0,0.0,0.0,0.2,0.4,0.0,0.4,...,0.6,0.0,0.0,0.0,0.0,0.0,1.6,0.4,0.0,0.0
2023-04-08,16:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.2,...,0.4,0.0,0.2,0.0,0.0,0.0,0.8,0.0,0.0,0.0
2023-04-08,16:20:00,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.6,0.2,0.0,0.0
2023-04-08,16:25:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.6,0.0,0.0
2023-04-08,16:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,...,0.2,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.0,0.0
2023-04-08,16:35:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.2,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.0,0.0
2023-04-08,16:40:00,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.4,...,0.6,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0
2023-04-08,16:45:00,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,...,0.2,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0
2023-04-08,16:50:00,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
2023-04-08,16:55:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
%cd "C:\Users\Angel\Documents\NUS\Y3S2\DSA3101 Data Science in Practice\DSA3101\project_backend\flask-app"

C:\Users\Angel\Documents\NUS\Y3S2\DSA3101 Data Science in Practice\DSA3101\project_backend\flask-app


In [102]:
## Load dictionary of 5 nearest stations of each station to target station
with open('5-nearest-stations.txt') as f:
    station_dict = f.read()
k_nearest_dic = json.loads(station_dict)

In [110]:
def create_sliding_window(req_datetime, rain_data_pivot):
    lst_of_rows = []
    for i in range(0,len(req_datetime)):
        orig_date, orig_time = req_datetime
        newRow = []
        # rowIndex = req_datetime.index[i]
        time_tuples = get_time_tuples((orig_date, orig_time), 6)
        for time_indx,(date,time_stamp) in enumerate(time_tuples,start=1):
            for (stn,dist) in nearest_k_stns:
                newRow.append(-5*time_indx-30)
                newRow.append(dist)
                newRow.append(rain_data_pivot.get((date,time_stamp),{}).get(stn,np.nan))
            newRow.append(-5*time_indx-30)
            newRow.append(0)
            newRow.append(rain_data_pivot.get((date,time_stamp),{}).get(orig_stn,np.nan))
        lst_of_rows.append(newRow)
    
    return lst_of_rows

In [111]:
## To predict most recent 2 readings first
sec_rec_date = data['date'].max()
sec_rec_time = data['time'].max()
sec_rec_datetime_obj = datetime.strptime(sec_rec_date + ' ' + sec_rec_time, '%Y-%m-%d %H:%M:%S')
sec_rec_datetime_obj = sec_rec_datetime_obj + timedelta(minutes=5)
sec_rec_date_str = sec_rec_datetime_obj.strftime('%Y-%m-%d')
sec_rec_time_str = sec_rec_datetime_obj.strftime('%H:%M:%S')

most_rec_datetime_obj = sec_rec_datetime_obj + timedelta(minutes=5)
most_rec_date_str = most_rec_datetime_obj.strftime('%Y-%m-%d')
most_rec_time_str = most_rec_datetime_obj.strftime('%H:%M:%S')

last_2_readings_req = ((sec_rec_date_str, sec_rec_time_str), (most_rec_date_str, most_rec_time_str))


In [105]:
last_2_readings_req

(('2023-04-08', '17:05:00'), ('2023-04-08', '17:10:00'))

In [112]:
create_sliding_window(last_2_readings_req, data_pivot)

TypeError: can only concatenate tuple (not "str") to tuple