#### Step 1. Checking the anomalous temporal data

In [1]:
import numpy as np
import pandas as pd
import os

# Reservoir code -> first calendar year of its record. Each raw series is
# expected to hold one row per day from 1 January of that year to `end_date`.
reservoir_info = {
    "BSR": 1990,
    "CAU": 1999,
    "CRY": 1982,
    "DCR": 1987,
    "DIL": 1985,
    "ECH": 1982,
    "ECR": 1992,
    "FGR": 1982,
    "FON": 1990,
    "GMR": 1982,
    "HYR": 1999,
    "JOR": 1997,
    "JVR": 1996,
    "LCR": 1998,
    "LEM": 1982,
    "MCP": 1991,
    "MCR": 1998,
    "NAV": 1986,
    "PIN": 1990,
    "RFR": 1989,
    "RID": 1990,
    "ROC": 1982,
    "RUE": 1982,
    "SCO": 1996,
    "SJR": 1992,
    "STA": 1982,
    "STE": 1982,
    "TPR": 1982,
    "USR": 1991,
    "VAL": 1986,
}


data_dir = os.path.join("..", "lstm", "data", "raw")
end_date = "2011-12-31"
all_data = {}

# Load every reservoir's raw comma-separated file and report any whose row
# count differs from the number of calendar days in its expected window.
for code, first_year in reservoir_info.items():
    raw_path = os.path.join(data_dir, f"{code}.dat")
    if os.path.exists(raw_path):
        values = np.loadtxt(raw_path, delimiter=',')
        expected_days = len(pd.date_range(start=f"{first_year}-01-01", end=end_date, freq='D'))
        day_gap = expected_days - len(values)
        if day_gap != 0:
            print(f"Warning: {code} data points ({len(values)}) do not match number of dates ({expected_days})!")
            if day_gap < 0:
                print(f"  {-day_gap} extra days of data.")
            else:
                print(f"  Missing {day_gap} days of data.")
        # Mismatched series are still kept so that they can be repaired later.
        # NOTE(review): 'perception' presumably means precipitation; the name is
        # kept as-is because later cells and the exported CSVs rely on it.
        all_data[code] = pd.DataFrame(
            values,
            columns=['temperature', 'perception', 'inflow_x', 'inflow_y'],
        )
    else:
        print(f"Warning: {raw_path} does not exist, skipping this reservoir.")

  Missing 2 days of data.
  Missing 2 days of data.


In [2]:
def load_daily_inflow(csv_path, inflow_column, valid_dates, pivot_year=2021):
    """Load one reservoir's daily inflow series, indexed by date.

    Parameters
    ----------
    csv_path : str
        CSV file holding a 'Date' column (format '%d-%b-%y') and the inflow column.
    inflow_column : str
        Name of the inflow column in that file; returned under the name 'Inflow'.
    valid_dates : pandas.DatetimeIndex
        Only rows whose date is contained in this index are kept.
    pivot_year : int, default 2021
        Two-digit years can be parsed into the wrong century; any date that
        lands after this year is moved back by 100 years.

    Returns
    -------
    pandas.DataFrame
        A frame indexed by 'Date' with a single 'Inflow' column.
    """
    frame = pd.read_csv(csv_path, usecols=['Date', inflow_column])
    # `usecols` does not honour the order of the requested names, so relabel
    # by name instead of assigning a positional list of column names.
    frame = frame.rename(columns={inflow_column: 'Inflow'})
    parsed = pd.to_datetime(frame['Date'], format='%d-%b-%y')
    frame['Date'] = parsed.apply(
        lambda x: x.replace(year=x.year - 100) if x.year > pivot_year else x
    )
    frame = frame.dropna(subset=['Date'])
    frame = frame[frame['Date'].isin(valid_dates)]
    return frame.set_index('Date')


# Reference window shared by the TPR and CRY records.
dates = pd.date_range(start="1982-01-01", end="2011-12-31", freq="D")
print(len(dates))

TPR = load_daily_inflow('data/unalign/TPR.csv', 'Inflow** (cfs)', dates)
print(len(TPR['Inflow']))

CRY = load_daily_inflow('data/unalign/CRY.csv', 'Inflow (cfs)', dates)
print(len(CRY['Inflow']))

10957
10957
10957


In [3]:
# Final 12 days of the reference (CSV) inflow series for TPR.
TPR['Inflow'].tail(12)

Date
2011-12-20    66
2011-12-21    74
2011-12-22    35
2011-12-23    74
2011-12-24    66
2011-12-25    52
2011-12-26    66
2011-12-27    67
2011-12-28    66
2011-12-29    74
2011-12-30    67
2011-12-31    59
Name: Inflow, dtype: int64

In [4]:
# Final 10 rows of the TPR series loaded from the raw file, cast to int so
# they can be compared by eye with the reference CSV values.
all_data['TPR']['inflow_x'].tail(10).astype(int)

10945    66
10946    74
10947    35
10948    74
10949    66
10950    52
10951    66
10952    67
10953    66
10954    74
Name: inflow_x, dtype: int64

In [5]:
# Final 12 days of the reference (CSV) inflow series for CRY.
CRY['Inflow'].tail(12)

Date
2011-12-20    1531
2011-12-21    1487
2011-12-22    1535
2011-12-23    1489
2011-12-24    1508
2011-12-25    1512
2011-12-26    1436
2011-12-27    1518
2011-12-28    1257
2011-12-29    1101
2011-12-30    1145
2011-12-31    1152
Name: Inflow, dtype: int64

In [6]:
# Final 10 rows of the CRY series loaded from the raw file, cast to int so
# they can be compared by eye with the reference CSV values.
all_data['CRY']['inflow_x'].tail(10).astype(int)

10945    1531
10946    1487
10947    1535
10948    1489
10949    1508
10950    1512
10951    1436
10952    1518
10953    1257
10954    1101
Name: inflow_x, dtype: int64

### Fill in the missing data for the unaligned reservoirs

In [7]:
# Daily temperature for CRY and TPR, indexed by date.
# `dates` is expected to be the 1982-01-01 .. 2011-12-31 daily range that was
# built when the inflow CSVs were loaded.
T = (
    pd.read_csv("data/unalign/Reservoir_T.csv", usecols=['Unnamed: 0', 'CRY', 'TPR'])
    # `usecols` does not honour the order of the requested names, so only the
    # unnamed date column is relabelled (by name); CRY/TPR keep their own labels.
    .rename(columns={'Unnamed: 0': 'Date'})
    # Two-digit years can be parsed into the wrong century; anything that
    # lands after 2021 is moved back by 100 years.
    .assign(
        Date=lambda df: pd.to_datetime(df['Date'], format='%m/%d/%y').apply(
            lambda x: x.replace(year=x.year - 100) if x.year > 2021 else x
        )
    )
    .dropna(subset=['Date'])
    .loc[lambda df: df['Date'].isin(dates)]
    .set_index('Date')
)
print(len(T))
print(T['CRY'][-2:])
print(T['TPR'][-2:])

10957
Date
2011-12-30    0.0635
2011-12-31    1.9130
Name: CRY, dtype: float64
Date
2011-12-30   -3.2025
2011-12-31   -1.2920
Name: TPR, dtype: float64


In [8]:
# Daily precipitation for CRY and TPR, indexed by date.
# `dates` is expected to be the 1982-01-01 .. 2011-12-31 daily range that was
# built when the inflow CSVs were loaded.
P = (
    pd.read_csv("data/unalign/Reservoir_P.csv", usecols=['Unnamed: 0', 'CRY', 'TPR'])
    # `usecols` does not honour the order of the requested names, so only the
    # unnamed date column is relabelled (by name); CRY/TPR keep their own labels.
    .rename(columns={'Unnamed: 0': 'Date'})
    # Two-digit years can be parsed into the wrong century; anything that
    # lands after 2021 is moved back by 100 years.
    .assign(
        Date=lambda df: pd.to_datetime(df['Date'], format='%m/%d/%y').apply(
            lambda x: x.replace(year=x.year - 100) if x.year > 2021 else x
        )
    )
    .dropna(subset=['Date'])
    .loc[lambda df: df['Date'].isin(dates)]
    .set_index('Date')
)
print(len(P))
print(P['CRY'][-2:])
print(P['TPR'][-2:])

10957
Date
2011-12-30    0.011
2011-12-31    0.000
Name: CRY, dtype: float64
Date
2011-12-30    0.0
2011-12-31    0.0
Name: TPR, dtype: float64


In [9]:
def append_missing_tail(aligned, inflow, temperature, precipitation):
    """Return `aligned` extended with the trailing days it is missing.

    The number of missing days is the difference between the length of the
    date-indexed reference `inflow` series and the length of `aligned`. The
    missing rows are assumed to be the final days of the reference series
    (the tail comparison earlier in the notebook shows exactly that offset).

    Parameters
    ----------
    aligned : pandas.DataFrame
        Frame with 'temperature', 'perception', 'inflow_x', 'inflow_y' columns.
    inflow, temperature, precipitation : pandas.Series
        Date-indexed reference series covering the complete window.

    Returns
    -------
    pandas.DataFrame
        `aligned` itself when nothing is missing, which makes re-running the
        cell harmless; otherwise a new frame with the tail rows appended and a
        fresh 0..n-1 index.
    """
    n_missing = len(inflow) - len(aligned)
    if n_missing <= 0:
        return aligned

    tail_dates = inflow.index[-n_missing:]
    tail = pd.DataFrame({
        # Match the climate values to the inflow dates by label, not position.
        'temperature': temperature.reindex(tail_dates),
        'perception': precipitation.reindex(tail_dates),
        # The reference files carry one inflow series; both columns reuse it.
        'inflow_x': inflow.reindex(tail_dates),
        'inflow_y': inflow.reindex(tail_dates),
    }).reset_index(drop=True)
    return pd.concat([aligned, tail], axis=0).reset_index(drop=True)


# Fill the trailing days that the raw CRY and TPR files lack.
for res_code, inflow_frame in (('CRY', CRY), ('TPR', TPR)):
    all_data[res_code] = append_missing_tail(
        all_data[res_code],
        inflow_frame['Inflow'],
        T[res_code],
        P[res_code],
    )

In [10]:
# Attach a daily 'date' column to every loaded reservoir.
end_date = "2011-12-31"

length_mismatches = []
for res, start_year in reservoir_info.items():
    if res not in all_data:
        # Reservoirs whose raw file was absent were skipped while loading,
        # so there is nothing to date here.
        print(f"Warning: no data loaded for {res}, skipping this reservoir.")
        continue

    data = all_data[res]
    # Use a dedicated name so the notebook-level `dates` range is not overwritten.
    res_dates = pd.date_range(start=f"{start_year}-01-01", end=end_date, freq='D')
    if len(data) != len(res_dates):
        print(f"Warning: {res} data points({len(data)}) does not match dates({len(res_dates)})!")
        if len(data) > len(res_dates):
            print(f"  {len(data) - len(res_dates)} extra days of data.")
        else:
            print(f"  Missing {len(res_dates) - len(data)} days of data.")
        # A date column of a different length cannot be attached; record the
        # reservoir and keep checking the others so all problems are reported.
        length_mismatches.append(res)
        continue

    all_data[res] = data.assign(date=res_dates)

if length_mismatches:
    raise ValueError(
        "Cannot attach dates; row count does not match the expected date range for: "
        + ", ".join(length_mismatches)
    )

In [11]:
# Inspect the CRY frame after the missing tail rows and the 'date' column were added.
all_data['CRY']

Unnamed: 0,temperature,perception,inflow_x,inflow_y,date
0,-7.5650,11.362,623.0,623.0,1982-01-01
1,-10.2920,5.321,741.0,741.0,1982-01-02
2,-11.0520,1.402,782.0,782.0,1982-01-03
3,-12.9620,0.279,857.0,857.0,1982-01-04
4,-8.6690,7.302,202.0,202.0,1982-01-05
...,...,...,...,...,...
10952,-4.2515,0.000,1518.0,1518.0,2011-12-27
10953,-3.8725,0.000,1257.0,1257.0,2011-12-28
10954,-2.9225,0.000,1101.0,1101.0,2011-12-29
10955,0.0635,0.011,1145.0,1145.0,2011-12-30


#### Store the aligned data in `data/align`

In [12]:
# Write one CSV per reservoir (without the positional index) into data/align.
output_dir = 'data/align'
os.makedirs(output_dir, exist_ok=True)

for reservoir, frame in all_data.items():
    frame.to_csv(f'{output_dir}/{reservoir}.csv', index=False)