In [1]:
import pandas as pd
import numpy as np
import os
import geopandas
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:
measurements_dir = 'data/measurements/'
input_data_dir = os.path.join(measurements_dir, 'input_data.json')
input_data_manual = os.path.join(measurements_dir, "GA_5513_1739_2022_quantitative_status_monitoring_manual.json")
input_data_automatic = os.path.join(measurements_dir, "GA_5513_1739_2022_quantitative_status_monitoring_automatic.json")

In [3]:
def prepare_gdf(gdf):
    df = pd.DataFrame(gdf.drop(columns='geometry'))
    df = df.reset_index()
    df = df.set_index(["date", "name"])
    df = df.drop(columns=["index"])
    df = df.drop(columns=["value", "height"])
    df = df.rename({"normalized_value": "value" }, axis=1)
    return df

In [4]:
manual_gdf = geopandas.read_file(input_data_manual)
automatic_gdf = geopandas.read_file(input_data_automatic)

manual_df = prepare_gdf(manual_gdf)
automatic_df = prepare_gdf(automatic_gdf)

manual_df["value"] = manual_df["value"].fillna(automatic_df["value"])
df = manual_df.reset_index()
df = df.set_index(["date"])
df = df.pivot(columns='name', values='value')

In [5]:
df.isna().sum()

name
II/112/1        3
II/113/1        0
II/114/1        0
II/115/1     1062
II/116/1     1061
II/131/1       22
II/132/1        0
II/1345/1     159
II/1346/1     159
II/1351/1     232
II/1352/1     209
II/292/1        0
II/297/1        2
II/298/1        0
II/472/1      339
II/922/1      852
II/924/1        0
II/931/1        6
II/932/1     1062
II/936/1     1022
II/940/1        0
II/949/1      227
II/951/1      209
II/952/1      209
II/957/1      647
dtype: int64

In [10]:
cols = df.columns.values

In [7]:
station = "II/297/1"

In [15]:
def extract_data_for_station(df, station):
    df2 = df[station]
    df2.index = pd.to_datetime(df2.index)
    df2 = df2.reset_index()

    date = '2002-01-01'
    date = datetime.strptime(date, '%Y-%m-%d')
    end_date = '2021-12-01'
    end_date = datetime.strptime(end_date, '%Y-%m-%d')

    output_df = pd.DataFrame(columns = ["date", "value"])

    while date<end_date:
        idx = df2.date.searchsorted(date)
        s_row = df2.iloc[idx]
        value = 0
        if s_row.date != date:
            if s_row.date < date:
                value = (s_row[station] + df2.iloc[idx+1][station])/2.0
            else:
                value = (s_row[station] + df2.iloc[idx-1][station])/2.0
        else:
            value = s_row[station]
        output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
        date = date + relativedelta(months=1)
    output_df = output_df.sort_values(by="date", ignore_index=True)
    output_df = output_df.set_index("date")
    return output_df

In [12]:
def sanitize_station_name(station):
    return station.replace("/", "_")

In [17]:
def extract_all_stations(df, stations):
    for station in stations:
        out_df = extract_data_for_station(df, station)
        pd.to_pickle(out_df, "data/measurements/" + sanitize_station_name(station) + ".pickle")

In [19]:
extract_all_stations(df, cols)

  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": va

In [12]:
output_df = extract_data_for_station(df, station)

  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": value}), ignore_index=True)
  output_df = output_df.append(pd.Series({"date": date.strftime('%Y-%m-%d'), "value": va

In [13]:
output_df

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
2002-01-01,279.595
2002-02-01,280.040
2002-03-01,280.115
2002-04-01,279.850
2002-05-01,279.590
...,...
2021-07-01,279.080
2021-08-01,278.885
2021-09-01,279.275
2021-10-01,279.350


In [18]:
pd.to_pickle(output_df, "data/measurements/" + sanitize_station_name(station) + ".pickle")