In [None]:
import pandas as pd
import xarray as xr
import numpy as np

import re
import datetime
import time

import os
from collections import OrderedDict

import matplotlib.pyplot as plt
import matplotlib

from sys import getsizeof,path
path.append("../src")
from Utility import DateFrom8digitName,RKI_AG_to_int

In [None]:
def NRW_ParseAgeGroups(fn):
    """Parse a tab-separated NRW age-group table into an xarray DataArray.

    The file is read with the second line as header, "." as thousands
    separator, "," as decimal separator and "-" as missing value. Only the
    case/death count columns are kept; they are renamed to "total", "male"
    and "female".

    Parameters
    ----------
    fn : str
        Path of the file to read.

    Returns
    -------
    xarray.DataArray
        Result of ``DataFrame.to_xarray().to_array()``: one entry per
        renamed column along the "variable" dimension, indexed by "age".
    """
    df = pd.read_csv(fn,sep="\t",header=[1],thousands='.',decimal=",",na_values="-")

    # pandas de-duplicates repeated header names by appending ".1", ".2".
    df = df.filter(["Fälle","Fälle.1","Fälle.2","Todesfälle","Todesfälle.1","Todesfälle.2"])
    # Missing values count as zero. The last two rows are dropped
    # (presumably summary/footer rows -- TODO confirm against the raw files).
    df = df.fillna(0).iloc[:-2]
    df = df.astype("int64")
    df.index.name="age"
    df.index = df.index.map(RKI_AG_to_int)

    # Fixed: the key for the death totals was misspelled "Todefälle", so that
    # column was never renamed to "total".
    f = {"Fälle.1":"male","Fälle.2":"female","Todesfälle.1":"male","Todesfälle.2":"female","Fälle":"total","Todesfälle":"total"}
    df = df.rename(columns=f)

    return df.to_xarray().to_array()

def NRW_ParseLargeFile(filename,published):
    """Parse one NRW "Covid19" CSV into a plain and an age/sex-resolved array.

    Parameters
    ----------
    filename : str
        Path of the CSV file. The column at position 1 becomes the
        date-parsed index.
    published :
        Value stored as the single entry of the "published" coordinate.

    Returns
    -------
    (no_age, with_age) : tuple of xarray.DataArray
        ``no_age`` has dims (date, published, status).
        ``with_age`` has dims (date, published, status, age, sex) and covers
        the statuses "anzahlM", "hospitalisiert" and "verstorben".

    Notes
    -----
    If the file has exactly 15 rows and a companion file
    ``<a>_<b>_zeitreihe_<c>`` exists next to it, ``no_age`` is built from the
    companion file instead (presumably because new-style files only carry a
    short window with age/sex columns -- TODO confirm).
    """
    df = pd.read_csv(filename,index_col=1,parse_dates=True)

    stat = ["anzahlM","hospitalisiert","verstorben","anzahl","anzahlE","anzahlEM","verstorbenE","verstorbenM"]
    ages = [x*10 for x in range(10)]
    sexes = ["M","W"]

    # rename: every column of interest (used as a filter list, order matters).
    # restructure_age: column name -> (status, age, sex) position in with_age.
    # restructure: column name -> status position in no_age.
    rename = OrderedDict()
    restructure_age,restructure = OrderedDict(),OrderedDict()
    for i,status in enumerate(stat):
        for n,sex in enumerate(sexes):
            for j,age in enumerate(ages):
                name = status+"A%02d"%age+sex
                rename[name] = name
                if status in ["anzahlM","hospitalisiert","verstorben"]:
                    restructure_age[name] = (i,j,n,)
        rename[status] = status
        # "anzahl" is deliberately not copied; its slot in no_age stays zero.
        if status not in ["anzahl"]:
            restructure[status] = i
    df = df.filter(rename.keys())

    length = len(df)
    z = np.zeros((length,1,3,10,2,),dtype="int32")
    for k,v in restructure_age.items():
        i,j,n = v
        z[:,0,i,j,n] = df[k].values
    with_age = xr.DataArray(z,dims=["date","published","status","age","sex"],coords={"date":df.index.values,"published":[published],"status":stat[:3],"age":ages,"sex":sexes})

    # Open new style without age/sex instead.
    # The companion file is only relevant for 15-row files, so it is only
    # looked up in that case.
    df2 = None
    if length == 15:
        # Split the base name only, so underscores in directory names do not
        # corrupt the companion path.
        directory,basename = os.path.split(filename)
        fnc = basename.split("_")
        try:
            series_name = fnc[0]+"_"+fnc[1]+"_"+"zeitreihe"+"_"+fnc[2]
            df2 = pd.read_csv(os.path.join(directory,series_name),index_col=1,parse_dates=True)
        except (OSError,IndexError,ValueError):
            # Best effort: a missing or unreadable companion file, or an
            # unexpected file name, falls back to the main file.
            # pandas parser errors derive from ValueError.
            df2 = None

    if df2 is not None:
        df = df2.filter(rename.keys())

        length = len(df)
        stat = [x for x in df.columns]
        y = np.zeros((length,1,len(stat),),dtype="int32")
        for i,k in enumerate(stat):
            y[:,0,i] = df[k].values
    else:
        y = np.zeros((length,1,len(stat),),dtype="int32")
        for k,i in restructure.items():
            y[:,0,i] = df[k].values
    no_age = xr.DataArray(y,dims=["date","published","status"],coords={"date":df.index.values,"published":[published],"status":stat})

    return no_age,with_age

def NRW_ParseEarly_Deaths(filename,published):
    """Read a two-column, tab-separated deaths table into a DataArray.

    The two columns are relabelled "date" and "verstorben". Dates are parsed
    as day.month.year, "-" is treated as missing and missing counts become 0.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    published :
        Value stored as the single entry of the "published" coordinate.

    Returns
    -------
    xarray.DataArray
        int32 array with dims (date, published, status) and the single
        status "verstorben".
    """
    table = pd.read_csv(filename,parse_dates=True,sep="\t",na_values=["-"])
    table.columns = ["date","verstorben"]
    table.index = pd.to_datetime(table["date"],format="%d.%m.%Y")
    table["verstorben"] = table["verstorben"].fillna(0).astype("int32")
    table = table.drop(columns="date")

    # Shape the count column as (date, published=1, status=1).
    counts = table["verstorben"].to_numpy(dtype="int32",copy=True).reshape(-1,1,1)

    return xr.DataArray(
        counts,
        dims=["date","published","status"],
        coords={"date":table.index,"published":[published],"status":["verstorben"]},
    )

def ImportNRW(d="../Data/Bundeslaender/NRW"):
    """Load all NRW files from a directory and combine them by publication.

    Files whose name contains "NRW" are visited in reverse sorted order.
    ``DateFrom8digitName`` supplies the publication date; files for which it
    returns None are skipped. The second and third "_"-separated parts of the
    file name select the parser.

    Parameters
    ----------
    d : str
        Directory containing the NRW files.

    Returns
    -------
    dict
        "timeseries": concatenation of the no-age arrays along "published".
        "timeseries_age": concatenation of the age/sex arrays.
        "timeseries_deaths": the "verstorben" status from the no-age arrays
        followed by the early death tables, along "published".
        Gaps produced by the concatenation are filled with 0.

    Raises
    ------
    ValueError
        If no "Covid19" time-series file was found in ``d``.
    """
    files = sorted(os.listdir(d))

    timeseries = OrderedDict()
    timeseries_age = OrderedDict()
    early_deaths = OrderedDict()

    for fn in [x for x in files if "NRW" in x][::-1]:
        dfn = d+"/"+fn
        dt = DateFrom8digitName(dfn)
        if dt is None:
            continue

        parts = fn.split("_")
        if len(parts) < 3:
            # Not enough name parts to classify the file; skip it rather
            # than fail on tuple unpacking.
            continue
        tid,fid = parts[1:3]

        if fid == "Cases7dSum":
            continue
        if fid in ("CasesAGs","VerstorbeneAGs"):
            # NOTE(review): the parsed result is currently not stored.
            NRW_ParseAgeGroups(dfn)
        elif fid == "Verstorbene":
            early_deaths[dt] = NRW_ParseEarly_Deaths(dfn,dt)
        elif tid == "Covid19" and len(fn) == 24 and "zeitreihe" not in fn:
            # Length 24 matches names of the form NRW_Covid19_YYYYMMDD.csv.
            no_age,with_age = NRW_ParseLargeFile(dfn,dt)
            timeseries[dt] = no_age
            timeseries_age[dt] = with_age

    if not timeseries:
        raise ValueError("no NRW Covid19 time-series files found in %r"%(d,))

    ts = xr.concat(timeseries.values(),dim="published",fill_value=0)
    ts_age = xr.concat(timeseries_age.values(),dim="published",fill_value=0)

    deaths = [x.sel(status="verstorben") for x in timeseries.values()]
    deaths += [x.sel(status="verstorben") for x in early_deaths.values()]
    ts_deaths = xr.concat(deaths,dim="published",fill_value=0)

    return {"timeseries":ts,"timeseries_age":ts_age,"timeseries_deaths":ts_deaths}
        
# Run the full NRW import once and print the elapsed wall-clock seconds.
t0 = time.time()
data = ImportNRW()
t1 = time.time()
print(f"{t1-t0:.3f}")


In [None]:
#data["timeseries_age"].sum(["age","sex"]).sel(published=datetime.datetime(2021,9,2),status="verstorben")

#

In [None]:
# Deaths without age breakdown: one curve per selected publication date.
colors = ['#ccebc5','#a8ddb5','#7bccc4','#43a2ca','#0868ac','#000000']
alphas = [1,1,1,1,.8,.5]
dates = [
    datetime.datetime(2021,month,day)
    for month,day in ((1,29),(3,1),(3,24),(4,15),(5,1),(9,3))
]

deaths_by_publication = data["timeseries_deaths"]
for d,c,a in zip(dates,colors,alphas):
    plt.plot(deaths_by_publication.sel(published=d),color=c,alpha=a)


In [None]:
# Deaths for selected reporting dates: one curve per date.
colors = ['#fcc5c0','#fa9fb5','#f768a1','#c51b8a','#7a0177',"#000000"]
dates = [
    datetime.datetime(2021,month,day)
    for month,day in ((1,29),(2,15),(3,1),(3,24),(4,15),(5,1))
]
# alphas is reused from the preceding plotting cell.
for d,c,a in zip(dates,colors,alphas):
    deaths_on_date = data["timeseries_deaths"].sel(date=d)
    # Plot in reversed order along the leading dimension.
    plt.plot(deaths_on_date[::-1],color=c,alpha=a,label=d)
plt.legend(loc="upper right")

In [None]:
# Hospitalisations per age group (summed over sex): one subplot per age
# group, one line per selected publication date.
hospi_age = data["timeseries_age"].sel(status="hospitalisiert").sum(["sex"])

dates = [datetime.datetime(2021,3,25),datetime.datetime(2021,5,12),datetime.datetime(2021,8,1),datetime.datetime(2021,8,15),datetime.datetime(2021,9,1),datetime.datetime(2021,9,3)]

# Named date_axis rather than `time`: assigning to `time` would overwrite the
# imported `time` module and break re-running the timing cell.
date_axis,ages = hospi_age.coords["date"],hospi_age.coords["age"]
fig = plt.figure(figsize=(8,12), dpi= 100, facecolor='w', edgecolor='k')
axs = fig.subplots(len(ages))

# Only used by the commented-out stacked-bar variant below.
base = np.zeros((len(ages),len(dates),len(date_axis),))
for i,age in enumerate(ages):
    for j,d in enumerate(dates):
        axs[i].plot(date_axis,hospi_age.sel(published=d,age=age))
        #y = hospi_age.sel(published=d,age=age)
        #axs[i].bar(date_axis,y,bottom=base[i,j])
        #base[i,j] += y

#plt.plot(hospi_sum.sel(published=datetime.datetime(2021,3,25)))
#plt.plot()

In [None]:
# complete an incomplete time-series by publication
# hospi_sum was not defined anywhere in the notebook, so this cell failed on a
# fresh kernel.
# NOTE(review): defined here as hospitalisations summed over age and sex --
# confirm this is the intended quantity.
hospi_sum = data["timeseries_age"].sel(status="hospitalisiert").sum(["age","sex"])

# Fixed: the last entry was '<#000000', which is not a valid matplotlib color.
colors = ['#ccece6','#99d8c9','#66c2a4','#2ca25f','#006d2c','#000000']
dates = [datetime.datetime(2021,2,15),datetime.datetime(2021,3,1),datetime.datetime(2021,3,15),datetime.datetime(2021,3,24),datetime.datetime(2021,4,1),datetime.datetime(2021,4,15),datetime.datetime(2021,5,1),datetime.datetime(2021,5,15),datetime.datetime(2021,6,1),datetime.datetime(2021,6,15),]
print(len(dates))
# Ten dates need ten colors: the palette is extended with colors[1:5].
for d,c in zip(dates,colors+colors[1:5]):
    y = hospi_sum.sel(date=d)
    # Dates after 2021-04-23 are drawn dashed.
    m = "-"
    if d > datetime.datetime(2021,4,23):
        m = "--"
    plt.plot(y[::-1],m,color=c)