# We start off by importing the data

In [None]:
import pandas as pd
import downloader as downloader
import plotly.graph_objects as go

# Reader options shared by every vaccination CSV: the first column becomes the
# index and is parsed as dates, which the date-based steps further down rely on.
csv_options = dict(sep=',', index_col=0, parse_dates=True)

# One DataFrame per entry of downloader.BL_KURZEL, keyed by that entry.
bl_df = {
    kurzel: pd.read_csv(f'data/vac_{kurzel}.csv', **csv_options)
    for kurzel in downloader.BL_KURZEL
}

# The rest of the notebook works on the 'BY' file only.
df_BY = pd.read_csv('data/vac_BY.csv', **csv_options)

# Route DataFrame.plot() through plotly.
pd.options.plotting.backend = "plotly"

## We'll have a look at the column names

In [None]:
# list(df.columns.values)

### Let's remove columns that don't interest us

In [None]:
# Per-indication columns (age, occupation, medical, care home) that are not
# needed for this analysis.
uninteresting = [
    'indikation_alter_dosen', 'indikation_alter_erst', 'indikation_alter_voll',
    'indikation_beruf_dosen', 'indikation_beruf_erst', 'indikation_beruf_voll',
    'indikation_medizinisch_dosen', 'indikation_medizinisch_erst', 'indikation_medizinisch_voll',
    'indikation_pflegeheim_dosen', 'indikation_pflegeheim_erst', 'indikation_pflegeheim_voll',
]

# Cumulative per-vaccine columns (BioNTech, Moderna, AstraZeneca), also unused.
different_vaccines = [
    'dosen_biontech_kumulativ', 'dosen_erst_biontech_kumulativ', 'dosen_voll_biontech_kumulativ',
    'dosen_moderna_kumulativ', 'dosen_erst_moderna_kumulativ', 'dosen_voll_moderna_kumulativ',
    'dosen_astrazeneca_kumulativ', 'dosen_erst_astrazeneca_kumulativ', 'dosen_voll_astrazeneca_kumulativ',
]

# Remove both groups; drop() raises KeyError if any listed column is absent.
df_BY = df_BY.drop(columns=uninteresting).drop(columns=different_vaccines)

# Show which columns are left.
list(df_BY.columns.values)


### Or actually let's just pick the few columns that interest us

In [None]:
# Keep only the columns used by the remaining cells.
columns_of_interest = ['publication_date', 'dosen_kumulativ', 'impf_inzidenz_dosen']
df_BY = df_BY[columns_of_interest]

# Let's fix the data

### We'll first fill in the missing rows: Saturdays seem to be missing!

In [None]:
# Show the first rows of the frame (head() defaults to five rows).
df_BY.head()

In [None]:
# Build a gap-free daily calendar from the first to the last date in the data.
idx = pd.date_range(start=df_BY.index.min(), end=df_BY.index.max())

# Dates missing from the data inherit the values of the closest earlier row
# (forward fill). The date index is then turned into an ordinary 'date' column,
# which is easier to work with than keeping the dates as the index.
df_BY = (
    df_BY.reindex(idx, method='ffill')
    .reset_index()
    .rename(columns={'index': 'date'})
)
#fill_value=0 
# convert column "a" to int64 dtype and "b" to complex type
# df = df.astype({'dosen_kumulativ': int})

In [None]:
# df['dosen_kumulativ'] = df['dosen_kumulativ'].fillna(method='ffill')
# df['impf_inzidenz_dosen'] = df['impf_inzidenz_dosen'].fillna(0)
# df.head(5)

### Let's fix errors in the data: there are two spikes that seem out of place

In [None]:
# Hand-picked corrections for the two out-of-place cumulative totals, given as
# {reported value: corrected value}. Matching is by value, not by row, so every
# occurrence of a reported value is replaced.
corrections = {384311: 343668, 1150129: 1186228}
df_BY['dosen_kumulativ'] = df_BY['dosen_kumulativ'].replace(corrections)
# df.dosen_kumulativ.replace(343668, 384311)

# df.at[29, 'dosen_kumulativ'] = 343668
# df.at['2020-03-03', 'dosen_kumulativ'] = 1186228




# Let's add some new useful columns

## New column: Day of the week


In [None]:
# Display the 'date' column (attribute-style column access).
df_BY.date


In [None]:
def week_day_string(weekday):
    """Return the three-letter English abbreviation for a weekday number.

    Numbers follow the Monday=0 ... Sunday=6 convention. Any value that does
    not compare equal to one of 0-6 yields 'other'.
    """
    names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    # Compare with == instead of indexing the tuple so that negative or
    # non-integer input falls through to 'other' rather than wrapping around
    # or raising.
    for number, name in enumerate(names):
        if weekday == number:
            return name
    return 'other'

def is_weekend(weekday):
    """Return True if the weekday number equals 5 or 6, otherwise False."""
    # A membership test compares with == and always yields a plain bool.
    return weekday in (5, 6)

# Derive calendar helper columns from the 'date' column.
df_BY['weekday'] = df_BY['date'].dt.dayofweek  # Monday=0 ... Sunday=6

# Element-wise Series.map gives the same result as the former row-wise
# DataFrame.apply(axis=1) without building a row object for every record.
df_BY['is_weekend'] = df_BY['weekday'].map(is_weekend)
df_BY['weekday_name'] = df_BY['weekday'].map(week_day_string)

# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week is its replacement. That accessor returns the nullable
# UInt32 dtype, so cast back to int64 to keep the dtype the old accessor gave.
# (The cast would fail on missing dates; 'date' is built from a complete
# date_range in this notebook, so none are expected.)
df_BY['calendar_week'] = df_BY['date'].dt.isocalendar().week.astype('int64')

In [None]:
# Show the last five rows to check the newly added columns.
df_BY.tail(5)

### Let's split the Sunday value, which describes the whole weekend, across Saturday and Sunday


In [None]:
# A Saturday whose cumulative total equals the previous day's total carries no
# doses of its own, so the following Sunday's increase covers both weekend
# days. Find those Saturdays, then move half of that increase onto Saturday.
doses = df_BY['dosen_kumulativ']
unchanged_since_previous_day = doses == doses.shift(1)  # first row compares to NaN -> False
to_modify = df_BY.index[(df_BY['weekday'] == 5) & unchanged_since_previous_day].tolist()

print(f'Saturdays with 0 {to_modify}')

for i in to_modify:
    # Relies on the consecutive integer index created by reset_index(): row
    # i + 1 is the Sunday following Saturday i. A Saturday in the very last row
    # has no Sunday yet, so say so explicitly instead of catching an IndexError.
    if i + 1 not in df_BY.index:
        print(f'skipping {i}: no following row to take the Sunday value from')
        continue

    saturday_value = df_BY.at[i, 'dosen_kumulativ']
    sunday_value = df_BY.at[i + 1, 'dosen_kumulativ']
    # int() truncates toward zero, so an odd difference leaves the extra dose
    # on Sunday.
    add = int((sunday_value - saturday_value) / 2)
    new_total = saturday_value + add
    df_BY.at[i, 'dosen_kumulativ'] = new_total
    print(f'modifying dosen_kumu of {i} to {new_total} using {sunday_value} and half diff which is {add}')


## New column: difference in total doses compared to the previous day

In [None]:
# Day-over-day change of the cumulative dose count. The first row has no
# predecessor, so its change is set to 0 before casting the column to int64.
df_BY['vortag_dosen_dif'] = (
    df_BY['dosen_kumulativ']
    .sub(df_BY['dosen_kumulativ'].shift(1))
    .fillna(0)
    .astype('int64')
)



## Let's take a look

In [None]:
# Show the last ten rows of the prepared frame.
df_BY.tail(10)

## Save to pickle

In [None]:
# Persist the prepared DataFrame as a pickle file and report where it went.
# NOTE: pickle files should only be loaded back from trusted sources.
path = 'data/vac-prep.pkl'
df_BY.to_pickle(path)
print(f'Saved pickle to {path}')