# We start off by importing the data

In [None]:
import pandas as pd
# Load the CSV. The first column becomes the index and is parsed as dates,
# which the date-range reindexing further down relies on.
df = pd.read_csv(
    'data/vac.csv',
    sep=',',
    index_col=0,
    parse_dates=True,
)



## We'll have a look at the column names

In [None]:
# list(df.columns.values)

### Let's remove columns that don't interest us

In [None]:
# The `indikation_*` breakdown columns.
uninteresting = [
    'indikation_alter_dosen',
    'indikation_alter_erst',
    'indikation_alter_voll',
    'indikation_beruf_dosen',
    'indikation_beruf_erst',
    'indikation_beruf_voll',
    'indikation_medizinisch_dosen',
    'indikation_medizinisch_erst',
    'indikation_medizinisch_voll',
    'indikation_pflegeheim_dosen',
    'indikation_pflegeheim_erst',
    'indikation_pflegeheim_voll',
]

# The per-vaccine cumulative dose columns.
different_vaccines = [
    'dosen_biontech_kumulativ',
    'dosen_erst_biontech_kumulativ',
    'dosen_voll_biontech_kumulativ',
    'dosen_moderna_kumulativ',
    'dosen_erst_moderna_kumulativ',
    'dosen_voll_moderna_kumulativ',
    'dosen_astrazeneca_kumulativ',
    'dosen_erst_astrazeneca_kumulativ',
    'dosen_voll_astrazeneca_kumulativ',
]

# Drop both groups, then show which columns are left.
df = df.drop(columns=uninteresting).drop(columns=different_vaccines)
list(df.columns.values)


### Or actually let's just pick the few columns that interest us

In [None]:
# Restrict the frame to the three columns listed here.
columns_of_interest = ['publication_date', 'dosen_kumulativ', 'impf_inzidenz_dosen']
df = df[columns_of_interest]

# Let's fix the data

### We'll first fill in the missing rows; Saturdays seem to be missing!

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Build a gap-free daily range spanning the first to the last date present.
idx = pd.date_range(start=df.index.min(), end=df.index.max())

# Reindex onto that range, forward-filling rows for dates that were absent,
# then move the dates out of the index into a regular `date` column.
df = (
    df.reindex(idx, method='ffill')
    .reset_index()
    .rename(columns={'index': 'date'})
)

In [None]:
# df['dosen_kumulativ'] = df['dosen_kumulativ'].fillna(method='ffill')
# df['impf_inzidenz_dosen'] = df['impf_inzidenz_dosen'].fillna(0)
# df.head(5)

### Let's fix errors in the data. There are two spikes that just seem out of place.

In [None]:
# Map each out-of-place cumulative total to the value that replaces it.
# Replacement is by value, so every row holding one of these totals changes.
corrections = {384311: 343668, 1150129: 1186228}
df['dosen_kumulativ'] = df['dosen_kumulativ'].replace(corrections)




# Let's add some new useful columns

## New column: Day of the week


In [None]:
# Show the `date` column.
df['date']


In [None]:
def week_day_string(weekday):
    """Return the three-letter English abbreviation for a weekday number.

    Args:
        weekday: Day-of-week number, where 0 is Monday and 6 is Sunday.

    Returns:
        One of 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', or 'other'
        when ``weekday`` equals none of the numbers 0 through 6.
    """
    abbreviations = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    for number, abbreviation in enumerate(abbreviations):
        if weekday == number:
            return abbreviation
    return 'other'

def is_weekend(weekday):
    """Tell whether a weekday number falls on the weekend.

    Args:
        weekday: Day-of-week number; 5 and 6 count as the weekend.

    Returns:
        True when ``weekday`` equals 5 or 6, otherwise False.
    """
    return weekday in (5, 6)

# Derive calendar features from the `date` column.
df['weekday'] = df['date'].dt.dayofweek  # 0 = Monday ... 6 = Sunday

# Map element-wise over the weekday column instead of a row-wise
# `df.apply(axis=1)`, which builds a full row object for every row.
df['is_weekend'] = df['weekday'].map(is_weekend)
df['weekday_name'] = df['weekday'].map(week_day_string)

# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
# `dt.isocalendar().week` gives the same ISO week number; it comes back as
# UInt32, so cast to int64 to keep the dtype the old accessor produced.
df['calendar_week'] = df['date'].dt.isocalendar().week.astype('int64')

In [None]:
# Inspect the last 25 rows, including the newly added columns.
df.tail(n=25)

### Let's fix the Sunday value describing the whole weekend


In [None]:
# Find Saturdays (weekday 5) whose cumulative total equals the previous
# row's total, i.e. Saturdays that recorded no increase of their own.
# The first row has no predecessor (shift gives NaN) and is never selected.
is_saturday = df['weekday'] == 5
same_as_previous_row = df['dosen_kumulativ'] == df['dosen_kumulativ'].shift(1)
to_modify = list(df.index[is_saturday & same_as_previous_row])

print(f'Saturdays with 0 {to_modify}')

# Give each such Saturday half of the increase reported on the following
# row, so the weekend's doses are split across both days. This relies on
# the frame having a consecutive integer index, so that `i + 1` is the next
# day's row.
for i in to_modify:
    if i + 1 not in df.index:
        # A trailing Saturday has no following row to take the total from.
        print(f'skipping {i}: there is no following row to take the weekend total from')
        continue

    saturday_value = df.loc[i, 'dosen_kumulativ']
    sunday_value = df.loc[i + 1, 'dosen_kumulativ']
    # int() truncates toward zero, so the Saturday share is rounded down.
    add = int((sunday_value - saturday_value) / 2)
    new_total = saturday_value + add
    df.loc[i, 'dosen_kumulativ'] = new_total
    print(f'modifying dosen_kumu of {i} to {new_total} using {sunday_value} and half diff which is {add}')


## New column: Difference in total doses

In [None]:
# Day-over-day change of the cumulative total. The first row has no
# predecessor, so its difference is set to 0 before casting to integers.
previous_total = df['dosen_kumulativ'].shift(1)
daily_change = (df['dosen_kumulativ'] - previous_total).fillna(0)
df['vortag_dosen_dif'] = daily_change.astype('int64')



## Let's take a look

In [None]:
# Show the two most recent rows.
df.tail(n=2)

In [None]:
# Cumulative dose total of the last row, kept as a one-element Series.
# Use `.values[0]` on it to get the bare number.
current_official_doses = df['dosen_kumulativ'].tail(1)
current_official_doses

In [None]:
# Line chart of the cumulative total over time.
cumulative_by_date = df.set_index('date')['dosen_kumulativ']
cumulative_by_date.plot.line(
    figsize=(18, 8),
    color='cornflowerblue',
    linewidth=3,
    title='Impfungen Bayern',
)


In [None]:
# Largest and smallest day-over-day change.
daily_diff = df['vortag_dosen_dif']
print(daily_diff.max())
print(daily_diff.min())


In [None]:
# Mean day-over-day change across the last three rows, truncated to an int.
last_three_days = df['vortag_dosen_dif'].tail(3)
int(last_three_days.values.mean())

In [None]:
# 7-row rolling mean of the day-over-day change; the first six rows are NaN.
df['rolling_vortag_dosen_dif'] = df['vortag_dosen_dif'].rolling(7).mean()

# Plot the raw daily change as a thin line and the rolling mean as a thick
# one. The original passed `x='date'`, but pandas only uses `x` when plotting
# a DataFrame; for these Series the date index already supplies the x axis.
by_date = df.set_index('date')
by_date['vortag_dosen_dif'].plot.line(
    figsize=(18, 8),
    color='silver',
    linewidth=0.8,
)
by_date['rolling_vortag_dosen_dif'].plot.line(
    figsize=(18, 8),
    color='cornflowerblue',
    linewidth=7,
    title='Rolling Impfdosen differenz zum Vortag - Bayern',
)


## Vaccinations per weekday

In [None]:
def avg_vacs_per_weekday_last_x_weeks(x):
    """Draw a bar chart of the mean daily dose change per weekday.

    Reads the module-level ``df``, takes its last ``7 * x`` rows, averages
    ``vortag_dosen_dif`` per ``weekday_name`` and plots the means as bars
    sorted in ascending order.

    Args:
        x: Number of trailing weeks to include (``7 * x`` rows are used).
    """
    recent_rows = df.tail(7 * x)
    mean_per_weekday = (
        recent_rows.groupby(["weekday_name"])['vortag_dosen_dif']
        .mean()
        .sort_values()
    )
    # The original passed `x='weekday_name'`, but pandas only uses `x` when
    # plotting a DataFrame; the grouped Series index already labels the bars.
    mean_per_weekday.plot.bar(
        figsize=(10, 6),
        color='seagreen',
        title=f'Average vaccinations per weekday (last {x} weeks)',
    )


In [None]:
# Weekday averages over the last 30 weeks.
avg_vacs_per_weekday_last_x_weeks(x=30)

In [None]:
# Weekday averages over the last 5 weeks.
avg_vacs_per_weekday_last_x_weeks(x=5)

## Vaccinations per calendar week

In [None]:
# Leave out the first 8 rows before grouping by calendar week.
# NOTE(review): the previous comment said `tail(length - 5)` removes calendar
# week 53, but the code has always used 8; confirm which offset is intended.
rows_after_first_eight = df.tail(len(df.index) - 8)
weekly_totals = rows_after_first_eight.groupby(["calendar_week"])['vortag_dosen_dif'].sum()
# The original passed `x='weekday_name'`, but pandas only uses `x` when
# plotting a DataFrame; the calendar-week index already labels the bars.
weekly_totals.plot.bar(
    figsize=(10, 6),
    color='darksalmon',
    title='Total vaccinations per calendar week',
)


In [None]:
# Leave out the first 8 rows before grouping by calendar week.
# NOTE(review): the previous comment said `tail(length - 5)` removes calendar
# week 53, but the code has always used 8; confirm which offset is intended.
rows_after_first_eight = df.tail(len(df.index) - 8)
weekly_means = rows_after_first_eight.groupby(["calendar_week"])['vortag_dosen_dif'].mean()
# The original passed `x='weekday_name'`, but pandas only uses `x` when
# plotting a DataFrame; the calendar-week index already labels the bars.
weekly_means.plot.bar(
    figsize=(10, 6),
    color='darksalmon',
    title='Average vaccinations per calendar week',
)
