# We start off by importing the data

In [None]:
import math
import pandas as pd
import shared
# Display settings: restore the default precision and print floats in plain
# fixed-point notation. The full option names are spelled out because
# abbreviated patterns stop working as soon as pandas adds another option
# whose name contains the same substring.
pd.reset_option('display.precision')
pd.set_option('display.float_format', '{:f}'.format)
bl_kurzel = shared.bl_kurzel

# One DataFrame per key in bl_kurzel, read from data/vac_<kurzel>.csv.
dfs = {}
for kurzel in bl_kurzel:
    # It is important to use the first column as the index and to parse it
    # as dates, so that each frame is indexed by date.
    dfs[kurzel] = pd.read_csv(f'data/vac_{kurzel}.csv', sep=',', index_col=0, parse_dates=True)



### Let's pick the few columns that interest us

In [None]:
# Keep only the two columns used below; every other column is dropped.
for kurzel in bl_kurzel:
    dfs[kurzel] = dfs[kurzel][['publication_date', 'dosen_kumulativ']]

# Let's fix the data

### There are two spikes that are simply wrong data; let's fix them manually

In [None]:
# Two cumulative totals in the 'BY' frame are wrong; map each bad value to
# its manually chosen correction in a single replace call.
dfs['BY']['dosen_kumulativ'] = dfs['BY']['dosen_kumulativ'].replace(
    {384311: 343668, 1150129: 1186228}
)




### New column shots_today instead of total doses until now (dosen_kumulativ)

Each row should stand as its own entry: the value for a day should not depend on the previous day's value.
But that is exactly the case for the dosen_kumulativ column.
Instead we will add a new column 'shots_today' and later rebuild the running total from it as 'shots_sum'.

In [None]:
# Show the first five rows of the 'BY' frame as it looks at this point.
dfs['BY'].head(5)

In [None]:
def add_dif_column(df):
    """Add a 'shots_today' column holding the row-to-row increase of 'dosen_kumulativ'.

    The first row has no predecessor, so its difference is set to 0.
    The column is written onto ``df`` itself; the returned frame is the
    result of ``astype`` with 'shots_today' cast to int64.
    """
    per_row_increase = df['dosen_kumulativ'].diff()
    df['shots_today'] = per_row_increase.fillna(0)
    return df.astype({'shots_today': 'int64'})

# Replace each frame with the copy returned by add_dif_column, which has the
# integer 'shots_today' column.
for kurzel in bl_kurzel:
    dfs[kurzel] = add_dif_column(dfs[kurzel])

### Add missing rows with empty values

In [None]:
def fix_missing_days(df, start='2020-12-26'):
    """Return a frame with one row per calendar day from ``start`` to the last date in ``df``.

    Days that are absent from ``df`` are inserted with NaN in every column.
    The date index is then turned into a regular 'date' column and the rows
    get a fresh 0..n-1 index.

    Args:
        df: Frame indexed by date with 'dosen_kumulativ' and 'shots_today'
            columns.
        start: First day of the resulting range. Defaults to '2020-12-26',
            the value that used to be hard-coded.

    Returns:
        A new frame; ``df`` itself is not modified.

    Note:
        Row 0 (the ``start`` day) is forced to zero doses, and row 1 takes
        its 'shots_today' from its own cumulative value. This assumes the
        first reported day is the day after ``start``; if row 1 has no
        data, its 'shots_today' stays NaN.
    """
    # Fill in the dates.
    full_range = pd.date_range(start=start, end=df.index.max())
    df = df.reindex(full_range)

    # Create a new index column because that is much easier than having the
    # date column be the index.
    df = df.reset_index()

    df.at[0, 'dosen_kumulativ'] = 0
    df.at[0, 'shots_today'] = 0
    # Everything administered up to row 1 is counted as that day's shots.
    df.at[1, 'shots_today'] = df.at[1, 'dosen_kumulativ']

    return df.rename(columns={'index': 'date'})

# Replace each frame with its gap-free version returned by fix_missing_days.
for kurzel in bl_kurzel:
    dfs[kurzel] = fix_missing_days(dfs[kurzel])


### Fix NaN values in shots_today

In [None]:
def fix_NaN_dosen(df):
    """Spread each reported 'shots_today' value evenly over the NaN rows before it.

    A run of k NaN rows followed by a row with value v becomes k + 1 rows
    that each hold v / (k + 1), so the total over the run is unchanged.
    Rows that are not preceded by NaN keep their value.

    The 'shots_today' column of ``df`` is updated in place and nothing is
    returned. The computation is purely positional, so it does not depend
    on the frame having a 0..n-1 index. NaN rows at the very end of the
    frame have no following value to draw from and are left as NaN.
    """
    shots = df['shots_today']
    missing = shots.isna()
    if not missing.any():
        # Nothing to distribute; leave the column (and its dtype) alone.
        return

    # Counting reported rows from the bottom up gives every reported row
    # and the NaN run directly above it one shared block number.
    block_id = (~missing)[::-1].cumsum()[::-1]
    block_size = block_id.map(block_id.value_counts())

    # Pull each reported value up over its NaN run, then split it evenly.
    df['shots_today'] = shots.bfill() / block_size

# fix_NaN_dosen edits each frame in place, so its return value is not used.
for kurzel in bl_kurzel:
    fix_NaN_dosen(dfs[kurzel])


### New shots_sum column

In [None]:
def add_shots_sum(df):
    """Add a 'shots_sum' column with the rounded running total of 'shots_today'.

    The running total is rounded before the cast because 'shots_today' may
    hold fractional values. The column is stored as int64, matching the
    explicit dtype used for 'shots_today' and avoiding the
    platform-dependent width of plain ``int``.

    ``df`` is modified in place and also returned.
    """
    running_total = df['shots_today'].cumsum().round()
    df['shots_sum'] = running_total.astype('int64')
    return df

# add_shots_sum returns the same frame it was given, with 'shots_sum' added.
for kurzel in bl_kurzel:
    dfs[kurzel] = add_shots_sum(dfs[kurzel])

In [None]:
# Show the first six rows of the 'BY' frame to check the new columns.
dfs['BY'].head(6)

### New column: Day of the week

In [None]:
# Pass each frame through shared.add_weekday_stuff together with the name of
# its date column. Presumably this adds weekday-related columns derived from
# 'date'; confirm against the helper in the shared module.
for kurzel in bl_kurzel:
    dfs[kurzel] = shared.add_weekday_stuff(dfs[kurzel], 'date')

## Let's take a look

In [None]:
# Show the first fifteen rows of the finished 'BY' frame.
dfs['BY'].head(15)

In [None]:
# Show the last ten rows of the finished 'BY' frame.
dfs['BY'].tail(10)

## Save to pickle

In [None]:
from datetime import datetime
# Print the current local time (presumably to record when the export ran).
print(f"{datetime.now()}")

# Write each frame to data/df_vac_<kurzel>.pkl and report the path.
for kurzel in bl_kurzel:
    path = f'data/df_vac_{kurzel}.pkl'
    dfs[kurzel].to_pickle(path)
    print(f'Saved pickle to {path}')
