# One run full walkthrough

In [1]:
## check some parameters
## depending where you launch your notebook, the relative path might not work
## you should start the notebook server from your base path
## when opening the notebook, typically your path will be ../ads_covid-19/notebooks
import os
# Step one level up when the kernel was started inside the notebooks folder,
# so that the relative data/ paths used by the cells below resolve.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("../")

'Your base path is at: ' + os.path.basename(os.getcwd())

'Your base path is at: ads_covid-19'

## 1 Update all data

In [2]:
# %load src/data/get_data.py

import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

# def get_johns_hopkins():
#     ''' Get data by a git pull request, the source code has to be pulled first
#         Result is stored in the predefined csv structure
#     '''
#     git_pull = subprocess.Popen( "/usr/bin/git pull" ,
#                          cwd = os.path.dirname( 'data/raw/COVID-19/' ),
#                          shell = True,
#                          stdout = subprocess.PIPE,
#                          stderr = subprocess.PIPE )
#     (out, error) = git_pull.communicate()


#     print("Error : " + str(error))
#     print("out : " + str(out))


def get_current_data_germany():
    ''' Download the current RKI figures for the German regions (Landkreise)

        The ArcGIS feature service is queried once for all features. The
        'attributes' dict of every returned feature becomes one row of a data
        frame, which is written to data/raw/NPGEO/GER_state_data.csv
        (separator ';'). Attention: the API endpoint is not too stable.

        Returns:
        ----------
        None
            the result is only stored on disk, the number of rows is printed

        Raises:
        ----------
        requests.exceptions.RequestException
            if the request times out or the server answers with an HTTP error
        KeyError
            if the answer does not contain a 'features' entry
    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    # the timeout keeps the notebook from hanging forever on an unresponsive endpoint
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                      timeout=60)
    # fail early with a clear HTTP error instead of a confusing parse error further down
    data.raise_for_status()

    json_object=data.json()
    full_list=[each_dict['attributes'] for each_dict in json_object['features']]

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('data/raw/NPGEO/GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

# Entry point of the data-update cell: only the German regional data is fetched,
# the Johns Hopkins git pull is switched off (its function is commented out above).
if __name__ == '__main__':
#     get_johns_hopkins()
    get_current_data_germany()


 Number of regions rows: 412


## 2. Process pipeline 

In [3]:
# %load src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    ''' Reshape the wide Johns Hopkins time series into a relational (long) table

        Reads the global confirmed-cases csv, renames the country/state columns,
        drops the coordinates and turns every date column into rows. The result
        (date, state, country, confirmed) is written to
        data/processed/COVID_relational_confirmed.csv; the number of rows and
        the latest date are printed.

    '''

    raw_csv='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    wide_table=pd.read_csv(raw_csv).rename(columns={'Country/Region':'country',
                                                    'Province/State':'state'})

    # countries without provinces get the placeholder 'no' as state
    wide_table['state']=wide_table['state'].fillna('no')
    wide_table=wide_table.drop(['Lat','Long'],axis=1)

    # after the transpose the dates are the index and (state, country) the columns;
    # stacking both column levels yields one row per date/state/country
    dates_as_rows=wide_table.set_index(['state','country']).T
    stacked=dates_as_rows.stack(level=[0,1])
    long_table=stacked.reset_index().rename(columns={'level_0':'date',
                                                     0:'confirmed'})

    long_table['date']=long_table['date'].astype('datetime64[ns]')

    long_table.to_csv('data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(long_table.shape[0]))
    print(' Latest date is: '+str(max(long_table['date'])))
# Entry point of the processing cell: rebuild the relational csv from the raw data
if __name__ == '__main__':

    store_relational_JH_data()


 Number of rows stored: 32452
 Latest date is: 2020-05-22 00:00:00


## 3  Filter and Doubling Rate Calculation

In [4]:
# %load src/features/build_features.py

import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Approximate the doubling rate with a linear regression over three samples

        A straight line is fitted through the three values at the time steps
        -1, 0 and 1 with the module level regression object `reg`.

        Parameters:
        ----------
        in_array : pandas.series
            exactly three consecutive values, otherwise an AssertionError is raised

        Returns:
        ----------
        Doubling rate: numpy array with one element
            fitted intercept divided by the fitted slope
    '''

    assert len(in_array)==3

    time_steps = np.arange(-1,2)[:, np.newaxis]  # column vector: one feature, three samples
    samples = np.array(in_array)

    reg.fit(time_steps,samples)

    return reg.intercept_/reg.coef_


def savgol_filter(df_input,column='confirmed',window=5,degree=1):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        Missing values of the column are replaced by 0 before filtering.
        The input data frame itself is left untouched, the filtered values are
        added to a copy.

        parameters:
        ----------
        df_input : pd.DataFrame
            has to contain the column given by `column`
        column : str
            name of the column which is filtered
        window : int
            used data points to calculate the filter result
        degree : int
            order of the polynomial fitted inside each window

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of df_input with the additional column '<column>_filtered',
            the index of the df_input is preserved in result
    '''

    # copy first: assigning the new column on df_input itself would silently
    # modify the caller's data frame (or a groupby slice of it)
    df_result=df_input.copy()

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree)
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Rolling Regression to approximate the doubling time

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.Series
            one regression result per row, empty (NaN) until three values are available
    '''
    # three samples per window: get_doubling_time_via_regression asserts exactly three
    window_size=3
    three_day_windows=df_input[col].rolling(window=window_size,
                                            min_periods=window_size)

    return three_day_windows.apply(get_doubling_time_via_regression,raw=False)




def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        The filter is applied separately to every (state, country) group.

        Parameters:
        ----------
        df_input: pd.DataFrame
            has to contain the columns 'state', 'country' and filter_on;
            the merge below is done on the index, so the index is assumed to be unique
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column '<filter_on>_filtered' on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    # filter_on has to be handed over, otherwise savgol_filter always falls back to 'confirmed';
    # group_keys=False keeps the original row index (needed for the merge) on every pandas version
    pd_filtered_result=df_output[['state','country',filter_on]] \
                            .groupby(['state','country'],group_keys=False) \
                            .apply(savgol_filter,filter_on)

    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    return df_output





def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        The rolling regression is calculated separately for every
        (state, country) group.

        Parameters:
        ----------
        df_input: pd.DataFrame
            has to contain the columns 'state', 'country' and filter_on;
            the row index is assumed to be unnamed, it shows up as 'level_2'
            after the groupby
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column '<filter_on>_DR' on the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    # the message names this function (it used to point to calc_filtered_data)
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_doubling_rate not all columns in data frame'


    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    # level_2 holds the original row index of df_input
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])


    return df_output


# Entry point of the feature cell: small self test of the regression, then the
# filtered values and both doubling rates are calculated and stored.
if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    #test_structure=pd_JH_data[((pd_JH_data['country']=='US')|
    #                  (pd_JH_data['country']=='Germany'))]

    temp_pd = pd.read_csv('data/external/202004_covid19countryinfo_clean.csv',sep=';',parse_dates=[0]).rename(columns={'pop':'population'})

    # attach the population and keep only countries for which it is known
    pd_JH_data = pd.merge(pd_JH_data, temp_pd[['country','population']], on=['country'], how='left')
    pd_JH_data = pd_JH_data[pd_JH_data['population'].notna()]
    print(pd_JH_data.head())

    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')


    # the filtered doubling rate is only kept where more than 100 cases are confirmed
    mask=pd_result_larg['confirmed']>100
    # np.nan instead of np.NaN: the upper-case alias does not exist anymore in NumPy 2.0
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_larg[pd_result_larg['country']=='Germany'].tail())


the test slope is: [2.]
        date    state       country  confirmed  population
0 2020-01-22  Alberta        Canada        0.0  37,742,154
1 2020-01-22       no  Korea, South        1.0  51,269,185
2 2020-01-22       no        Kosovo        0.0   1,810,463
3 2020-01-22       no        Kuwait        0.0   4,270,571
4 2020-01-22       no    Kyrgyzstan        0.0   6,524,195
            date state  country  confirmed  population  confirmed_filtered  \
17075 2020-05-18    no  Germany   176551.0  83,783,942            176984.6   
17076 2020-05-19    no  Germany   177778.0  83,783,942            177638.4   
17077 2020-05-20    no  Germany   178473.0  83,783,942            178306.6   
17078 2020-05-21    no  Germany   179021.0  83,783,942            179062.7   
17079 2020-05-22    no  Germany   179710.0  83,783,942            179818.8   

       confirmed_DR  confirmed_filtered_DR  
17075    441.111389             269.621713  
17076    251.099125             271.910483  
17077    184.80818

In [5]:
# print(pd_result_larg[pd_result_larg['country']=='US'].tail())

## 4 SIR

In [6]:
import pandas as pd
import numpy as np

from datetime import datetime
import pandas as pd 

from scipy import optimize
from scipy import integrate


df_analyse=pd.read_csv('data/processed/COVID_final_set.csv',sep=';',parse_dates=[0])
# sort_values returns a new data frame, without the assignment the call has no effect;
# the stable mergesort keeps the existing order of rows which share a date
df_analyse=df_analyse.sort_values('date',ascending=True,kind='mergesort')
country_list = df_analyse['country'].unique().tolist()

# one column per country, one row per date; states of a country are reduced to their maximum
df_sir = df_analyse[['date','country','confirmed']].groupby(['country','date']).max().unstack(0)
df_sir.columns = df_sir.columns.droplevel()
df_sir.reset_index(level=0, inplace=True)
# first non-zero case count of Germany and the population value stored for Germany
print(df_sir.Germany[df_sir.ne(0).idxmax()['Germany']], '\n', 
      df_analyse.loc[df_analyse['country'] == 'Germany']['population'].tolist()[0])


def quick_plot(x_in, df_input, title, x_label, y_label, y_scale='log', slider=False):
    """ Quick basic plot for quick static evaluation of a time series

        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Note: plotly.graph_objects has to be available as `go` when the
        function is called.

        Parameters:
        ----------
        x_in : array
            array of date time object, or array of numbers
        df_input : pandas dataframe
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        title : str
            title of the figure
        x_label : str
            title of the x-axis
        y_label : str
            title of the y-axis
        y_scale : str
            y-axis scale as 'log' or 'linear'
        slider : bool
            True or False for x-axis slider

        Returns:
        ----------
        fig : plotly figure
            the figure is returned and not shown
    """
    fig = go.Figure()

    for each in df_input.columns:
        fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[each],
                        name=each,
                        opacity=0.8))

    fig.update_layout(autosize=True,
        width=1024,
        height=768,
        title=title,
        font=dict(
            family="PT Sans, monospace",
            size=18,
            color="#7f7f7f"
            ),
        showlegend=True
        )
    # each label goes to its own axis (they used to be swapped)
    fig.update_xaxes(tickangle=-45,
                 nticks=20,
                 title=x_label,
                 tickfont=dict(size=14,color="#7f7f7f")
                )
    fig.update_yaxes(type=y_scale,title=y_label)
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    return fig

1.0 
 83,783,942


In [7]:
# the population is read back as str (values like '83,783,942' contain thousands separators)
type(df_analyse['population'][0])

str

## 5 Visual Board

In [None]:
# %load src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
# show the working directory, the relative data path below depends on it
print(os.getcwd())
# final data set with confirmed cases, filtered values and doubling rates
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')


# empty figure as start value of the first graph, it is replaced by the callback output
fig = go.Figure()

# NOTE(review): the stylesheet list is defined but not handed over to dash.Dash() below
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash()
# app = dash.Dash()
# Page layout: a headline text followed by two tabs.
# The component ids defined here are the inputs and outputs of the callbacks below.
app.layout = html.Div([
    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to learn data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),


    dcc.Tabs([
        # Tab 1: timeline of the selected countries, drawn in graph 'main_window_slope'
        dcc.Tab(label='Covid-19 Dashboard', children=[
            dcc.Markdown('''
            **_Select multiple countries for visualization_**
            '''),

            dcc.Dropdown(
                id='country_drop_down',
                options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
                value=['US', 'Germany','Italy'], # which are pre-selected
                multi=True
            ),

            dcc.Markdown('''
            **_Select Timeline of confirmed COVID-19 cases or the approximated doubling time_**
            '''),

            # the values are column names of df_input_large
            dcc.Dropdown(
                id='doubling_time',
                options=[
                    {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
                    {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
                    {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
                    {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
                ],
                value='confirmed',
                multi=False
            ),

            dcc.Graph(figure=fig, id='main_window_slope')
        ]),
        # Tab 2: parameters of the SIR simulation, drawn in graph 'sir'
        dcc.Tab(label='SIR Virus Spread Model', children=[
            dcc.Markdown('''
            ##### Select respective values for the SIR simulation
            '''),

            dcc.Markdown('''
            **_Select Country_**
            '''),

            # country_list is not created in this cell, it has to exist in the kernel already
            dcc.Dropdown(
                id='country_select_sir',
                options=[ {'label': each,'value':each} for each in country_list],
                value='Germany', # which are pre-selected
                multi=False
            ),

            dcc.Markdown('''
            **_Enter Beta min (minimum infection rate)_**
            '''),

            dcc.Input(
                id="beta_min",
                type="number",
                placeholder="Enter a number Ex: 0.11",
                value=0.11
            ),

            dcc.Markdown('''
            **_Enter Beta max (maximum infection rate)_**
            '''),

            dcc.Input(
                id="beta_max",
                type="number",
                placeholder="Enter a number (greater than beta min) Ex: 0.7",
                value=0.4
            ),

            dcc.Markdown('''
            **_Enter Gamma (recovery rate)_**
            '''),

            dcc.Input(
                id="gamma",
                type="number",
                placeholder="Enter a number Ex: 0.1",
                value=0.1
            ),

            # the four durations below define the consecutive phases of the infection rate
            dcc.Markdown('''
            **_Enter number of days with initial measures_**
            '''),

            dcc.Input(
                id="t_initial",
                type="number",
                placeholder="Enter number of days",
                value=20
            ),

            dcc.Markdown('''
            **_Enter number of days with introductory measures_**
            '''),

            dcc.Input(
                id="t_intro_measures",
                type="number",
                placeholder="Enter number of days",
                value=20
            ),

            dcc.Markdown('''
            **_Enter number of days when measures were held constant_**
            '''),

            dcc.Input(
                id="t_hold",
                type="number",
                placeholder="Enter number of days",
                value=135
            ),

            dcc.Markdown('''
            **_Enter number of days when measures were relaxed_**
            '''),

            dcc.Input(
                id="t_relax",
                type="number",
                placeholder="Enter number of days",
                value=25
            ),

            dcc.Markdown('''
            **_Select scale_**
            '''),

            dcc.Dropdown(
                id='graph_scale_sir',
                options=[
                    {'label': 'log ', 'value': 'log'},
                    {'label': 'linear', 'value': 'linear'},
                ],
                value='log',
                multi=False
            ),

            dcc.Graph(id='sir')   
        ]),
    ])
])


@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):
    ''' Build the timeline figure of the first tab

        Parameters:
        ----------
        country_list: list of str
            selected countries, one trace is drawn per country
        show_doubling: str
            column of df_input_large which is plotted; the dropdown offers
            'confirmed', 'confirmed_filtered', 'confirmed_DR' and 'confirmed_filtered_DR'

        Returns:
        ----------
        figure: dict
            plotly figure description with the entries 'data' and 'layout'
    '''

    # the doubling rate columns are the ones ending with '_DR'; the former check for
    # 'doubling_rate' never matched any of the dropdown values
    is_doubling_rate=show_doubling.endswith('_DR')

    if is_doubling_rate:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
        # doubling rates of several states cannot be added up, so they are averaged
        aggregation='mean'
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }
        # case counts of the states add up to the count of the country
        aggregation='sum'


    traces = []
    # an emptied dropdown may hand over None instead of a list
    for each in (country_list or []):

        df_plot=df_input_large[df_input_large['country']==each]

        # only the plotted column is aggregated, the text column 'state' is left out on purpose
        df_plot=df_plot[['country','date',show_doubling]].groupby(['country','date']).agg(aggregation).reset_index()

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }


@app.callback(
    Output('sir', 'figure'),
    [Input('country_select_sir', 'value'),
    Input('beta_min', 'value'),
    Input('beta_max', 'value'),
    Input('gamma', 'value'),
    Input('t_initial', 'value'),
    Input('t_intro_measures', 'value'),
    Input('t_hold', 'value'),
    Input('t_relax', 'value'),
    Input('graph_scale_sir', 'value')])
def update_sir_figure(country, beta_min, beta_max, gamma, 
                        t_initial, t_intro_measures, t_hold, t_relax, scale):
    ''' Simulate the SIR model for one country and build the figure of the second tab

        The infection rate starts at beta_max (t_initial steps), drops linearly
        to beta_min (t_intro_measures steps), stays at beta_min (t_hold steps)
        and rises linearly back to beta_max (t_relax steps). One simulation
        step is calculated per infection rate value.

        Parameters:
        ----------
        country: str
            country whose population and confirmed cases are used
        beta_min, beta_max: number
            minimum and maximum infection rate
        gamma: number
            recovery rate
        t_initial, t_intro_measures, t_hold, t_relax: number
            length of the four phases, converted to int
        scale: str
            y-axis scale as 'log' or 'linear'

        Returns:
        ----------
        fg: plotly figure
            simulated infected people, the four phases are marked as shaded areas
    '''
    beta_min = float(beta_min)
    beta_max = float(beta_max)
    gamma = float(gamma)
    t_initial = int(float(t_initial))
    t_intro_measures = int(t_intro_measures)
    t_hold = int(t_hold)
    t_relax = int(t_relax)

    # the population is stored as text with thousands separators, str() keeps
    # the conversion working in case the column is already numeric
    population=df_analyse.loc[df_analyse['country'] == country]['population'].tolist()[0]
    N0=float(str(population).replace(',','')) #max susceptible population
    print(N0)
    # the first 35 rows of df_sir are skipped -- NOTE(review): presumably the days before the outbreak, confirm
    ydata = np.array(df_sir[country][35:])

    I0=ydata[0]
    S0=N0-I0
    R0=0

    pd_beta=np.concatenate((np.array(t_initial*[beta_max]),
                            np.linspace(beta_max,beta_min,t_intro_measures),
                            np.array(t_hold*[beta_min]),
                            np.linspace(beta_min,beta_max,t_relax),
                           ))


    def SIR_model(SIR,beta,gamma):
        ''' Simple SIR model
            S: susceptible population
            I: infected people
            R: recovered people
            beta: infection rate
            gamma: recovery rate

            overall condition is that the sum of changes (differences) sum up to 0
            dS+dI+dR=0
            S+I+R= N (constant size of population)

        '''

        S,I,R=SIR
        dS_dt=-beta*S*I/N0          # new infections leave the susceptible group
        dI_dt=beta*S*I/N0-gamma*I
        dR_dt=gamma*I
        return([dS_dt,dI_dt,dR_dt])


    SIR=np.array([S0,I0,R0])
    # the states are collected in a list and converted once: DataFrame.append does not
    # exist anymore in pandas 2.0 and copied the whole frame in every step
    sir_records=[]
    for each_beta in pd_beta:
        new_delta_vec=SIR_model(SIR,each_beta,gamma)

        SIR=SIR+new_delta_vec
        sir_records.append({'susceptible':SIR[0],
                            'infected':SIR[1],
                            'recovered':SIR[2]})
    # explicit columns keep the frame usable when no step was simulated
    propagation_rates=pd.DataFrame(sir_records,
                                   columns=['susceptible','infected','recovered'])


    df_plot = pd.DataFrame({'idx':propagation_rates.index, 'simulated infected':propagation_rates.infected})
    fg = quick_plot(df_plot.idx,
               df_plot.iloc[:,1:],
               title='SIR simulation infection',
               x_label='Time in days',
               y_label='Population infected',
               y_scale=scale,
               slider=True
               )

#     fg.add_trace(go.Bar(
#                         x=np.arange(len(ydata)),
#                         y=ydata,
#                         width=0.8,
#                         name="actual infected",
#                         opacity=0.3))

    # one shaded rectangle per phase, every phase starts where the previous one ends
    phases=[(t_initial,"DeepSkyBlue"),
            (t_intro_measures,"DodgerBlue"),
            (t_hold,"RoyalBlue"),
            (t_relax,"DarkBlue")]
    shapes=[]
    phase_start=0
    for phase_length,phase_color in phases:
        shapes.append(dict(
                type="rect",
                xref="x",
                yref="paper",
                x0=phase_start,
                y0=0,
                x1=phase_start+phase_length,
                y1=1,
                fillcolor=phase_color,
                opacity=0.2,
            ))
        phase_start=phase_start+phase_length

    fg.update_layout(shapes=shapes)

    return fg

# Start the dashboard server; the call blocks the cell until the server is stopped
if __name__ == '__main__':

    # NOTE(review): use_reloader=False is presumably needed because the auto reloader does not work inside a notebook kernel
    app.run_server(debug=True, use_reloader=False)


E:\EDS\Project\ads_covid-19
Dash is running on http://127.0.0.1:8050/

 in production, use a production WSGI server like gunicorn instead.



 * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
83783942.0
83783942.0
83783942.0
83783942.0
83783942.0
83783942.0
83783942.0
83783942.0
83783942.0
