# One run full walkthrough
* Do the full walkthorugh on the large data set
* Refactor the source code and bring it to the individual scripts
* Ensure a full run with one click

## 1 Update all data

In [1]:
# %load ../src/data/get_data.py

import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_johns_hopkins():
    ''' Get data by a git pull request, the source code has to be pulled first
        Result is stored in the predifined csv structure
    '''
    git_pull = subprocess.Popen( "/usr/bin/git pull" ,
                         cwd = os.path.dirname( '../data/raw/COVID-19/' ),
                         shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE )
    (out, error) = git_pull.communicate()


    print("Error : " + str(error))
    print("out : " + str(out))


def get_current_data_germany():
    ''' Get current data from germany, attention API endpoint not too stable
        Result data frame is stored as pd.DataFrame

    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    json_object=json.loads(data.content)
    full_list=[]
    for pos,each_dict in enumerate (json_object['features'][:]):
        full_list.append(each_dict['attributes'])

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

if __name__ == '__main__':
    get_johns_hopkins()
    #get_current_data_germany()


Error : b'From https://github.com/CSSEGISandData/COVID-19\n   7ad80c0c..fbc6aaae  web-data   -> origin/web-data\n'
out : b'Already up to date.\n'


## 2 Process pipline

In [2]:
# %load ../src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    ''' Transformes the COVID data in a relational data set

    '''

    data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw = pd.read_csv(data_path)

    pd_data_base = pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state'] = pd_data_base['state'].fillna('no')

    pd_data_base = pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model = pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    pd_relational_model['date'] = pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: ' + str(pd_relational_model.shape[0]))

if __name__ == '__main__':

    store_relational_JH_data()


 Number of rows stored: 59850


## 3 Filter and Doubling Rate & SIR Model calculation

In [31]:
# %load ../src/features/build_features.py

import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal
from scipy import optimize
from scipy import integrate


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        Parameters:
        ----------
        in_array : pandas.series

        Returns:
        ----------
        Doubling rate: double
    '''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        parameters:
        ----------
        df_input : pandas.series
        column : str
        window : int
            used data points to calculate the filter result

        Returns:
        ----------
        df_result: pd.DataFrame
            the index of the df_input has to be preserved in result
    '''

    degree=1
    df_result=df_input

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           1)
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Rolling Regression to approximate the doubling time'

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.DataFrame
    '''
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)



    return result




def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['country', filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    pd_filtered_result=df_output[['country',filter_on]].groupby(['country']).apply(savgol_filter)#.reset_index()

    #print('--+++ after group by apply')
    #print(pd_filtered_result[pd_filtered_result['country']=='Germany'].tail())

    #df_output=pd.merge(df_output,pd_filtered_result[['index',str(filter_on+'_filtered')]],on=['index'],how='left')
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    #print(df_output[df_output['country']=='Germany'].tail())
    return df_output.copy()





def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column on the input data frame
    '''

    must_contain=set(['country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby([country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])


    return df_output


SyntaxError: EOL while scanning string literal (<ipython-input-31-5e59d7b854db>, line 138)

In [4]:

def SIR_model_t(SIR, t, beta, gamma):
    ''' Simple SIR model
        S: susceptible population
        t: time step, mandatory for integral.odeint
        I: infected people
        R: recovered people
        beta:

        overall condition is that the sum of changes (differnces) sum up to 0
        dS+dI+dR=0
        S+I+R= N (constant size of population)

    '''

    S, I, R = SIR
    dS_dt = -beta * S * I / N0  # S*I is the
    dI_dt = beta * S * I / N0 - gamma * I
    dR_dt = gamma * I
    return dS_dt, dI_dt, dR_dt

def fit_odeint(x, beta, gamma):
    '''
    helper function for the integration
    '''
    return integrate.odeint(SIR_model_t, (S0, I0, R0), t, args=(beta, gamma))[:,1] # we only would like to get dI

def SIR(data):
    popt = [0.4, 0.1]
    ydata = data
    t = np.arange(len(ydata))
    try:
        popt, pcov = optimize.curve_fit(fit_odeint, t, ydata, maxfev=5000)
        perr = np.sqrt(np.diag(pcov))
        
        # get the final fitted curve
        return fit_odeint(t, *popt)
    
    except ValueError:
        print('RunTimeError')
        return [0] * len(ydata)
            

In [5]:
pd_JH_data = pd.read_csv('../data/processed/COVID_relational_confirmed.csv', sep=';', parse_dates=[0])
pd_JH_data = pd_JH_data.sort_values('date', ascending=True).copy()
pd_JH_data.drop('state', axis=1, inplace=True)

# Create new Frame
newFrame = pd.DataFrame()
for col in pd_JH_data.columns:
    newFrame[col] = ''    
    
#if testFrame.reindex(sorted(testFrame.columns), axis=1).columns.all() == newFrame.reindex(sorted(newFrame.columns), axis=1).columns.all():        
    
for date in pd.unique(pd_JH_data['date']):
    for country in pd.unique(pd_JH_data['country']):
        singleFrame = pd_JH_data[(pd_JH_data['date'] == date) & (pd_JH_data['country'] ==  country)]
        singleFrame            = pd.DataFrame(singleFrame.sum()).T
        singleFrame['date']    = date
        singleFrame['country'] = country
        
        newFrame = newFrame.append(singleFrame)

newFrame = newFrame.reset_index()        
newFrame.to_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',index=False)

pd_JH_data = newFrame.copy()
        

    

In [22]:
I0 = 27
S0 = pow(10, 6)
R0 = 0

N0 = pow(10, 6)
beta = 0.4
gamma = 0.1

pd_result_SIR = pd_JH_data.copy()
pd_result_SIR['SIR_static'] = 0


for each in pd.unique(pd_result_SIR['country']):
    
    print(each)
    
    # Extract y data
    ydata_SIR = np.array(pd_result_SIR[pd_result_SIR['country'] == each]['confirmed'][35:])            
    t = np.arange(len(ydata_SIR))        

    # Calc SIR Data
    fitted_SIR = SIR(ydata_SIR.copy())    

    # Create a new dataframe with SIR data for update function
    SIRFrame = pd.DataFrame({'date': pd_result_SIR[pd_result_SIR['country'] == each]['date'][35:]})
    SIRFrame['SIR_static'] = fitted_SIR
    SIRFrame['country'] = each

    pd_result_SIR.update(SIRFrame['SIR_static'])        
    

Canada




Korea, South
Kosovo
Kuwait
Kyrgyzstan
Laos
Latvia
Lebanon
Lesotho
Liberia
Libya
Liechtenstein
Lithuania
Luxembourg
MS Zaandam
Madagascar
Malawi
Malaysia
Maldives
Mali
Malta
Mauritania
Mauritius
Mexico


  dS_dt = -beta * S * I / N0  # S*I is the
  dI_dt = beta * S * I / N0 - gamma * I
  dR_dt = gamma * I


Moldova
Monaco
Mongolia
Montenegro
Morocco
Mozambique
Kenya
Kazakhstan
Jordan
Japan
Estonia
Eswatini
Ethiopia
Fiji
Finland
France
Gabon
Gambia
Georgia
Germany
Ghana
Greece
Grenada
Guatemala
Nepal
Guinea
Guyana
Haiti
Holy See
Honduras
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Jamaica
Guinea-Bissau
Eritrea
Netherlands
Nicaragua
Sri Lanka
Sudan
Suriname
Sweden
Switzerland
Syria
Taiwan*
Tajikistan
Tanzania
Thailand
Timor-Leste
Togo
Trinidad and Tobago
Tunisia
Turkey
US
Uganda
Ukraine
United Arab Emirates
United Kingdom
Uruguay
Uzbekistan
Venezuela
Vietnam
West Bank and Gaza
Western Sahara
Yemen
Zambia
Zimbabwe
Spain
South Sudan
South Africa
Somalia
Niger
Nigeria
North Macedonia
Norway
Oman
Pakistan
Panama
Papua New Guinea
Paraguay
Peru
Philippines
Poland
Portugal
Qatar
New Zealand
Romania
Rwanda
Saint Kitts and Nevis
Saint Lucia
Saint Vincent and the Grenadines
San Marino
Sao Tome and Principe
Saudi Arabia
Senegal
Serbia
Seychelles
Sierra Leone
Singapore
Slovakia
Slove

In [28]:
pd_result_SIR

Unnamed: 0,index,date,country,confirmed,SIR_static
0,0,2020-01-22,Canada,0,0.000000
1,0,2020-01-22,"Korea, South",1,0.000000
2,0,2020-01-22,Kosovo,0,0.000000
3,0,2020-01-22,Kuwait,0,0.000000
4,0,2020-01-22,Kyrgyzstan,0,0.000000
...,...,...,...,...,...
42295,0,2020-09-02,Argentina,439172,748.354086
42296,0,2020-09-02,Antigua and Barbuda,94,91.502914
42297,0,2020-09-02,Angola,2777,2670.967744
42298,0,2020-09-02,Andorra,1199,647.241564


In [29]:
pd_JH_data['SIR_static'] = 0
pd_JH_data.update(pd_result_SIR['SIR_static'])
pd_JH_data[pd_JH_data['country'] == 'China'][33:].head(5)

Unnamed: 0,index,date,country,confirmed,SIR_static
6339,0,2020-02-24,China,77241,0.0
6527,0,2020-02-25,China,77754,0.0
6715,0,2020-02-26,China,78166,27.0
6903,0,2020-02-27,China,78600,29.049979
7091,0,2020-02-28,China,78928,31.255579


In [30]:
pd_result_larg = calc_filtered_data(pd_JH_data)
pd_result_larg = calc_doubling_rate(pd_result_larg)
pd_result_larg = calc_doubling_rate(pd_result_larg, 'confirmed_filtered')
mask = pd_result_larg['confirmed'] > 100
pd_result_larg['confirmed_filtered_DR'] = pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.NaN)
pd_result_larg.to_csv('../data/processed/COVID_final_set.csv', sep=';', index=False)

KeyError: "['state'] not in index"

In [None]:
#pd_result_larg[pd_result_larg['country'] == 'US'][30:].head(50)

## 4 Visual Board

In [None]:
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
print(os.getcwd())
df_input_large = pd.read_csv('../data/processed/COVID_final_set.csv',sep=';')


fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label':each, 'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany', 'Italy'], # which are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
        {'label': 'Timeline SIR Model applied', 'value': 'SIR_static'},
    ],
    value='confirmed',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list, show_doubling):


    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date', 'SIR_static']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date', 'SIR_static']].groupby(['country','date']).agg(np.sum).reset_index()
       #print(show_doubling)


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }
def visualize():
    app.run_server(debug=True, use_reloader=False)

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)
