# Update all data

In [5]:
# %load ../src/data/GER_data.py
#!/usr/bin/env python

# In[4]:


import pandas as pd
import subprocess
import os
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import requests

def get_current_data_germany():
    data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronafälle_in_den_Bundesländern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')
    json_object = json.loads(data.content)

    full_list = []
    for pos,each_dict in enumerate (json_object['features'][:]):
        full_list.append(each_dict['attributes'])
        
    pd_full_list = pd.DataFrame(full_list)
    pd_full_list.to_csv('../data/raw/GER_state_data.csv', sep=';')
    print('Number of regions rows:' +str(pd_full_list.shape[0]))
    
if __name__ == '__main__':
    get_current_data_germany()



Number of regions rows:16


# Process pipeline

In [6]:
# %load ../src/data/JH_data.py
#!/usr/bin/env python

# In[1]:


import pandas as pd
import subprocess
import os
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import requests


def get_JH_data():
    data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw = pd.read_csv(data_path)
    
    pd_data_base = pd_raw.rename(columns={'Country/Region' : 'country', 'Province/State' : 'state'})
    pd_data_base['state']=pd_data_base['state'].fillna('no')  #ensure a sring, important for indexing
    
    pd_data_base = pd_data_base.drop(['Lat', 'Long'],axis=1)
    pd_relational_model = pd_data_base.set_index(['state', 'country'])                                             .T                                                                 .stack(level = [0,1])                                              .reset_index()                                                     .rename(columns={'level_0' : 'date' ,
                                                            0 : 'confirmed'},
                                                   )
    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')
    pd_relational_model.to_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print('Number of rows stored:' +str(pd_relational_model.shape[0]))
    
if __name__ == '__main__':
    get_JH_data()


# In[ ]:






Number of rows stored:248805


# Slope calculation

In [7]:
# %load ../src/data/build_features.py
#!/usr/bin/env python

# In[12]:


import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
from scipy import signal
import pandas as pd

def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate'''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope


def savgol_filter(df_input, column='confirmed', window = 5):
    degree = 1
    df_result = df_input
    
    filter_in=df_input[column].fillna(0)

    result = signal.savgol_filter(np.array(filter_in),
                           5, # window size used for filtering
                           1) # order of fitted polynomial
    
    df_result[column+'_filtered'] = result

    return df_result

def rolling_reg (df_input, col='confirmed'):
    days_back = 3
    result = df_input[col].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)
    return result

def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'
    
    pd_filtered_result=df_input[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter).reset_index()
    #df_output=pd.merge(df_input,pd_filtered_result[['index',filter_on+'_filtered']],on=['index'],how='left')
    df_output=pd.merge(df_input,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')

    return df_output

def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    #df_output=pd.merge(df_input,pd_DR_result[['index',filter_on+'_DR']],on=['index'],how='left')
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')

    return df_output

if __name__ == '__main__':
    test_data = np.array([2,4,6])
    result = get_doubling_time_via_regression(test_data)
    print ('test slope is :' + str(result))
    
    pd_JH_data=pd.read_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).reset_index(drop=True).copy()
    
    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')
    print(pd_result_larg.head())


# In[ ]:






test slope is :[2.]
         index       date    state     country  confirmed  confirmed_filtered  \
0            0 2020-01-22  Alberta      Canada        0.0                 0.0   
158013  158013 2020-01-22       no      Kosovo        0.0                 0.0   
158886  158886 2020-01-22       no      Kuwait        0.0                 0.0   
159759  159759 2020-01-22       no  Kyrgyzstan        0.0                 0.0   
160632  160632 2020-01-22       no        Laos        0.0                 0.0   

        index_x  confirmed_DR  index_y  confirmed_filtered_DR  
0             0           NaN        0                    NaN  
158013        1           NaN   158013                    NaN  
158886        2           NaN   158886                    NaN  
159759        3           NaN   159759                    NaN  
160632        4           NaN   160632                    NaN  


# Visual board

In [None]:
# %load ../src/data/visualize.py
#!/usr/bin/env python

# In[ ]:


import pandas as pd
import subprocess
import os

import dash
dash.__version__
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import numpy as np
import plotly.graph_objects as go

print(os.getcwd())
df_input_large=pd.read_csv('../data/processed/COVID_final_set.csv',sep=';')

fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([
    
    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data

    Goal of the project is to teach data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.

    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),
    
    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany','Italy'], # which are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):


    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.sum).reset_index()
       #print(show_doubling)


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)


# In[ ]:




