# Check the path

In [1]:
import os
if os.path.split(os.getcwd())[-1]=='notebooks':
    os.chdir("../")

'Your base path is at: '+os.path.split(os.getcwd())[-1]

'Your base path is at: ads_covid-19'

In [2]:
cd=os.getcwd()

In [3]:
cd

'F:\\MS CVT study material\\Enterprise Data Science\\ads_covid-19'

# 1. Data Extraction and saving in local directory

In [4]:
# %load src/data/GER_data.py
#!/usr/bin/env python

# In[4]:


import os
import subprocess
import numpy as np
import pandas as pd
import requests
import json
from datetime import datetime

def get_JH_data():

    git_pull = subprocess.Popen( "/usr/bin/git pull" ,
                         cwd = os.path.dirname( 'data/raw/COVID-19/' ),
                         shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE )
    (out, error) = git_pull.communicate()


    print("Error : " + str(error))
    print("out : " + str(out))


def get_current_data_germany():

    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    json_obj=json.loads(data.content)
    full_list=[]
    for pos,each_dict in enumerate (json_obj['features'][:]):
        full_list.append(each_dict['attributes'])

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('data/raw/GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

if __name__ == '__main__':
    get_JH_data()
    get_current_data_germany()



Error : b'The system cannot find the path specified.\r\n'
out : b''
 Number of regions rows: 411


# 2. Process pipeline

In [5]:
# %load src/data/JH_data.py
#!/usr/bin/env python

# In[8]:


import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():

    data_path='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['state','country']).T                                                              .stack(level=[0,1])                                             .reset_index()                                                  .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))
    print(' Latest date is: '+str(max(pd_relational_model.date)))
if __name__ == '__main__':

    store_relational_JH_data()


# In[ ]:






 Number of rows stored: 248805
 Latest date is: 2022-06-12 00:00:00


# 3 Data Filter and Doubling Rate Calculation

In [6]:
# %load src/data/build_features.py
#!/usr/bin/env python

# In[1]:


import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd
from scipy import signal

def get_doubling_time_via_regression(in_array):

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope


def savgol_filter(df_input,column='confirmed',window=5):

    degree=1
    df_result=df_input

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           1)
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):

    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)



    return result




def calc_filtered_data(df_input,filter_on='confirmed'):

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy()

    pd_filtered_result=df_output[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter)#.reset_index()

    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    return df_output.copy()





def calc_doubling_rate(df_input,filter_on='confirmed'):

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])

    return df_output


if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()

    pd_result_large=calc_filtered_data(pd_JH_data)
    pd_result_large=calc_doubling_rate(pd_result_large)
    pd_result_large=calc_doubling_rate(pd_result_large,'confirmed_filtered')


    mask=pd_result_large['confirmed']>100
    pd_result_large['confirmed_filtered_DR']=pd_result_large['confirmed_filtered_DR'].where(mask, other=np.NaN)
    pd_result_large.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_large[pd_result_large['country']=='Germany'].tail())


# In[ ]:






the test slope is: [2.]
             date state  country   confirmed  confirmed_filtered  \
134437 2022-06-08    no  Germany  26660652.0          26656885.2   
134438 2022-06-09    no  Germany  26738530.0          26717986.4   
134439 2022-06-10    no  Germany  26803867.0          26763232.2   
134440 2022-06-11    no  Germany  26803867.0          26799484.5   
134441 2022-06-12    no  Germany  26809245.0          26835736.8   

        confirmed_DR  confirmed_filtered_DR  
134437    327.568089             481.395649  
134438    342.872444             435.026463  
134439    373.345664             502.368685  
134440    819.813827             656.708178  
134441   9968.635056             739.249220  


# 4 Dash Board Visualization

In [None]:
# %load src/data/visualize.py
#!/usr/bin/env python

# In[ ]:


import pandas as pd
import numpy as np
import dash
dash.__version__
from dash import dcc
from dash import html
from dash.dependencies import Input, Output,State
import plotly.graph_objects as go
import os
print(os.getcwd())
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')


fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    # Enterprise Data Science Delivery 3
    ##  Applied Data Science on COVID-19 data
    
    The aim of this delivery is to get a better understanding of the applied data science concepts 
    which includes automated data pooling, transforming raw data into useful data, defining and applying 
    various filters for doubling rate and confirmed cases and greating a dynamic dashboard to view statistics
    of various countries.

    '''),

    dcc.Markdown('''
    ## Select countries to display the covid cases data
    '''),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['country'].unique()],
        value=['US','India','Italy'],
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline from the dropdown for confirmed COVID-19 cases or approximated doubling time of cases
        '''),

    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Confirmed cases timeline ', 'value': 'confirmed'},
        {'label': 'Confirmed filtered cases timeline', 'value': 'confirmed_filtered'},
        {'label': 'Doubling Rate timeline', 'value': 'confirmed_DR'},
        {'label': 'Doubling Rate filtered timeline', 'value': 'confirmed_filtered_DR'},
    ],
    value='confirmed',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope')
])

@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):

    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (source johns hopkins csse, log-scale)'
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['country']==each]

        if show_doubling=='confirmed_DR':
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['state','country','confirmed','confirmed_filtered','confirmed_DR','confirmed_filtered_DR','date']].groupby(['country','date']).agg(np.sum).reset_index()

        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                line_width = 0.5, 
                                marker_size = 0.005,
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1400,
                height=600,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)


# In[ ]:






F:\MS CVT study material\Enterprise Data Science\ads_covid-19
Dash is running on http://127.0.0.1:8050/



[33m * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.[0m


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on
