# Business understanding

We would like to track Corona VIRUS spread across countries and with personal local information
  
The general information is not so relevant for me,
I would like to have a deep dive local development of the spread

# Goals
1. We would like to understand the data quality
2. everything should be automated as much as possible:
    How many clicks do we need to execute the full pipeline
    
# Contraints
1. Each notebook should be left clean and ready for full execution

# Data understanding
* RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* John Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retreive data https://npgeo-corona-npgeo-de.hub.arcgis.com/

In [1]:
import subprocess
import os
import json
import pandas as pd
import requests

from bs4 import BeautifulSoup
from datetime import datetime



pd.set_option('display.max_rows', 500)

## GITHUB csv data

In [2]:
git_pull = subprocess.Popen( "/usr/bin/git pull" , 
                     cwd = os.path.dirname( '../data/raw/COVID-19/' ), 
                     shell = True, 
                     stdout = subprocess.PIPE, 
                     stderr = subprocess.PIPE )
(out, error) = git_pull.communicate()


print("Error : " + str(error)) 
print("out : " + str(out))



Error : b''
out : b'Already up to date.\n'


In [3]:
data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw = pd.read_csv(data_path)
pd_raw

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,8/7/20,8/8/20,8/9/20,8/10/20,8/11/20,8/12/20,8/13/20,8/14/20,8/15/20,8/16/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,37015,37054,37054,37162,37269,37345,37424,37431,37551,37596
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,6151,6275,6411,6536,6676,6817,6971,7117,7260,7380
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,34155,34693,35160,35712,36204,36699,37187,37664,38133,38583
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,955,955,955,963,963,977,981,989,989,989
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1538,1572,1672,1679,1735,1762,1815,1852,1879,1906
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,92,92,92,92,92,92,92,93,93,93
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,235677,241811,246499,253868,260911,268574,276072,282437,289100,294569
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,39985,40185,40410,40433,40593,40794,41023,41299,41495,41663
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,113,113,113,113,113,113,113,113,113,113
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,3851,3861,3875,3897,3915,3927,3936,3945,3950,3957


## Webscrapping

In [4]:
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")
soup = BeautifulSoup(page.content, 'html.parser')
html_table = soup.find('table')
all_rows = html_table.find_all('tr')
final_data_list = []

for pos, rows in enumerate(all_rows):
    col_list = [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    final_data_list.append(col_list)
    
pd.DataFrame(final_data_list).dropna().rename(columns={0:'state', \
                            1:'Anzahl', 2:'Differenz zum Vortag', 3:'Fälle in den letzten 7 Tagen',\
                            4:'7-Tage-Inzidenz', 5:'Todesfälle'})

Unnamed: 0,state,Anzahl,Differenz zum Vortag,Fälle in den letzten 7 Tagen,7-Tage-Inzidenz,Todesfälle
2,Baden-Württem­berg,38.512,+32,606.0,55,1.859
3,Bayern,52.984,+96,1048.0,80,2.631
4,Berlin,10.253,+15,531.0,142,224.0
5,Branden­burg,3.703,+6,56.0,22,169.0
6,Bremen,1.845,+4,41.0,60,56.0
7,Hamburg,5.887,+9,160.0,87,264.0
8,Hessen,13.486,+53,734.0,117,526.0
9,Meck­lenburg-Vor­pommern,973.0,+1,32.0,20,20.0
10,Nieder­sachsen,15.428,+44,412.0,52,656.0
11,Nord­rhein-West­falen,54.653,+263,2.694,150,1.778


## REST_API calls


In [5]:

data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')
json_object=json.loads(data.content)
type(json_object)
# json_object.keys()
json_object['features'][0]

full_list = []
for pos, each_dict in enumerate (json_object['features'][:]):
    full_list.append(each_dict['attributes'])

pd.DataFrame(full_list)

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,3790,1597615200000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,130.837998,45737310000.0,2881496.0,158
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,5887,1597615200000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,319.740775,2089396000.0,418800.2,264
2,3,3,Niedersachsen,Land,7982448,9,15428,1597615200000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,193.274043,129983600000.0,4008988.0,656
3,4,4,Bremen,Freie Hansestadt,682986,5,1845,1597615200000,4,4132268b-54de-4327-ac1e-760e915112f1,270.137309,1119157000.0,335717.7,56
4,5,5,Nordrhein-Westfalen,Land,17932651,10,54653,1597615200000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,304.768101,87829360000.0,2648673.0,1778
5,6,6,Hessen,Land,6265809,7,13486,1597615200000,6,93277ac4-e8fc-48c7-8940-028dc2ed66af,215.231585,52359130000.0,2148244.0,526
6,7,7,Rheinland-Pfalz,Land,4084844,11,8148,1597615200000,7,e9b4296f-9be2-4e53-9a58-ccf1396cb03d,199.469062,47838770000.0,1774430.0,242
7,8,8,Baden-Württemberg,Land,11069533,1,38512,1597615200000,8,80394ddf-c6a4-4a6e-be8e-0259a81b22a9,347.909889,81517320000.0,2544320.0,1859
8,9,9,Bayern,Freistaat,13076721,2,52984,1597615200000,9,1ff920f4-62cd-4a4f-b8c9-f042f2a3e00a,405.178026,163485500000.0,3898618.0,2631
9,10,10,Saarland,Land,990509,12,3000,1597615200000,10,e3396a6f-8a30-4fdf-8df7-def77dd38bea,302.874583,6060692000.0,562678.9,174


## API access via RESt service. e.g. USA data
example of a REST conform interface (attention registration mandatory) 

www.smartable.ai

In [6]:

url_endpoint = 'https://api.smartable.ai/coronavirus/stats/US'
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': '28ee4219700f48718be78b057beb7eb4',
}

response = requests.get(url_endpoint, headers=headers)
US_dict = json.loads(response.content)
with open ('../data/raw/SMARTABLE/US/data.txt', 'w') as outfile:
    json.dump(US_dict, outfile, indent=2)