In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
import requests
from bs4 import BeautifulSoup
import json

# Data Understanding

* RKI, webscrape (webscraping): https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (GitHub): https://github.com/CSSEGISandData/COVID-19
* REST API services to retrieve data: https://npgeo-corona-npgeo-de.hub.arcgis.com/

# GITHUB csv data

git clone/pull https://github.com/CSSEGISandData/COVID-19

In [2]:
# Path to the confirmed-cases time series inside the local clone of the
# CSSEGISandData/COVID-19 repository. Forward slashes resolve on Windows as well
# as on Linux/macOS, whereas the backslash-separated form only works on Windows.
data_path = "../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
pd_raw = pd.read_csv(data_path)

In [3]:
# Preview the first rows only; with display.max_rows raised to 500 a bare
# `pd_raw` would render the entire frame into the notebook output.
pd_raw.head(10)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/2/21,2/3/21,2/4/21,2/5/21,2/6/21,2/7/21,2/8/21,2/9/21,2/10/21,2/11/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,55121,55174,55231,55265,55330,55335,55359,55384,55402,55420
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,79934,80941,81993,83082,84212,85336,86289,87528,88671,89776
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,107841,108116,108381,108629,108629,109088,109313,109559,109782,110049
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,10017,10070,10137,10172,10206,10251,10275,10312,10352,10391
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,19900,19937,19996,20030,20062,20086,20112,20163,20210,20261
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,249,249,268,277,288,299,316,316,350,381
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,1943548,1952744,1961635,1970009,1976689,1980347,1985501,1993295,2001034,2008345
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,167231,167421,167568,167726,167937,168088,168177,168300,168496,168676
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,118,118,118,118,118,118,118,118,118,118
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,5114,5117,5117,5119,5120,5123,5125,5129,5132,5134


# Web Scraping

In [4]:
# Download the RKI case-numbers page. The timeout keeps the cell from hanging
# indefinitely on an unresponsive server, and raise_for_status() surfaces HTTP
# errors here instead of as a confusing parsing failure in a later cell.
page = requests.get(
    "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
    timeout=30,
)
page.raise_for_status()

In [5]:
# Parse the downloaded page body with the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
# Take the first <table> element of the parsed document.
# NOTE(review): find() returns None when nothing matches — the code that uses
# html_table assumes a table was found.
html_table=soup.find('table')

In [7]:
# Collect every <tr> element found inside that table.
all_rows=html_table.find_all('tr')

In [8]:
# Container for the scraped table: one list of cell texts per table row.
final_data_list=[]

In [9]:
# Extract the text of every <td> cell, one list per table row.
# The list is rebuilt from scratch (rather than appended to), so re-running this
# cell no longer duplicates rows. Rows without <td> cells still yield an empty
# list, which keeps list positions aligned with `all_rows`.
# Soft hyphens (U+00AD), which show up inside the scraped state names, are
# removed so the names compare equal to their plain spelling.
final_data_list = [
    [each_col.get_text(strip=True).replace('\u00ad', '') for each_col in rows.find_all('td')]
    for rows in all_rows
]

In [10]:
# Turn the scraped rows into a frame, discard rows that contain missing values
# and give the first column the label 'state'.
(
    pd.DataFrame(final_data_list)
    .dropna()
    .rename(columns={0: 'state'})
)

Unnamed: 0,state,1,2,3,4,5
2,Baden-Württem­berg,304.257,924.0,5.927,53,7.675
3,Bayern,418.456,1.67,8.197,62,11.547
4,Berlin,124.134,485.0,2.119,58,2.574
5,Branden­burg,72.569,374.0,1.932,77,2.76
6,Bremen,16.797,80.0,444.0,65,306.0
7,Hamburg,48.458,193.0,1.066,58,1.178
8,Hessen,179.467,687.0,3.945,63,5.398
9,Meck­lenburg-Vor­pommern,21.928,167.0,1.072,67,614.0
10,Nieder­sachsen,151.569,1.139,4.814,60,3.773
11,Nord­rhein-West­falen,506.531,1.881,10.64,59,11.963


# Rest API Calls

In [11]:
# Query the ArcGIS feature service for all records (where=1=1, all fields, JSON output).
# The timeout prevents the cell from blocking forever, and raise_for_status()
# stops here on an HTTP error instead of handing an error body to the JSON
# decoding below.
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [12]:
# Decode the raw response body from JSON into Python objects.
json_object=json.loads(data.content)

In [13]:
# Inspect which Python type the decoded payload has.
type(json_object)

dict

In [14]:
# List the top-level keys of the decoded response.
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [15]:
# Collect the 'attributes' record of every entry under 'features'.
# A comprehension replaces the append loop; the unused enumerate index and the
# redundant [:] copy of the feature list are gone.
full_list = [each_dict['attributes'] for each_dict in json_object['features']]

In [16]:
# Tabulate the collected attribute records, one row per record.
pd.DataFrame(full_list)

Unnamed: 0,AGS_TXT,AdmUnitId,Aktualisierung,Death,Fallzahl,GlobalID,LAN_ew_AGS,LAN_ew_BEZ,LAN_ew_EWZ,LAN_ew_GEN,OBJECTID,OBJECTID_1,Shape__Area,Shape__Length,cases7_bl,cases7_bl_per_100k,cases7_bl_per_100k_txt,death7_bl,faelle_100000_EW
0,1,1,1613084400000,1101,39144,fc5ba936-c95c-432c-8a33-9eb2f30b660f,1,Land,2903773,Schleswig-Holstein,15,1,45737310000.0,2881496.0,1786,61.506185,615,11,1348.039258
1,2,2,1613084400000,1178,48458,0f3e860c-5181-4d3f-a421-1d51f50315ea,2,Freie und Hansestadt,1847253,Hamburg,6,2,2089396000.0,418800.2,1066,57.707309,577,3,2623.246518
2,3,3,1613084400000,3773,151569,3fd77024-c29b-4843-9be8-682ad48e60c9,3,Land,7993608,Niedersachsen,9,3,129983600000.0,4008988.0,4814,60.223118,602,14,1896.127506
3,4,4,1613084400000,306,16797,4132268b-54de-4327-ac1e-760e915112f1,4,Freie Hansestadt,681202,Bremen,5,4,1119157000.0,335717.7,444,65.178904,652,1,2465.788415
4,5,5,1613084400000,11963,506531,561d658f-3ee5-46e3-bc95-3528c6558ab9,5,Land,17947221,Nordrhein-Westfalen,10,5,87829360000.0,2648673.0,10640,59.284944,593,38,2822.336673
5,6,6,1613084400000,5398,179467,93277ac4-e8fc-48c7-8940-028dc2ed66af,6,Land,6288080,Hessen,7,6,52359130000.0,2148244.0,3945,62.737751,627,29,2854.082645
6,7,7,1613084400000,2846,97141,e9b4296f-9be2-4e53-9a58-ccf1396cb03d,7,Land,4093903,Rheinland-Pfalz,11,7,47838770000.0,1774430.0,2152,52.565974,526,6,2372.821242
7,8,8,1613084400000,7675,304257,80394ddf-c6a4-4a6e-be8e-0259a81b22a9,8,Land,11100394,Baden-Württemberg,1,8,81517320000.0,2544320.0,5927,53.394501,534,27,2740.956762
8,9,9,1613084400000,11547,418456,1ff920f4-62cd-4a4f-b8c9-f042f2a3e00a,9,Freistaat,13124737,Bayern,2,9,163485500000.0,3898618.0,8197,62.454585,625,32,3188.300078
9,10,10,1613084400000,805,27281,e3396a6f-8a30-4fdf-8df7-def77dd38bea,10,Land,986887,Saarland,12,10,6060692000.0,562678.9,718,72.754024,728,1,2764.348907


# API access via REST service, e.g. USA data
Example of a REST-conformant interface (note: registration is mandatory)
www.smartable.ai

In [17]:
# `requests` is already imported in the notebook's first cell, so it is not
# re-imported here.
url = "https://coronavirus-smartable.p.rapidapi.com/stats/v1/US/"

# Read the API key from a local file. The context manager closes the file
# handle, and strip() keeps surrounding whitespace (e.g. a trailing newline in
# the key file) out of the HTTP header value.
# NOTE(review): this is an absolute, user-specific path — consider making it configurable.
with open("C:\\Users\\Marshall.McDougall\\Documents\\Keys\\smartable-key.txt") as key_file:
    key = key_file.read().strip()

headers = {
    'x-rapidapi-key': key,
    'x-rapidapi-host': "coronavirus-smartable.p.rapidapi.com"
    }

# The timeout avoids hanging on an unresponsive server; raise_for_status() fails
# fast on an HTTP error (e.g. a rejected key) so that an error body is not
# processed as if it were data.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

In [18]:
# Decode the JSON body of the API response and save it to disk, indented by two spaces.
US_dict = json.loads(response.content)
with open("..//data//raw//SMARTABLE//US_data.txt", 'w') as outfile:
    outfile.write(json.dumps(US_dict, indent=2))