In [2]:
import subprocess
import os

import pandas as pd

import requests
from bs4 import BeautifulSoup

import json


pd.set_option('display.max_rows', 500)

![CRISP_DM](CRISP_DM.png )

# Data Understanding

* RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* John Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git 
* REST API services to retreive data https://npgeo-corona-npgeo-de.hub.arcgis.com/

## GITHUB csv data

git clone/pull https://github.com/CSSEGISandData/COVID-19.git

In [11]:
git_pull = subprocess.Popen( "/usr/bin/git pull" , 
                     cwd = os.path.dirname( 'E:/EDS/Project/ads_covid-19/data/raw/COVID-19/' ), 
                     shell = True, 
                     stdout = subprocess.PIPE, 
                     stderr = subprocess.PIPE )
(out, error) = git_pull.communicate()


print("Error : " + str(error)) 
print("out : " + str(out))

Error : b'The system cannot find the path specified.\r\n'
out : b''


In [12]:
data_path='E:/EDS/Project/ads_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw=pd.read_csv(data_path)

In [13]:
pd_raw.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,5/13/20,5/14/20,5/15/20,5/16/20,5/17/20,5/18/20,5/19/20,5/20/20,5/21/20,5/22/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,5226,5639,6053,6402,6664,7072,7653,8145,8676,9216
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,880,898,916,933,946,948,949,964,969,981
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,6253,6442,6629,6821,7019,7201,7377,7542,7728,7918
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,760,761,761,761,761,761,761,762,762,762
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,45,48,48,48,48,50,52,52,58,60


## Webscrapping

In [15]:
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")

In [16]:
soup = BeautifulSoup(page.content, 'html.parser')

In [17]:
html_table = soup.find('table')

In [18]:
all_rows = html_table.find_all('tr')

In [19]:
final_data_list = []

In [21]:
for pos,rows in enumerate (all_rows):
    col_list = [each_col.get_text(strip=True) for each_col in rows.find_all('td')]
    final_data_list.append(col_list)

In [22]:
pd_daily_status=pd.DataFrame(final_data_list).dropna().rename(columns={0:'state',
                                                       1:'cases',
                                                       2:'changes',
                                                       3:'cases_per_100k',
                                                       4:'fatal',
                                                       5:'comment'})

In [23]:
pd_daily_status.head()

Unnamed: 0,state,cases,changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,40.358,185,1.45,131,1.863
3,Bayern,55.414,353,1.835,140,2.634
4,Berlin,10.786,58,416.0,111,226.0
5,Branden­burg,3.795,8,70.0,28,169.0
6,Bremen,1.913,8,62.0,91,56.0


## REST API calls

In [24]:
## data request for Germany
data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

In [28]:
json_object=json.loads(data.content)

In [29]:
type(json_object)

dict

In [30]:
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'geometryType', 'spatialReference', 'fields', 'features'])

In [31]:
full_list=[]
for pos,each_dict in enumerate (json_object['features'][:]):
    full_list.append(each_dict['attributes'])

In [32]:
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID,ADE,GF,BSG,RS,AGS,SDV_RS,GEN,BEZ,IBZ,...,cases,deaths,cases_per_100k,cases_per_population,BL,BL_ID,county,last_update,cases7_per_100k,recovered
0,1,4.0,4.0,1.0,1001,1001,10010000000,Flensburg,Kreisfreie Stadt,40.0,...,72,3,80.443332,0.080443,Schleswig-Holstein,1,SK Flensburg,"25.08.2020, 00:00 Uhr",7.82088,
1,2,4.0,4.0,1.0,1002,1002,10020000000,Kiel,Kreisfreie Stadt,40.0,...,422,11,170.471989,0.170472,Schleswig-Holstein,1,SK Kiel,"25.08.2020, 00:00 Uhr",12.118862,
2,3,4.0,4.0,1.0,1003,1003,10030000000,Lübeck,Kreisfreie Stadt,40.0,...,208,1,95.765154,0.095765,Schleswig-Holstein,1,SK Lübeck,"25.08.2020, 00:00 Uhr",2.762456,
3,4,4.0,4.0,1.0,1004,1004,10040000000,Neumünster,Kreisfreie Stadt,40.0,...,101,3,127.064803,0.127065,Schleswig-Holstein,1,SK Neumünster,"25.08.2020, 00:00 Uhr",6.290337,
4,5,4.0,4.0,1.0,1051,1051,10510044044,Dithmarschen,Kreis,42.0,...,148,4,111.10277,0.111103,Schleswig-Holstein,1,LK Dithmarschen,"25.08.2020, 00:00 Uhr",1.501389,


In [33]:
pd_full_list.to_csv('E:/EDS/Project/ads_covid-19/data/raw/NPGEO/GER_state_data.csv',sep=';')

In [34]:
pd_full_list.shape[0]

412

## API access via REST service, e.g. USA data

example of a REST conform interface (attention registration mandatory)

www.smartable.ai

In [35]:
# US for full list
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': '28ee4219700f48718be78b057beb7eb4',
}

response = requests.get('https://api.smartable.ai/coronavirus/stats/US', headers=headers)
print(response)

<Response [200]>


In [36]:
US_dict=json.loads(response.content) # imports string
with open('E:/EDS/Project/ads_covid-19/data/raw/SMARTABLE/US_data.json', 'w') as outfile:
    json.dump(US_dict, outfile,indent=2)

In [54]:
print(json.dumps(US_dict,indent=2)) #string dump

{
  "location": {
    "long": -95.712891,
    "countryOrRegion": "United States",
    "provinceOrState": null,
    "county": null,
    "isoCode": "US",
    "lat": 37.09024
  },
  "updatedDateTime": "2020-08-21T16:03:19.3794953Z",
  "stats": {
    "totalConfirmedCases": 5680334,
    "newlyConfirmedCases": 236,
    "totalDeaths": 185896,
    "newDeaths": 23,
    "totalRecoveredCases": 2796278,
    "newlyRecoveredCases": 0,
    "history": [
      {
        "date": "2020-01-22T00:00:00",
        "confirmed": 1,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-23T00:00:00",
        "confirmed": 1,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-24T00:00:00",
        "confirmed": 2,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-25T00:00:00",
        "confirmed": 2,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-26T00:00:00",
        "confir

## Individual States US

In [37]:
US_dict['stats']['breakdowns'][0]

{'location': {'long': 144.793731,
  'countryOrRegion': 'United States',
  'provinceOrState': 'Guam',
  'county': None,
  'isoCode': None,
  'lat': 13.444304},
 'totalConfirmedCases': 32,
 'newlyConfirmedCases': 0,
 'totalDeaths': 1,
 'newDeaths': 0,
 'totalRecoveredCases': 0,
 'newlyRecoveredCases': 0}

In [38]:
full_list_US_country=[]
for pos,each_dict in enumerate (US_dict['stats']['breakdowns'][:]):
    flatten_dict=each_dict['location']
    flatten_dict.update(dict(list(US_dict['stats']['breakdowns'][pos].items())[1: 7]) 
    )
    full_list_US_country.append(flatten_dict)

In [39]:
pd.DataFrame(full_list_US_country).to_csv('../data/raw/SMARTABLE/full_list_US_country.csv',sep=';',index=False)