![](CRISP_DM.png)

In [1]:
# importing the required packages

import subprocess
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# Show up to 500 rows when a DataFrame is displayed
pd.options.display.max_rows = 500

# Data Understanding
 Here we have three options to obtain the data. They are given below:
*   Johns Hopkins (GitHub) https://github.com/CSSEGISandData/COVID-19.git
*   RKI (web scraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
*   REST API services to retrieve data https://npgeo-corona-npgeo-de.hub.arcgis.com/

NOTE: Each of these methods is explained briefly below.



# 1. Johns Hopkins (GitHub)

* getting the data from the following link by using the command:
        git clone/pull https://github.com/CSSEGISandData/COVID-19.git

In [2]:
# Update the local clone of the Johns Hopkins repository. The clone must
# already exist in ../data/raw/COVID-19 (see the `git clone` command above).

# An argument list without shell=True runs git directly instead of going
# through a shell.
git_pull = subprocess.run(["git", "pull"],
                          cwd='../data/raw/COVID-19',
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)

# Decode the captured bytes so the log is readable instead of a b'...' repr.
out = git_pull.stdout.decode(errors='replace')
error = git_pull.stderr.decode(errors='replace')

# git also writes normal progress information to stderr, so only the exit
# status tells whether the pull really failed. A failure is reported but does
# not stop the notebook: the previously pulled data can still be used.
if git_pull.returncode != 0:
    print("git pull FAILED with exit code " + str(git_pull.returncode))

print("stderr: " + error)
print("out :" + out)

Error: b'From https://github.com/CSSEGISandData/COVID-19\n   b7ffac4f..dd98296e  master     -> origin/master\n   41bb2630..2e3a793d  web-data   -> origin/web-data\n'
out :b'Updating b7ffac4f..dd98296e\nFast-forward\n csse_covid_19_data/csse_covid_19_daily_reports_us/08-18-2020.csv | 4 ++--\n 1 file changed, 2 insertions(+), 2 deletions(-)\n'


In [3]:
# Path (relative to the notebook folder) of the global confirmed-cases time series
data_path = ('../data/raw/COVID-19/csse_covid_19_data/'
             'csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')

# Load the raw time series into a DataFrame
pd_raw = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Preview the first five rows of the raw time series
pd_raw.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,8/10/20,8/11/20,8/12/20,8/13/20,8/14/20,8/15/20,8/16/20,8/17/20,8/18/20,8/19/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,37162,37269,37345,37424,37431,37551,37596,37599,37599,37599
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,6536,6676,6817,6971,7117,7260,7380,7499,7654,7812
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,35712,36204,36699,37187,37664,38133,38583,39025,39444,39847
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,963,963,977,981,989,989,989,1005,1005,1024
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1679,1735,1762,1815,1852,1879,1906,1935,1966,2015


# Web scraping
+ RKI (web scraping): [Robert Koch Institute website with case counts](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html)
    

In [5]:
# Download the RKI case-count page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page=requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
                  timeout=30)
page.raise_for_status()

In [6]:
# Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [7]:
# Locate the first <table> element of the parsed page
# (presumably the table holding the case counts - verify if the page layout changes)
html_table=soup.find('table')

# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in the next cell
if html_table is None:
    raise ValueError("no <table> element found on the RKI page - the page layout may have changed")

In [8]:
# Collect every table row (<tr> element) inside the table
all_rows = html_table.find_all(name='tr')

In [9]:
# Container for the scraped rows (one list of cell texts per table row)
final_data_list = list()

In [10]:
# Build one list of stripped cell texts per table row.
# A fresh list is assigned (instead of appending to the existing one) so that
# re-running this cell does not duplicate the rows.
# Rows without any <td> element yield an empty list.
final_data_list = [
    [each_col.get_text(strip=True) for each_col in rows.find_all('td')]  # 'td' for each individual data element
    for rows in all_rows
]
       

In [11]:
# Readable names for the positional columns of the scraped table
# NOTE(review): the names follow the original notebook; confirm they still
# match the column order of the RKI table.
column_names = {
    0: 'state',
    1: 'cases',
    2: 'changes',
    3: 'cases_per_100k',
    4: 'fatal',
    5: 'comment',
}

# Build the DataFrame, drop rows with missing cells, then rename the columns
pd_daily_status = (
    pd.DataFrame(final_data_list)
    .dropna()
    .rename(columns=column_names)
)

In [12]:
# Preview the first five scraped rows
pd_daily_status.head(n=5)

Unnamed: 0,state,cases,changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,39.227,259,959.0,87,1.86
3,Bayern,54.101,394,1.595,122,2.631
4,Berlin,10.463,69,400.0,107,224.0
5,Branden­burg,3.74,12,60.0,24,169.0
6,Bremen,1.877,12,55.0,81,56.0


# REST API calls

In [13]:
# Request the data for the German federal states from the ArcGIS REST service.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error body.
data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                  timeout=30)
data.raise_for_status()

In [14]:
# Parse the JSON body of the response into Python objects
json_object = json.loads(data.content)

# Show which Python type the parsed payload has
type(json_object)

dict

In [15]:
# Collect the 'attributes' dictionary of every feature in 'json_object' into 'full_list'
full_list = [feature['attributes'] for feature in json_object['features']]

In [16]:
# Build a DataFrame from the attribute records and preview the first five rows
pd_full_list = pd.DataFrame(data=full_list)
pd_full_list.head(n=5)

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,3851,1597874400000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,132.943834,45737310000.0,2881496.0,160
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,5954,1597874400000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,323.379747,2089396000.0,418800.2,264
2,3,3,Niedersachsen,Land,7982448,9,15724,1597874400000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,196.982179,129983600000.0,4008988.0,658
3,4,4,Bremen,Freie Hansestadt,682986,5,1877,1597874400000,4,4132268b-54de-4327-ac1e-760e915112f1,274.822617,1119157000.0,335717.7,56
4,5,5,Nordrhein-Westfalen,Land,17932651,10,55770,1597874400000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,310.996963,87829360000.0,2648673.0,1790


In [17]:
# Save the dataframe as a semicolon-separated CSV file.
# The target folder is created first so that a fresh checkout does not fail
# on a missing directory.
os.makedirs('../data/raw/NPGEO', exist_ok=True)
pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';')


# API access via REST service, e.g. USA data

Example of a REST-compliant interface (note: registration is mandatory)

www.smartable.ai


In [18]:
url_endpoint='https://api.smartable.ai/coronavirus/stats/US'

# The subscription key is a secret and must not be stored in the notebook.
# Provide it through the environment before starting Jupyter, e.g.
#     export SMARTABLE_SUBSCRIPTION_KEY=<your key>
# NOTE: a key that was previously hard-coded here should be considered leaked and be revoked.
subscription_key = os.environ.get('SMARTABLE_SUBSCRIPTION_KEY')
if not subscription_key:
    raise RuntimeError("Set the SMARTABLE_SUBSCRIPTION_KEY environment variable "
                       "to your smartable.ai subscription key")

# defining header
headers ={
# Request headers
'Cache-Control': 'no-cache',
'Subscription-Key': subscription_key,
}

# the timeout keeps the cell from hanging forever on a stalled connection
response= requests.get(url_endpoint,headers=headers,timeout=30)

In [19]:
# Check the response: raise on an HTTP error status so the following cells
# never store or parse an error payload, then show the response object.
response.raise_for_status()
print(response)

<Response [200]>


In [20]:
# store information to data file

# 1. converting the JSON response body to a dictionary
US_dict= json.loads(response.content)

# 2. writing the dictionary as indented JSON to 'US_data.txt'; the target
#    folder is created first so that a fresh checkout does not fail on a
#    missing directory
os.makedirs('../data/raw/SMARTABLE', exist_ok=True)
with open ('../data/raw/SMARTABLE/US_data.txt','w') as outfile: # open data file
    json.dump(US_dict,outfile,indent=2)

# Individual US States

In [21]:
# Inspect the first breakdown record to see the structure of a single entry
first_breakdown = US_dict['stats']['breakdowns'][0]
first_breakdown

{'location': {'long': 144.793731,
  'countryOrRegion': 'United States',
  'provinceOrState': 'Guam',
  'county': None,
  'isoCode': None,
  'lat': 13.444304},
 'totalConfirmedCases': 32,
 'newlyConfirmedCases': 0,
 'totalDeaths': 1,
 'newDeaths': 0,
 'totalRecoveredCases': 0,
 'newlyRecoveredCases': 0}

In [22]:
# Flatten every breakdown record: the nested 'location' fields and the
# remaining fields of the record end up side by side in one flat dictionary.
full_list_US_country=[]
for each_dict in US_dict['stats']['breakdowns']:
    # copy the location so that the nested dictionaries inside US_dict are not modified
    flatten_dict=dict(each_dict['location'])
    # take every field except 'location' by key, instead of relying on the
    # position of the fields inside the record
    flatten_dict.update({key: value for key, value in each_dict.items() if key != 'location'})
    full_list_US_country.append(flatten_dict)

In [23]:
# Export the flattened records as a semicolon-separated CSV without the index column
pd_US_country = pd.DataFrame(full_list_US_country)
pd_US_country.to_csv('../data/raw/SMARTABLE/full_list_US_country.csv', sep=';', index=False)