In [1]:
from io import StringIO # library to wrap HTML text in a file-like object

import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

# Download the Wikipedia template page that holds the COVID-19 death-rate table.
wikiurl="https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_death_rates"
# NOTE(review): table_class is not read by any of the visible cells; it is kept
# in case a later cell depends on it.
table_class="wikitable sortable jquery-tablesorter"
# requests applies no timeout by default, so without one this cell could hang
# forever on an unresponsive server.
response=requests.get(wikiurl, timeout=30)
print(response.status_code)
# Stop on a 4xx/5xx answer instead of handing an error page to the parser below.
response.raise_for_status()

200


In [2]:
# Parse the downloaded page and take the first <table> carrying the "wikitable" class.
soup = BeautifulSoup(response.text, 'html.parser')
casetable=soup.find('table',{'class':"wikitable"})
# soup.find returns None when nothing matches; fail with a clear message here
# rather than letting pandas try to parse the literal text "None".
if casetable is None:
    raise ValueError("No table with class 'wikitable' found at " + response.url)
# pd.read_html returns a list of DataFrames. The markup is wrapped in StringIO
# because newer pandas releases deprecate passing literal HTML strings.
df=pd.read_html(StringIO(str(casetable)))
# convert list to dataframe
cases_df=pd.DataFrame(df[0])
print(cases_df)

                    Country  Confirmed cases  Deaths Case fatality rate  \
0                San Marino             2333      59               2.5%   
1                   Belgium           641411   19361               3.0%   
2                  Slovenia           117305    2631               2.2%   
3    Bosnia and Herzegovina           110454    4024               3.6%   
4                     Italy          2067487   73029               3.5%   
..                      ...              ...     ...                ...   
166        Papua New Guinea              780       9               1.2%   
167                Thailand             6690      61               0.9%   
168                Tanzania              509      21               4.1%   
169                 Vietnam             1454      35               2.4%   
170                  Taiwan              795       7               0.9%   

     Deaths per 100,000 population  
0                           174.63  
1                        

In [3]:
# Reference table read from the working directory; the cells below use its
# "countryLabel" and "item" columns.
reference_csv = "reference.csv"
local_outbreak_items = pd.read_csv(reference_csv)

In [4]:
# First pass: join the scraped table to the reference items on the country
# name and discard rows that are exact duplicates.
reconciled = pd.merge(
    cases_df,
    local_outbreak_items,
    left_on="Country",
    right_on="countryLabel",
).drop_duplicates()

In [5]:
# Countries from the scraped table that did not survive the merge.
set(cases_df["Country"]).difference(reconciled["Country"])

{'Bahamas',
 'China',
 'Congo',
 'DR Congo',
 'Gambia',
 'Palestine',
 'United States'}

In [6]:
# Country names as spelled in the Wikipedia template, paired with the
# replacement labels that let them match during the merge.
template_to_label = dict([
    ('Bahamas', 'The Bahamas'),
    ('China', 'mainland China'),
    ('Congo', 'Republic of the Congo'),
    ('DR Congo', 'Democratic Republic of the Congo'),
    ('Gambia', 'The Gambia'),
    ('Palestine', 'State of Palestine'),
    ('United States', 'United States of America'),
])

In [7]:
# Rewrite the template's country names using the mapping defined above.
cases_df["Country"] = cases_df["Country"].replace(to_replace=template_to_label)

In [8]:
# Second pass: repeat the join now that the country names have been rewritten,
# again discarding rows that are exact duplicates.
reconciled = pd.merge(
    cases_df,
    local_outbreak_items,
    left_on="Country",
    right_on="countryLabel",
).drop_duplicates()

In [9]:
# Re-check for countries missing from the merged table (an empty set means none).
set(cases_df["Country"]).difference(reconciled["Country"])

set()

In [10]:
# Show the merged table: scraped figures next to the matching "item" and labels.
reconciled

Unnamed: 0,Country,Confirmed cases,Deaths,Case fatality rate,"Deaths per 100,000 population",item,countryLabel,itemLabel
0,San Marino,2333,59,2.5%,174.63,Q87123561,San Marino,2020 COVID-19 pandemic in San Marino
1,Belgium,641411,19361,3.0%,169.51,Q84446340,Belgium,2020 COVID-19 pandemic in Belgium
2,Slovenia,117305,2631,2.2%,127.26,Q87250948,Slovenia,COVID-19 pandemic in Slovenia
3,Bosnia and Herzegovina,110454,4024,3.6%,121.06,Q87250770,Bosnia and Herzegovina,2020 COVID-19 pandemic in Bosnia and Herzegovina
4,Italy,2067487,73029,3.5%,120.85,Q84104992,Italy,COVID-19 pandemic in Italy
...,...,...,...,...,...,...,...,...
169,Papua New Guinea,780,9,1.2%,0.10,Q88137634,Papua New Guinea,2020 COVID-19 pandemic in Papua New Guinea
170,Thailand,6690,61,0.9%,0.09,Q83873566,Thailand,2020 COVID-19 pandemic in Thailand
171,Tanzania,509,21,4.1%,0.04,Q87770827,Tanzania,2020 COVID-19 pandemic in Tanzania
172,Vietnam,1454,35,2.4%,0.04,Q83873057,Vietnam,COVID-19 pandemic in Vietnam


In [11]:
def percent_to_fraction(value, ndigits=6):
    """Turn a percentage string such as "3.6%" into the fraction 0.036.

    Args:
        value: One cell of the "Case fatality rate" column.
        ndigits: Number of decimal places kept after dividing by 100.

    Returns:
        The fraction as a float when ``value`` is a string; any non-string
        value is returned unchanged.

    Raises:
        ValueError: If the text left after removing "%" is not a number.
    """
    if not isinstance(value, str):
        # Already numeric (for example, this cell was run before): dividing
        # again would silently shrink the rate by another factor of 100.
        return value
    # Binary floats turn 3.6 / 100 into 0.036000000000000004; rounding strips
    # that noise so it is not carried into the values used later on.
    return round(float(value.replace("%", "")) / 100, ndigits)


reconciled["Case fatality rate"] = reconciled["Case fatality rate"].map(percent_to_fraction)

In [12]:
# Show the table again to check that "Case fatality rate" now holds fractions.
reconciled

Unnamed: 0,Country,Confirmed cases,Deaths,Case fatality rate,"Deaths per 100,000 population",item,countryLabel,itemLabel
0,San Marino,2333,59,0.025,174.63,Q87123561,San Marino,2020 COVID-19 pandemic in San Marino
1,Belgium,641411,19361,0.030,169.51,Q84446340,Belgium,2020 COVID-19 pandemic in Belgium
2,Slovenia,117305,2631,0.022,127.26,Q87250948,Slovenia,COVID-19 pandemic in Slovenia
3,Bosnia and Herzegovina,110454,4024,0.036,121.06,Q87250770,Bosnia and Herzegovina,2020 COVID-19 pandemic in Bosnia and Herzegovina
4,Italy,2067487,73029,0.035,120.85,Q84104992,Italy,COVID-19 pandemic in Italy
...,...,...,...,...,...,...,...,...
169,Papua New Guinea,780,9,0.012,0.10,Q88137634,Papua New Guinea,2020 COVID-19 pandemic in Papua New Guinea
170,Thailand,6690,61,0.009,0.09,Q83873566,Thailand,2020 COVID-19 pandemic in Thailand
171,Tanzania,509,21,0.041,0.04,Q87770827,Tanzania,2020 COVID-19 pandemic in Tanzania
172,Vietnam,1454,35,0.024,0.04,Q83873057,Vietnam,COVID-19 pandemic in Vietnam


In [13]:
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.wdi_helpers import try_write
import os
import pandas as pd
import pprint
from IPython.display import clear_output
from getpass import getpass

# Account used for the edits. The password is requested interactively through
# getpass instead of being written into the notebook.
WBUSER = "TiagoLubiana"
WBPASS = getpass('Enter your password: ')
login = wdi_login.WDLogin(user=WBUSER, pwd=WBPASS)


Enter your password:  ············


https://www.wikidata.org/w/api.php
Successfully logged in as TiagoLubiana


In [14]:
from datetime import datetime

# Current local date rendered as a "+YYYY-MM-DDT00:00:00Z" string; the time
# part is always the literal midnight written in the format.
current_moment = datetime.now()
today_in_wikidata_format = current_moment.strftime('+%Y-%m-%dT00:00:00Z')
today_in_wikidata_format

'+2020-12-30T00:00:00Z'

In [15]:
# Build one statement per row of `reconciled` and write them all to a single
# Wikidata item in one call.
#
# Every statement has the shape: s --p--> row["item"], qualified with the row's
# deaths (q1), confirmed cases (q2), case fatality rate (q3) and today's date
# (q4), and referenced with the two URLs (r1, r3) and the item or2 (r2).

# Identifiers that are the same for every row, assigned once instead of per row.
s = "Q102044164"   # item that receives the statements
p = "P527"         # property of the main statement
q1 = "P1120"       # qualifier filled from row["Deaths"]
q2 = "P1603"       # qualifier filled from row["Confirmed cases"]
q3 = "P3457"       # qualifier filled from row["Case fatality rate"]
q4 = "P585"        # qualifier filled with today's date
oq4 = today_in_wikidata_format
r1 = "P854"
or1 = "https://coronavirus.jhu.edu/data/mortality"
r2 = "P143"
or2 = "Q328"
r3 = "P4656"
or3 = "https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_death_rates"

# Fail early with a clear message instead of sending an empty statement list.
if reconciled.empty:
    raise ValueError("reconciled has no rows; nothing to write to " + s)

statements = []
for _, row in reconciled.iterrows():
    # Dividing a percentage by 100 leaves binary float noise such as
    # 0.036000000000000004; round it away before it is handed to WDQuantity.
    fatality_rate = round(float(row["Case fatality rate"]), 6)

    qualifier_list = [
        wdi_core.WDQuantity(row["Deaths"], q1, is_qualifier=True),
        wdi_core.WDQuantity(row["Confirmed cases"], q2, is_qualifier=True),
        wdi_core.WDQuantity(fatality_rate, q3, is_qualifier=True),
        wdi_core.WDTime(oq4, q4, is_qualifier=True),
    ]

    # A single reference block holding all three reference values.
    reference_list = [[
        wdi_core.WDUrl(or1, r1, is_reference=True),
        wdi_core.WDItemID(or2, r2, is_reference=True),
        wdi_core.WDUrl(or3, r3, is_reference=True),
    ]]

    statements.append(
        wdi_core.WDItemID(
            value=row["item"],
            prop_nr=p,
            qualifiers=qualifier_list,
            references=reference_list,
        )
    )

item = wdi_core.WDItemEngine(wd_item_id=s, data=statements)
item.write(login)

'Q102044164'