In [6]:
from io import StringIO # in-memory text buffer handed to pd.read_html

import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

# Download the Wikipedia template page that carries the COVID-19 death-rate table.
wikiurl="https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_death_rates"
# NOTE(review): table_class is not referenced by any of the cells visible here.
table_class="wikitable sortable jquery-tablesorter"
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
response=requests.get(wikiurl, timeout=30)
# Fail immediately on a 4xx/5xx answer instead of parsing an error page later on.
response.raise_for_status()
print(response.status_code)

200


In [7]:
# Parse the downloaded page and pick the first <table> carrying the "wikitable" class.
soup = BeautifulSoup(response.text, 'html.parser')
casetable=soup.find('table',{'class':"wikitable"})
if casetable is None:
    # Without this guard str(None) == "None" would be handed to read_html,
    # producing an error that says nothing about the missing table.
    raise ValueError("No table with class 'wikitable' found in the downloaded page")

# read_html returns a list of DataFrames; wrapping the markup in StringIO avoids
# the deprecated "literal HTML string" input path of newer pandas releases.
df=pd.read_html(StringIO(str(casetable)))
# The single parsed table is the per-country frame used by the following cells.
cases_df=df[0]
print(cases_df)

              Country  Confirmed cases  Deaths Case fatality rate  \
0          San Marino             2176      56               2.6%   
1             Belgium           629109   18821               3.0%   
2            Slovenia           107805    2418               2.2%   
3                Peru           998475   37173               3.7%   
4               Italy          1977370   69842               3.5%   
..                ...              ...     ...                ...   
166  Papua New Guinea              761       9               1.2%   
167          Thailand             5716      60               1.0%   
168          Tanzania              509      21               4.1%   
169           Vietnam             1420      35               2.5%   
170            Taiwan              770       7               0.9%   

     Deaths per 100,000 population  
0                           165.75  
1                           164.78  
2                           116.96  
3                      

In [10]:
# Reference table that is joined to the scraped data on its "countryLabel" column.
local_outbreak_items = pd.read_csv(filepath_or_buffer="reference.csv")

In [11]:
# Inner join: keeps only the scraped countries whose name has an exact match in
# the reference table, then removes fully duplicated rows.
reconciled = pd.merge(
    cases_df,
    local_outbreak_items,
    left_on="Country",
    right_on="countryLabel",
).drop_duplicates()

In [12]:
# Countries present in the scraped table but lost by the join above.
set(cases_df["Country"]).difference(reconciled["Country"])

{'Bahamas',
 'China',
 'Congo',
 'DR Congo',
 'Gambia',
 'Palestine',
 'United States'}

In [13]:
# Maps the country names used by the Wikipedia template to the labels used in
# the reference table, for the names that did not match on the first join.
template_to_label = dict(
    [
        ('Bahamas', 'The Bahamas'),
        ('China', 'mainland China'),
        ('Congo', 'Republic of the Congo'),
        ('DR Congo', 'Democratic Republic of the Congo'),
        ('Gambia', 'The Gambia'),
        ('Palestine', 'State of Palestine'),
        ('United States', 'United States of America'),
    ]
)

In [14]:
# Rewrite the template's country names to the reference-table labels; names
# without an entry in the mapping are left as they are.
cases_df["Country"] = cases_df["Country"].replace(to_replace=template_to_label)

In [15]:
# Repeat the inner join now that the country names have been relabelled, and
# drop fully duplicated rows again.
reconciled = pd.merge(
    cases_df,
    local_outbreak_items,
    left_on="Country",
    right_on="countryLabel",
).drop_duplicates()

In [16]:
# Re-check for scraped countries missing from the join; an empty set means
# every country was matched.
set(cases_df["Country"]).difference(reconciled["Country"])

set()

In [17]:
# Show the joined frame to inspect the result of the reconciliation.
reconciled

Unnamed: 0,Country,Confirmed cases,Deaths,Case fatality rate,"Deaths per 100,000 population",item,countryLabel,itemLabel
0,San Marino,2176,56,2.6%,165.75,Q87123561,San Marino,2020 COVID-19 pandemic in San Marino
1,Belgium,629109,18821,3.0%,164.78,Q84446340,Belgium,2020 COVID-19 pandemic in Belgium
2,Slovenia,107805,2418,2.2%,116.96,Q87250948,Slovenia,COVID-19 pandemic in Slovenia
3,Peru,998475,37173,3.7%,116.20,Q87587763,Peru,COVID-19 pandemic in Peru
4,Italy,1977370,69842,3.5%,115.57,Q84104992,Italy,COVID-19 pandemic in Italy
...,...,...,...,...,...,...,...,...
169,Papua New Guinea,761,9,1.2%,0.10,Q88137634,Papua New Guinea,2020 COVID-19 pandemic in Papua New Guinea
170,Thailand,5716,60,1.0%,0.09,Q83873566,Thailand,2020 COVID-19 pandemic in Thailand
171,Tanzania,509,21,4.1%,0.04,Q87770827,Tanzania,2020 COVID-19 pandemic in Tanzania
172,Vietnam,1420,35,2.5%,0.04,Q83873057,Vietnam,COVID-19 pandemic in Vietnam


In [26]:
def _percent_to_fraction(value):
    """Convert a percentage string such as "2.6%" into a fraction (0.026).

    Values that are not strings are returned unchanged. This keeps the cell
    safe to re-run after the column has already been converted to floats, and
    lets missing values pass through instead of raising AttributeError.
    """
    if isinstance(value, str):
        return float(value.replace("%", "")) / 100
    return value


reconciled["Case fatality rate"] = [
    _percent_to_fraction(rate) for rate in reconciled["Case fatality rate"]
]

In [27]:
# Show the frame again to check the converted "Case fatality rate" column.
reconciled

Unnamed: 0,Country,Confirmed cases,Deaths,Case fatality rate,"Deaths per 100,000 population",item,countryLabel,itemLabel
0,San Marino,2176,56,0.026,165.75,Q87123561,San Marino,2020 COVID-19 pandemic in San Marino
1,Belgium,629109,18821,0.030,164.78,Q84446340,Belgium,2020 COVID-19 pandemic in Belgium
2,Slovenia,107805,2418,0.022,116.96,Q87250948,Slovenia,COVID-19 pandemic in Slovenia
3,Peru,998475,37173,0.037,116.20,Q87587763,Peru,COVID-19 pandemic in Peru
4,Italy,1977370,69842,0.035,115.57,Q84104992,Italy,COVID-19 pandemic in Italy
...,...,...,...,...,...,...,...,...
169,Papua New Guinea,761,9,0.012,0.10,Q88137634,Papua New Guinea,2020 COVID-19 pandemic in Papua New Guinea
170,Thailand,5716,60,0.010,0.09,Q83873566,Thailand,2020 COVID-19 pandemic in Thailand
171,Tanzania,509,21,0.041,0.04,Q87770827,Tanzania,2020 COVID-19 pandemic in Tanzania
172,Vietnam,1420,35,0.025,0.04,Q83873057,Vietnam,COVID-19 pandemic in Vietnam


In [28]:
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.wdi_helpers import try_write
import os
import pandas as pd
import pprint
from IPython.display import clear_output
from getpass import getpass

# Account name used for the Wikidata session.
WBUSER = "TiagoLubiana"
# Ask for the password interactively so it is never stored in the notebook source.
WBPASS = getpass('Enter your password: ')
# Session object that the write call needs later on.
login = wdi_login.WDLogin(user=WBUSER, pwd=WBPASS)


Enter your password:  ············


https://www.wikidata.org/w/api.php
Successfully logged in as TiagoLubiana


In [49]:
# Build one "has part" (P527) statement per reconciled country on the target
# item, each qualified with the scraped figures and referenced to its sources,
# then write all statements in a single edit.

# Values that are the same for every row are defined once, outside the loop.
TARGET_ITEM = "Q102044164"
HAS_PART = "P527"

# Qualifier properties. The original cell had the first two swapped: it wrote
# the death count under P1603 (number of cases) and the case count under P1120
# (number of deaths).
# NOTE(review): statements already uploaded by the earlier version of this cell
# carry the swapped figures and need to be corrected on Wikidata.
NUMBER_OF_CASES = "P1603"
NUMBER_OF_DEATHS = "P1120"
CASE_FATALITY_RATE = "P3457"
POINT_IN_TIME = "P585"
SNAPSHOT_DATE = "+2020-12-23T00:00:00Z"

# Reference properties and their values.
REFERENCE_URL = "P854"
SOURCE_URL = "https://coronavirus.jhu.edu/data/mortality"
IMPORTED_FROM = "P143"
ENGLISH_WIKIPEDIA = "Q328"
WIKIMEDIA_IMPORT_URL = "P4656"
TEMPLATE_URL = "https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_death_rates"

statements = []
for _, row in reconciled.iterrows():
    # Fresh qualifier/reference objects per row, exactly as the original did,
    # so no object is shared between statements.
    qualifier_list = [
        wdi_core.WDQuantity(row["Confirmed cases"], NUMBER_OF_CASES, is_qualifier=True),
        wdi_core.WDQuantity(row["Deaths"], NUMBER_OF_DEATHS, is_qualifier=True),
        wdi_core.WDQuantity(row["Case fatality rate"], CASE_FATALITY_RATE, is_qualifier=True),
        wdi_core.WDTime(SNAPSHOT_DATE, POINT_IN_TIME, is_qualifier=True),
    ]

    # A single reference block holding all three source claims.
    reference_list = [[
        wdi_core.WDUrl(SOURCE_URL, REFERENCE_URL, is_reference=True),
        wdi_core.WDItemID(ENGLISH_WIKIPEDIA, IMPORTED_FROM, is_reference=True),
        wdi_core.WDUrl(TEMPLATE_URL, WIKIMEDIA_IMPORT_URL, is_reference=True),
    ]]

    statements.append(
        wdi_core.WDItemID(
            value=row["item"],
            prop_nr=HAS_PART,
            qualifiers=qualifier_list,
            references=reference_list,
        )
    )

item = wdi_core.WDItemEngine(wd_item_id=TARGET_ITEM, data=statements)
# Left as the cell's last expression so its return value is still displayed.
item.write(login)

'Q102044164'