### Exploring data from datahub.io

Acquiring data from [here](https://github.com/datasets/covid-19) and maybe soon be able to automate the integration of that data with wikidata.

Just some things to think about (jvfe):
- How to properly reference the data? Choose [datahub.io](https://datahub.io/core/covid-19) as the reference?
    - They aggregate it from various sources
    
    
- I've acquired the country outbreak items via the following query and modified it slightly to better merge the items.
```
SELECT ?item ?itemLabel ?countryid ?countryidLabel
WHERE 
{
  ?item p:P31 ?statement. 
      ?statement ps:P31 wd:Q3241045. 
      ?statement pq:P642 wd:Q84263196.
      ?statement pq:P3005 ?countryid.
      ?countryid wdt:P31 wd:Q6256.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
```

In [1]:
import pandas as pd

In [2]:
from datetime import date, time, timedelta
yesterday = date.today() - timedelta(days=1)
today = date.today()

yesterday_table = yesterday.strftime("%Y-%m-%d")
today_table = today.strftime("%Y-%m-%d")


In [3]:
countries = pd.read_csv("https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv")
wdt_items = pd.read_csv("../data/country_outbreaks.csv")

In [4]:
full = pd.merge(countries, wdt_items, on="Country")
full

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths,item,itemLabel,countryid
0,2020-01-22,Afghanistan,0,0,0,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889
1,2020-01-23,Afghanistan,0,0,0,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889
2,2020-01-24,Afghanistan,0,0,0,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889
3,2020-01-25,Afghanistan,0,0,0,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889
4,2020-01-26,Afghanistan,0,0,0,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889
...,...,...,...,...,...,...,...,...
13095,2020-04-26,Zimbabwe,31,2,4,Q88164033,2020 coronavirus pandemic in Zimbabwe,Q954
13096,2020-04-27,Zimbabwe,32,5,4,Q88164033,2020 coronavirus pandemic in Zimbabwe,Q954
13097,2020-04-28,Zimbabwe,32,5,4,Q88164033,2020 coronavirus pandemic in Zimbabwe,Q954
13098,2020-04-29,Zimbabwe,32,5,4,Q88164033,2020 coronavirus pandemic in Zimbabwe,Q954


In [5]:
from datetime import datetime
#Most recent data seems to be from the day before

query = "Date == @yesterday_table"
recent = full.query(query)

## that does not happen aways, though.


dates_in_full = [datetime.strptime(date, "%Y-%m-%d") for date in full["Date"]]
most_recent_date = max(dates_in_full).strftime("%Y-%m-%d")

# pd.query did not work
recent = full[full["Date"] == most_recent_date ]

recent.head()

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths,item,itemLabel,countryid
99,2020-04-30,Afghanistan,2171,260,64,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889
199,2020-04-30,Algeria,4006,1779,450,Q87202921,2020 coronavirus pandemic in Algeria,Q262
299,2020-04-30,Angola,27,7,2,Q88082534,2020 coronavirus pandemic in Angola,Q916
399,2020-04-30,Antigua and Barbuda,24,11,3,Q87708331,2020 coronavirus pandemic in Antigua and Barbuda,Q781
499,2020-04-30,Argentina,4428,1256,218,Q87235137,2020 coronavirus pandemic in Argentina,Q414


In [6]:
# The following countries appear to be updated manually from more specific sources.
idx = recent['Country'].isin(['US', 'United Kingdom', 'France', 'Sweden', 'Brazil', 'Netherlands',
                                 'China', 'Italy', 'Spain', 'Germany', 'Iran', 'Mexico', 'Argentina',
                                 'Canada', 'Spain', 'Norway', 'Uruguay'])
not_manual = recent[~idx]

In [7]:
yesterday_wdt = yesterday.strftime("+%Y-%m-%dT00:00:00Z/11")
today_wdt = today.strftime("+%Y-%m-%dT00:00:00Z/11")

with open(f'../data/{today_table}.qs', 'w') as file:
    for index, row in not_manual.iterrows():
        print(
              row['item'] + "|P1603|" + str(int(row['Confirmed'])) + "|P585|" + yesterday_wdt + "|S854|" + '"' + 
                    "https://github.com/datasets/covid-19" + '"' +
                    "|S813|" + today_wdt + "\n" +
              row['item'] + "|P1120|" + str(int(row['Deaths'])) + "|P585|" + yesterday_wdt + "|S854|" + '"' + 
                    "https://github.com/datasets/covid-19" + '"' +
                    "|S813|" + today_wdt + "\n" +
              row['item'] + "|P8010|" + str(int(row['Recovered'])) + "|P585|" + yesterday_wdt + "|S854|" + '"' + 
                    "https://github.com/datasets/covid-19" + '"' +
                    "|S813|" + today_wdt + "\n",
                file = file)

In [8]:
%run check_last_update_for_country_items.py

In [9]:
country_outbreak_items_of_interest = list(recent["item"])

# Api only takes 50 at a time, so we have to cut it.


# implementation from    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def get_chunks(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

        
chunks_of_country_outbreak_items_of_interest = list(get_chunks(country_outbreak_items_of_interest, 50))        
        
outbreak_item_to_timestamp = {}

for chunk in chunks_of_country_outbreak_items_of_interest:
    outbreak_item_to_timestamp.update(get_timestamp_of_last_edits(chunk))


In [10]:
recent["timestamp_of_last_edit"] = recent["item"].map(outbreak_item_to_timestamp)

recent.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths,item,itemLabel,countryid,timestamp_of_last_edit
99,2020-04-30,Afghanistan,2171,260,64,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889,2020-04-30T13:07:10Z


In [11]:
from datetime import datetime

def convert_timestamp_to_time_until_now(timestamp):

    time_in_datetime_format = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
    diff = datetime.now() - time_in_datetime_format
    return(diff)

In [12]:
recent["time_from_last_edit_until_now"] = recent["timestamp_of_last_edit"].map(convert_timestamp_to_time_until_now)

recent.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths,item,itemLabel,countryid,timestamp_of_last_edit,time_from_last_edit_until_now
99,2020-04-30,Afghanistan,2171,260,64,Q87768605,2020 coronavirus pandemic in Afghanistan,Q889,2020-04-30T13:07:10Z,22:42:44.574807


In [13]:
outdated_items = recent[recent["time_from_last_edit_until_now"] > timedelta(hours=23)]

In [14]:
outdated_items.head(5)

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths,item,itemLabel,countryid,timestamp_of_last_edit,time_from_last_edit_until_now
299,2020-04-30,Angola,27,7,2,Q88082534,2020 coronavirus pandemic in Angola,Q916,2020-04-30T10:07:12Z,1 days 01:42:42.574844
399,2020-04-30,Antigua and Barbuda,24,11,3,Q87708331,2020 coronavirus pandemic in Antigua and Barbuda,Q781,2020-04-30T10:07:27Z,1 days 01:42:27.574856
599,2020-04-30,Australia,6766,5742,93,Q83873548,2020 coronavirus pandemic in Australia,Q408,2020-04-30T12:30:06Z,0 days 23:19:48.574876
899,2020-04-30,Bangladesh,7667,160,168,Q87540454,2020 coronavirus pandemic in Bangladesh,Q902,2020-04-30T10:08:50Z,1 days 01:41:04.574905
999,2020-04-30,Barbados,81,39,7,Q87902902,2020 coronavirus pandemic in Barbados,Q244,2020-04-30T10:09:03Z,1 days 01:40:51.574914


In [21]:

table_date_in_wikidata_format = datetime.strptime(outdated_items["Date"].values[0], "%Y-%m-%d").strftime("+%Y-%m-%dT00:00:00Z/11")

point_in_time = "|P585|" +  table_date_in_wikidata_format

today_wdt = today.strftime("+%Y-%m-%dT00:00:00Z/11")


reference_URL = "|S854|" + '"' +  "https://datahub.io/core/covid-19" + '"'
retrieved_in  = "|S813|" + today_wdt 
filename_in_archive = "|S7793|" + '"' +  "r/countries-aggregated.csv" + '"'

reference = reference_URL +  retrieved_in + filename_in_archive


with open(f'../data/{today_table}_outdated_items.qs', 'w') as file:
    for index, row in outdated_items.iterrows():
        print(
              row['item'] + "|P1603|" + str(int(row['Confirmed'])) + point_in_time + reference + "\n" +
              row['item'] + "|P1120|" + str(int(row['Deaths'])) + point_in_time + reference + "\n" +
              row['item'] + "|P8010|" + str(int(row['Recovered'])) + point_in_time + reference                    + "\n",
                file = file)