# Get COVID Related Wikipedia Pages

This notebook retrieves COVID-19 related Wikipedia pages from the Wikimedia dumps created on the configured sample date.  
You can reuse the code below to extract Wikipedia pages on other topics from the dumps.

**Set up values**  
* Target Language (2-letter language code needed)
* Seed QIDs (QIDs of the target Wikidata articles)
* Sample Date

**Procedure of retrieving**  
1. Wikidata Pageids(in Pagelinks)
2. Wikidata QIDs
3. For each language
   - Get Wikipedia Pageids
   - Get Page Title
   - Set Category of articles

In [68]:
import csv
import glob
import gzip
import json
import os
import re

import pandas as pd
import requests
import wget

def download_dumps(languages,date):
    """Download the Wikimedia SQL dumps used by this notebook.

    Fetches the Wikidata ``pagelinks`` and ``page`` dumps and, for every
    requested language, that Wikipedia's ``page_props`` dump. All files are
    saved into a ``dumps`` folder under the current working directory.

    Parameters
    ----------
    languages : iterable of str
        Wiki language codes (e.g. ``'en'``); each one is expanded to
        ``{code}wiki`` in the dump URL.
    date : str
        Dump date exactly as it appears in the dump URLs (e.g. ``'20201120'``).
    """
    # Use one and the same relative folder for the existence check, the
    # creation and the download target (the filesystem root "/dumps" is
    # normally not writable and is not the folder that gets created).
    destination_folder = os.path.abspath("dumps")
    os.makedirs(destination_folder, exist_ok=True)

    #Wikidata Pagelinks & Pages - SQL File
    wikidata_urls = [
        "https://dumps.wikimedia.org/wikidatawiki/{}/wikidatawiki-{}-pagelinks.sql.gz".format(date,date),
        "https://dumps.wikimedia.org/wikidatawiki/{}/wikidatawiki-{}-page.sql.gz".format(date,date),
    ]
    for url in wikidata_urls:
        wget.download(url,out = destination_folder)

    #Wikipedia - SQL File
    for language in languages:
        url = "https://dumps.wikimedia.org/{}wiki/{}/{}wiki-{}-page_props.sql.gz".format(language,date,language,date)
        wget.download(url,out = destination_folder)

## Set Target Data

In [55]:
# Dump snapshot to sample (20 November)
date = '20201120'

# Wikipedia language editions to process
lang = [
    'en', 'ko', 'ar', 'de', 'es', 'fr',
    'it', 'ja', 'pt', 'zh', 'ru',
]

# QIDs of the Wikidata seed articles
seeds = [
    'Q81068910',
    'Q82069695',
    'Q84263196',
]

# Fetch the dumps for the languages and snapshot configured above
download_dumps(languages=lang, date=date)

## 1. Extract Wikidata Pageid from pagelinks

In [None]:
#Get seeds from pagelink.sql file (Alternative : "|".join(seeds))
command = 'grep -E "{}" wikidatawiki-{}-pagelinks.sql > COVID_item_zgrepmatch.txt'.format("|".join(seeds),date)
!$command

#Get wikidata page ids which linked to COVID-19 seed items
wikidataPagesIds = []
f = open('COVID_item_zgrepmatch.txt')
for l  in f:
    for s in seeds:
        wikidataPagesIds.extend(re.findall("\(([0-9]+),0,\'%s\'" % s,l))

## 2. Get Wikidata Article QIDs

In [63]:
#DB Connection : Before running the parts below, import the downloaded dump sql files into the DB
import pymysql

#Get related QIDs from wikidata page ids (dump filename : wikidata_page.sql.gz)
COVID_related_QIDs = []

# De-duplicate the ids and cast them to int so they are sent as numeric query
# parameters instead of being pasted into the SQL text.
page_ids = sorted({int(page_id) for page_id in wikidataPagesIds})

if page_ids:  # an empty "IN ()" list is a SQL syntax error, so skip the query
    # NOTE(review): connection settings are hard-coded; consider moving them to
    # the configuration cell or to environment variables.
    conn = pymysql.connect(host='node200',user='jaehyeon',password='', db='testdb',charset='utf8')
    try:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            placeholders = ','.join(['%s'] * len(page_ids))
            sql = "SELECT page_id,page_title FROM page WHERE page_id IN ({})".format(placeholders)
            cursor.execute(sql, page_ids)
            for page in cursor.fetchall():
                title = page['page_title'].decode('utf8')
                # Keep only titles starting with 'Q'; startswith also copes
                # with an empty title, where indexing [0] would raise.
                if title.startswith('Q'):
                    COVID_related_QIDs.append(title)
    finally:
        # Release the connection even when the query fails.
        conn.close()

## 3.1 Get Wikipedia PageIDs

In [None]:
#Retrieve pageid for each language (dump filename : wikipedia_page_prop.sql.gz)
# Every language gets an entry, even when nothing is found for it, because the
# later cells index wikidata_dict by language.
wikidata_dict = {lan: {} for lan in lang}

if COVID_related_QIDs:  # an empty "IN ()" list is a SQL syntax error
    conn = pymysql.connect(host='node200',user='jaehyeon',password='', db='testdb',charset='utf8')
    try:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            # The QIDs are passed as query parameters; only the table name,
            # which comes from the configured language list, is formatted in.
            placeholders = ','.join(['%s'] * len(COVID_related_QIDs))
            for lan in lang:
                sql = (f"SELECT pp_page,pp_propname,pp_value FROM {lan}_page_props "
                       f"WHERE pp_value IN ({placeholders}) AND pp_propname = 'wikibase_item'")
                cursor.execute(sql, COVID_related_QIDs)
                id_dict = {}
                for page in cursor.fetchall():
                    qid = page['pp_value'].decode()
                    id_dict[qid] = {'QID' : qid,
                                    'PageID' : page['pp_page'],
                                    'Language' : lan+"wiki"}
                wikidata_dict[lan] = id_dict
    finally:
        # Release the connection even when one of the queries fails.
        conn.close()

## 3.2 Get Wikipedia Title 

In [None]:
# Number of QIDs sent per API request (presumably chosen to stay within the
# API's per-request id limit -- confirm against the wbgetentities documentation).
n = 50
for language in lang:
    print(language)
    site = language + 'wiki'
    qids = list(wikidata_dict[language])
    for start in range(0, len(qids), n):
        batch = qids[start:start + n]
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                'action': 'wbgetentities',
                'format': 'json',
                'props': 'sitelinks',
                'ids': '|'.join(batch),
                'sitefilter': site,
            },
        )
        response.raise_for_status()
        entities = response.json().get('entities')
        if not entities:
            # An API-level error response carries no 'entities'; report the
            # batch and continue instead of failing on None.items().
            print("no entities returned for batch starting at", batch[0])
            continue
        for qid, item in entities.items():
            sitelinks = (item or {}).get('sitelinks') or {}
            # The requested site may be missing from the sitelinks.
            title = (sitelinks.get(site) or {}).get('title')
            # Only annotate QIDs that were actually requested; the response
            # may be keyed by a different id than the one sent.
            if title and qid in wikidata_dict[language]:
                wikidata_dict[language][qid]['Title'] = title

## 3.3 Set Category of Articles
Region & Human category pages are selected using the Wikidata "instance of" (P31) value of each item. If the value is "human", the category is "Human"; otherwise, if the value is "disease outbreak", the category is "Region". This value can be collected from the Wikidata API.

In [66]:
##Get Wikidata label data from wikimedia api
def get_label_qid(target_label,qid):
    """Return the item id stored in the first ``target_label`` claim of ``qid``.

    Parameters
    ----------
    target_label : str
        Wikidata property id to read, e.g. ``'P31'``.
    qid : str
        Wikidata item id to look up.

    Returns
    -------
    str or None
        The ``id`` of the first statement's value, or ``None`` when the item
        has no such claim or the first statement carries no item value.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            'action': 'wbgetentities',
            'ids': qid,
            'format': 'json',
            'props': 'claims',  # only the claims are read below
        },
    )
    response.raise_for_status()
    entity = (response.json().get('entities') or {}).get(qid) or {}
    statements = (entity.get('claims') or {}).get(target_label) or []
    if not statements:
        return None
    # A statement's main snak may carry no 'datavalue' at all.
    value = statements[0].get('mainsnak', {}).get('datavalue', {}).get('value')
    return value.get('id') if isinstance(value, dict) else None

In [None]:
# The same QID appears under several languages; remember its "instance of"
# value so the API is queried only once per item.
label_cache = {}
for language in lang:
    for qid, article in wikidata_dict[language].items():
        if qid in seeds:
            # Seed items are categorised directly, no lookup needed.
            article['category'] = 'Bio-Med'
            continue
        if qid not in label_cache:
            try:
                label_cache[qid] = get_label_qid('P31', qid) # P31 : instance of
            except (KeyError, IndexError):
                # The lookup may fail for items without an "instance of"
                # claim; treat those as having no label.
                label_cache[qid] = None
        label = label_cache[qid]
        if label == 'Q3241045': # Q3241045 : disease outbreak
            article['category'] = 'Region'
        elif label == 'Q5': # Q5 : human
            article['category'] = 'Human'
        else:
            article['category'] = 'Others'
            

## 4. CSV Save

In [None]:
# Dictionary keys in the order of the CSV header columns below. Rows are built
# from these keys explicitly so that an article without a 'Title' gets an empty
# cell instead of shifting 'category' into the 'item' column.
columns = ('QID','PageID','Language','Title','category')

# The output folder must exist before the files can be opened for writing.
os.makedirs("clean_list_{}".format(date), exist_ok=True)

for language in lang:
    name = "clean_list_{}/wikipedia_list_cleaned_{}_{}.csv".format(date,date,language)
    # newline='' is what the csv module expects of its file object; utf-8 keeps
    # non-ASCII titles writable regardless of the platform default encoding.
    with open(name,'w',newline='',encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(('QID','page id','wiki_db','item','category'))
        for value in wikidata_dict[language].values():
            writer.writerow([value.get(column, '') for column in columns])