# Summary

Quantify the extent of missing sitelinks in Wikidata, i.e. the extent to which articles in Wikipedia have not been linked to Wikidata items.


In [1]:
import os, sys
import datetime
import calendar
import time
import pandas as pd
import string
import random

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark


In [2]:
## regular
# Resource settings for a "regular"-sized session. They are kept for reference
# only: this notebook deliberately ends up with an empty `spark_config`, so
# none of these values are passed to the session below. To apply them, set
# `spark_config = regular_spark_config`.
regular_spark_config = {
    "spark.driver.memory": "2g",
    "spark.dynamicAllocation.maxExecutors": 64,
    "spark.executor.memory": "8g",
    "spark.executor.cores": 4,
    "spark.sql.shuffle.partitions": 256
}

# Effective extra settings handed to the session: none.
spark_config = {}

# Python interpreters used by PySpark; set before the session is created.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'

spark = wmfspark.get_session(
    app_name='Pyspark notebook', 
    extra_settings=spark_config
)
# Display the session object as the cell output.
spark

# Fraction of pages with a wikidata item for different wikis





In [64]:
# Snapshot of the page table to read.
snapshot = '2020-03'

# Cross-wiki page key of the form "<wiki_db>-<page_id>".
col_wiki_pageid = F.concat(F.col('wiki_db'), F.lit('-'), F.col('page_id'))

# Namespace-0, non-redirect pages of every "*wiki" database except
# wikidatawiki, restricted to the chosen snapshot.
df_pages = (
    spark.read.table('wmf_raw.mediawiki_page')
    .where(
        (F.col('snapshot') == snapshot)
        & F.col('wiki_db').endswith('wiki')
        & (F.col('wiki_db') != 'wikidatawiki')
        & (F.col('page_namespace') == 0)
        & (F.col('page_is_redirect') == False)
    )
    .select(col_wiki_pageid.alias('wiki_pageid'), 'wiki_db')
)

# Snapshot of the Wikidata item <-> page link table to read.
wd_snapshot = '2020-04-06'

# Namespace-0 pages of every "*wiki" database (except wikidatawiki) that are
# linked to a Wikidata item. The page key reuses `col_wiki_pageid`, defined
# earlier in this cell, so both tables are keyed by the same expression
# ("<wiki_db>-<page_id>") instead of a copy-pasted duplicate.
df_wd = (
    spark.read.table('wmf.wikidata_item_page_link')
    .where(
        (F.col('snapshot') == wd_snapshot)
        & F.col('wiki_db').endswith('wiki')
        & (F.col('wiki_db') != 'wikidatawiki')
        & (F.col('page_namespace') == 0)
    )
    .select(col_wiki_pageid.alias('wiki_pageid'), 'item_id')
)
# Attach the linked Wikidata item (if any) to every page. The left outer join
# keeps pages without a link; for those, `item_id` is NULL.
# NOTE(review): if `df_wd` can hold more than one row per `wiki_pageid`, this
# join repeats the page and inflates the counts downstream - confirm the key
# is unique in the link table.
df_coverage = (
    df_pages
    .join(df_wd, on='wiki_pageid', how='left_outer')
    # 1 when the page has a Wikidata item, 0 otherwise. `isNotNull()` always
    # yields true/false (never NULL), so no coalesce fallback is needed.
    .withColumn('sitelink_exist', F.col('item_id').isNotNull().cast('int'))
)
# Per-wiki totals, collected to the driver as a pandas DataFrame:
#   N_pages            - number of rows (pages) per wiki
#   N_sitelinks        - number of those rows that have a Wikidata item
#   fraction_sitelinks - N_sitelinks / N_pages
# Rows are ordered from the largest wiki to the smallest by N_pages.
df_count = (
    df_coverage
    .groupBy('wiki_db')
    .agg(
        F.count('sitelink_exist').alias('N_pages'),
        F.sum('sitelink_exist').alias('N_sitelinks')
    )
    .withColumn('fraction_sitelinks', F.col('N_sitelinks') / F.col('N_pages'))
    .orderBy(F.col('N_pages').desc())
    .toPandas()
)

In [68]:
# Preview the five wikis with the most pages.
df_count.head(n=5)

Unnamed: 0,wiki_db,N_pages,N_sitelinks,fraction_sitelinks
0,enwiki,6046911,6028510,0.996957
1,cebwiki,5378782,4763077,0.885531
2,svwiki,3735863,3731624,0.998865
3,dewiki,2414815,2409645,0.997859
4,frwiki,2195756,2191171,0.997912


In [69]:
# Persist the per-wiki coverage table (including the DataFrame index) as CSV
# in the current working directory.
coverage_csv_path = 'coverage_wikidata.csv'
df_count.to_csv(coverage_csv_path)