# Get the full graph from a wiki

We want to extract all links between pages of a given Wikipedia.

The challenges are:
- the pagelinks table contains the page-id of the source page but only the page-title of the target page; thus we have to join the page table to also get the page-id of the target page
- the pagelinks table contains links to redirect pages; we resolve redirects by joining the redirect-table
- we want to add the wikidata-item for each page; for this we merge the wikidata_item_page_link-table

The output consists of 2 tables in /user/mgerlach/graph/
- graph-<wiki>-<snapshot>_links.parquet
    - adjacency list in the form: page_id_from page_id_to
    
- graph-<wiki>-<snapshot>_nodes.parquet
    - list of all nodes contained in the adjacency list in the form: page_id page_title item_id

- with parameters:
    - <wiki> = the wikipedia, e.g. enwiki
    - <snapshot> = time-stamp of snapshot, e.g. 2020-07

In [22]:
import os, sys
import datetime
import calendar
import time
import string
import random

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

## Spark session setup
## Python executables PySpark should use (driver / PySpark processes)
os.environ.update({
    'PYSPARK_DRIVER_PYTHON': 'notebook',
    'PYSPARK_PYTHON': '/usr/bin/python3.7',
})

## request the session through the wmfdata helper
session_options = dict(app_name='Pyspark notebook', type='large')
spark = wmfspark.get_session(**session_options)
## last expression of the cell, so the notebook displays the session object
spark

In [23]:
## define wiki and snapshot for analysis
## `wiki` is compared against the `wiki_db` column and `snapshot` against the
## `snapshot` column of the mediawiki tables; both are also part of the output file names
## (alternative, smaller wiki:)
# wiki = 'simplewiki'
wiki = 'enwiki'
snapshot = '2020-07'

## Table with all links

- only namespace 0
- resolve redirects

In [24]:
## Pages of the main namespace (redirect pages are included)
## resulting columns: page_id, page_title, page_is_redirect
page_table = spark.read.table('wmf_raw.mediawiki_page')

## one predicate: selected wiki, selected snapshot, main namespace (0)
is_selected_page = (
    (F.col('wiki_db') == wiki)
    & (F.col('snapshot') == snapshot)
    & (F.col('page_namespace') == 0)
)

df_pages = page_table.filter(is_selected_page).select(
    'page_id',
    'page_title',
    'page_is_redirect',
)

In [25]:
## Redirect table expressed in page-ids only: rd_from -> rd_to
## The raw table stores the redirect target as a title, so the pages-table
## is joined on the title to look up the page-id of the redirected-to page.
redirect_rows = (
    spark.read.table('wmf_raw.mediawiki_redirect')
    .filter(
        (F.col('wiki_db') == wiki)
        & (F.col('snapshot') == snapshot)
        ## redirect target lies in the main namespace
        & (F.col('rd_namespace') == 0)
    )
    .select('rd_from', 'rd_title')
    ## rename so that the title column matches the join key of df_pages
    .withColumnRenamed('rd_from', 'page_id_from')
    .withColumnRenamed('rd_title', 'page_title')
)

df_redirect = (
    redirect_rows
    ## inner join: redirects whose target title is not found in df_pages are dropped
    .join(df_pages, on='page_title', how='inner')
    ## keep only the two page-ids
    .select('page_id_from', 'page_id')
    .withColumnRenamed('page_id_from', 'rd_from')
    .withColumnRenamed('page_id', 'rd_to')
)

In [26]:
## Links table expressed in page-ids: page_id_from -> page_id_to
## - source and target page must both be in the main namespace
## - links whose source page is listed as a redirect in df_redirect are dropped
## - target titles are mapped to page-ids via df_pages
##   (the target may still be a redirect page at this stage)

## page-ids of all redirect pages known to df_redirect
redirect_sources = df_redirect.select(F.col('rd_from').alias('page_id_from'))

## raw links of the selected wiki/snapshot, reduced to source-id and target-title
pagelinks_main = (
    spark.read.table('wmf_raw.mediawiki_pagelinks')
    .filter(
        (F.col('wiki_db') == wiki)
        & (F.col('snapshot') == snapshot)
        ## namespace of source and target page
        & (F.col('pl_from_namespace') == 0)
        & (F.col('pl_namespace') == 0)
    )
    .select(
        F.col('pl_from').alias('page_id_from'),
        F.col('pl_title').alias('page_title'),
    )
)

df_links = (
    pagelinks_main
    ## anti join: remove links that start from a redirect page
    .join(redirect_sources, on='page_id_from', how='left_anti')
    ## inner join: links to titles that are not in df_pages are dropped
    .join(df_pages, on='page_title', how='inner')
    .select(
        'page_id_from',
        F.col('page_id').alias('page_id_to'),
    )
)

In [27]:
## Replace link targets that are redirect pages by the page the redirect points to
## left join: targets without an entry in df_redirect end up with rd_to = null
links_with_redirects = df_links.join(
    df_redirect,
    df_links['page_id_to'] == df_redirect['rd_from'],
    how='left',
)

df_links_resolved = (
    links_with_redirects
    .select(
        'page_id_from',
        ## use the redirect target if there is one, otherwise keep the original target
        F.coalesce(F.col('rd_to'), F.col('page_id_to')).alias('page_id_to'),
    )
    ## different links can collapse onto the same pair once redirects are resolved
    .distinct()
)

In [28]:
## write the resolved links as parquet; an existing output is overwritten
FILE_out = '/user/mgerlach/graph/graph-%s-%s_links.parquet' % (wiki, snapshot)
links_writer = df_links_resolved.write.mode('overwrite')
links_writer.parquet(FILE_out)


## table with all nodes
- page_id
- page_title
- item_id (the wikidata-item, merged from the wikidata_item_page_link table)

In [29]:
## Mapping page_id -> wikidata item_id for the selected wiki
## (one item_id per page: the one from the most recent snapshot in which the page appears)

## Earliest wikidata snapshot to consider, derived from the `snapshot` parameter
## (e.g. '2020-07' -> '2020-07-01') so that it follows the analysed snapshot instead
## of being hard-coded. Assumes `snapshot` has the form YYYY-MM; snapshots are
## compared as strings.
wd_snapshot_min = '%s-01' % snapshot

## partition by wiki_db and page_id, ordered from newest to oldest snapshot
w_wd = Window.partitionBy(F.col('wiki_db'), F.col('page_id')).orderBy(F.col('snapshot').desc())
df_wd = (
    spark.read.table('wmf.wikidata_item_page_link')
    ## snapshot: this is a partition!
    ## all snapshots from the lower bound onwards are read to resolve issues
    ## with non-matching wikidata-items
    .where(F.col('snapshot') >= wd_snapshot_min)
    ## only the selected wiki (e.g. enwiki)
    .where(F.col('wiki_db') == wiki)
    ## first row of the window = item_id of the newest snapshot for this page
    .withColumn('item_id_latest', F.first(F.col('item_id')).over(w_wd))
    .select(
        'page_id',
        F.col('item_id_latest').alias('item_id')
    )
    ## the window value is repeated on every snapshot-row of a page
    .drop_duplicates()
)

In [30]:
## every page-id that occurs in the resolved links, as source or as target
df_from = df_links_resolved.select(F.col('page_id_from').alias('page_id')).distinct()
df_to = df_links_resolved.select(F.col('page_id_to').alias('page_id')).distinct()
## outer join on the shared key keeps ids that are present on either side
df_nodes_sel = df_from.join(df_to, on='page_id', how='outer')


In [31]:
## Nodes table: page_id, page_title, item_id
## semi join: keep only pages whose id occurs in the resolved links (from/to)
pages_in_graph = df_pages.join(df_nodes_sel, on='page_id', how='left_semi')

df_nodes = (
    pages_in_graph
    ## left join: pages without a wikidata-item keep a null item_id
    .join(df_wd, on='page_id', how='left')
    .select('page_id', 'page_title', 'item_id')
)

In [32]:
## write the nodes table as parquet; an existing output is overwritten
FILE_out = '/user/mgerlach/graph/graph-%s-%s_nodes.parquet' % (wiki, snapshot)
nodes_writer = df_nodes.write.mode('overwrite')
nodes_writer.parquet(FILE_out)

## check 

#### links file

In [33]:
## read the links file back in for a quick check
FILE_in = '/user/mgerlach/graph/graph-%s-%s_links.parquet' % (wiki, snapshot)
df_check = spark.read.load(path=FILE_in)

In [34]:
## print the first 2 rows of the links file
df_check.show(2)

+------------+----------+
|page_id_from|page_id_to|
+------------+----------+
|    18914017|      2088|
|    13907534|      6824|
+------------+----------+
only showing top 2 rows



In [35]:
## number of rows (links) in the links file; shown as the cell output
df_check.count()

514160225

#### nodes file

In [36]:
## read the nodes file back in for a quick check
FILE_in = '/user/mgerlach/graph/graph-%s-%s_nodes.parquet' % (wiki, snapshot)
df_check = spark.read.load(path=FILE_in)

In [37]:
## print the first 2 rows of the nodes file
df_check.show(2)

+-------+--------------------+---------+
|page_id|          page_title|  item_id|
+-------+--------------------+---------+
|   3234|Archbishopric_of_...|Q15471939|
|   5390| Conversion_of_units|  Q618655|
+-------+--------------------+---------+
only showing top 2 rows



In [38]:
## number of rows (nodes) in the nodes file; shown as the cell output
df_check.count()

6151649