## Summary: Graph and graphframes

- Build the hyperlink graph of a Wikipedia, resolving links that point to redirect pages.
- Use graphframes to do graph processing in Spark.
    - For this we build a custom Spark environment and install the graphframes package for graph processing.

In [1]:
import os, sys
import numpy as np
import pandas as pd
import wmfdata
import json
import bz2
import datetime


import wmfdata.spark as wmfspark
import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

You are using wmfdata v1.0.1, but v1.0.3 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


In [2]:
## python interpreters used by the pyspark driver and by the workers
os.environ.update({
    'PYSPARK_DRIVER_PYTHON': 'notebook',
    'PYSPARK_PYTHON': '/usr/bin/python3.5',
})

## JVM system properties (passed as -Dkey=value) that point the driver at the web proxy
## NOTE(review): presumably required so the graphframes package can be downloaded -- confirm
proxy_properties = {
    'http.proxyHost': 'webproxy.eqiad.wmnet',
    'http.proxyPort': '8080',
    'https.proxyHost': 'webproxy.eqiad.wmnet',
    'https.proxyPort': '8080',
}
driver_java_options = ' '.join(
    '-D{}={}'.format(name, value) for name, value in proxy_properties.items()
)

## this configuration adds graphframes
spark_config = {
    'spark.driver.extraJavaOptions': driver_java_options,
    'spark.jars.packages': 'graphframes:graphframes:0.6.0-spark2.3-s_2.11',
}

## start the spark session with the extra settings and display it
spark = wmfspark.get_session(app_name='Pyspark notebook', extra_settings=spark_config)
spark

## Dataframe with nodes and links

We not only have to join the page and pagelinks tables to get the page ids of both the source and the target page of each link,
but also have to join the redirect table in order to resolve links that point to redirect pages.

In [3]:
## wiki project and snapshot that every table below is filtered on
wiki, snapshot = 'swwiki', '2020-05'

In [4]:
## pages of the selected wiki and snapshot, restricted to the main namespace (redirects included)
## columns: page_id, page_title, page_is_redirect
df_pages = (
    spark.read.table('wmf_raw.mediawiki_page')
    .filter(
        (F.col('wiki_db') == wiki)
        & (F.col('snapshot') == snapshot)
        ## main namespace
        & (F.col('page_namespace') == 0)
    )
    .select('page_id', 'page_title', 'page_is_redirect')
)

In [5]:
## redirects as pairs of page ids: rd_from (the redirect page) -> rd_to (the page it points to)

## the redirect table gives the target as a title (rd_title), not as a page id
redirect_titles = (
    spark.read.table('wmf_raw.mediawiki_redirect')
    .filter(
        (F.col('wiki_db') == wiki)
        & (F.col('snapshot') == snapshot)
        & (F.col('rd_namespace') == 0)
    )
    .select(
        F.col('rd_from').alias('page_id_from'),
        F.col('rd_title').alias('page_title_to'),
    )
)

## title -> page id lookup taken from the pages table
redirect_targets = df_pages.select(
    F.col('page_title').alias('page_title_to'),
    F.col('page_id').alias('page_id_to'),
)

## inner join: redirects whose target title is not in the pages table are dropped
df_redirect = (
    redirect_titles
    .join(redirect_targets, on='page_title_to', how='inner')
    .select(
        F.col('page_id_from').alias('rd_from'),
        F.col('page_id_to').alias('rd_to'),
    )
)

In [6]:
## links as pairs of page ids (page_id_from, page_id_to)

## pages that may act as the source of a link: everything that is not a redirect
source_pages = (
    df_pages
    .filter(F.col('page_is_redirect') == 0)
    .withColumnRenamed('page_id', 'page_id_from')
)
## pages that may act as the target of a link (redirects included), keyed by title
target_pages = df_pages.withColumnRenamed('page_title', 'page_title_to')

df_links = (
    spark.read.table('wmf_raw.mediawiki_pagelinks')
    .filter(
        (F.col('wiki_db') == wiki)
        & (F.col('snapshot') == snapshot)
        ## namespace of source and target page
        & (F.col('pl_from_namespace') == 0)
        & (F.col('pl_namespace') == 0)
    )
    .withColumnRenamed('pl_from', 'page_id_from')
    .withColumnRenamed('pl_title', 'page_title_to')
    ## semi join: keep a link only if its source is a non-redirect page
    .join(source_pages, on='page_id_from', how='left_semi')
    ## map the target title to its page id; links to titles without a page are dropped
    .join(target_pages, on='page_title_to', how='inner')
    .select('page_id_from', F.col('page_id').alias('page_id_to'))
)

In [7]:
## replace link targets that are redirect pages by the page the redirect points to

## a link matches a redirect when its target is the redirect page itself
redirect_match = df_links['page_id_to'] == df_redirect['rd_from']

df_links_resolved = (
    df_links
    ## left outer join: links without a matching redirect get rd_to = null
    .join(df_redirect, on=redirect_match, how='left_outer')
    .select(
        'page_id_from',
        ## take the redirect target when there is one, otherwise keep the original target
        F.coalesce(F.col('rd_to'), F.col('page_id_to')).alias('page_id_to'),
    )
    ## several links can collapse onto the same pair once redirects are resolved
    .dropDuplicates()
)
df_links_resolved.show(10, truncate=False)

+------------+----------+
|page_id_from|page_id_to|
+------------+----------+
|4181        |5800      |
|75924       |7405      |
|36228       |14888     |
|36265       |14888     |
|75982       |18504     |
|21532       |21549     |
|31037       |31066     |
|5791        |34164     |
|11112       |44649     |
|82088       |69449     |
+------------+----------+
only showing top 10 rows



In [8]:
## all nodes from the pages-table which appear in the links_resolved-table (from/to)

## Collect every page id that occurs as source or target of a resolved link.
## Stacking the two columns lets the semi join below be a plain equi-join on page_id;
## a join condition of the form (id == from) | (id == to) is not an equi-join, so
## Spark would have to fall back to a nested-loop strategy for it.
linked_page_ids = (
    df_links_resolved.select(F.col('page_id_from').alias('page_id'))
    .union(df_links_resolved.select(F.col('page_id_to').alias('page_id')))
)

df_pages_resolved = (
    df_pages
    ## semi join: keep a page if its id occurs among the linked ids; no columns are added
    .join(linked_page_ids, on='page_id', how='left_semi')
    .distinct()
)
df_pages_resolved.show()

+-------+--------------------+----------------+
|page_id|          page_title|page_is_redirect|
+-------+--------------------+----------------+
|  13786|           Wafinisia|           false|
|  14102|      Thomas_Sankara|           false|
|  14109|    Picha_ya_kiwamba|           false|
|  89034|Mia_saba_themanin...|           false|
|  89052|          Paulo_Hanh|           false|
|  89351|Elimu_kwa_watoto_...|           false|
|   8043|            Kijapani|           false|
|  39930|  Manchester_(maana)|           false|
|  82614|             1997_KK|           false|
| 107547|       Nyotabadilifu|           false|
| 107639|    Mto_Siga_(Naili)|           false|
|  10720|                1301|           false|
|  42621|Orodha_ya_Mawazir...|           false|
|  84324|Kanisa_la_Kiortho...|           false|
|  84365|   Aspreno_wa_Napoli|           false|
| 108930|   Kisiwa_cha_Makove|           false|
| 108982|    Kisiwa_cha_Iriga|           false|
| 109169|   Kisiwa_cha_Mlinzi|          

## Using graphframes for graph processing with Spark

For graphframes we need a nodes dataframe and an edges (links) dataframe with a specific structure:

- the nodes need to have a column: 'id'
- the edges need to have 2 columns: 'src', 'dst'


In [9]:
## import only the name that is used below instead of a wildcard import
from graphframes import GraphFrame

In [10]:
## nodes-df for graphframes: 'id' is a copy of page_id; page_id and page_title are kept alongside
df_nodes = df_pages_resolved.select(
    F.col('page_id').alias('id'),
    'page_id',
    'page_title',
)
df_nodes.show(10)

+------+-------+--------------------+
|    id|page_id|          page_title|
+------+-------+--------------------+
| 11937|  11937|                 473|
| 11986|  11986|                 423|
| 12120|  12120|                 304|
| 12159|  12159|                 265|
| 84324|  84324|Kanisa_la_Kiortho...|
| 84365|  84365|   Aspreno_wa_Napoli|
| 84495|  84495|      Mnjegere-kubwa|
|108469| 108469|               Lenzi|
| 85326|  85326|Mia_nne_na_hamsin...|
| 85381|  85381|Mia_nne_na_tisini...|
+------+-------+--------------------+
only showing top 10 rows



In [11]:
## edges-df for graphframes: source page id as 'src', target page id as 'dst'
df_edges = df_links_resolved.select(
    F.col('page_id_from').alias('src'),
    F.col('page_id_to').alias('dst'),
)
df_edges.show()

+------+------+
|   src|   dst|
+------+------+
|  4181|  5800|
| 75924|  7405|
| 36228| 14888|
| 36265| 14888|
| 75982| 18504|
| 21532| 21549|
| 31037| 31066|
|  5791| 34164|
| 11112| 44649|
| 82088| 69449|
| 82176| 69449|
| 81795| 69449|
| 80471| 69449|
| 91604|  4023|
|  9938| 81337|
| 84742| 84798|
| 10431| 91727|
| 71693|119355|
| 96378|  1201|
|120631|  1201|
+------+------+
only showing top 20 rows



In [12]:
## build the graph object from the nodes-df (vertices) and the edges-df
g = GraphFrame(v=df_nodes, e=df_edges)

In [13]:
## degree of every node, ordered from high to low
deg = g.degrees.orderBy(F.col('degree').desc())

## attach the page title by matching the graph id against page_id in the nodes-df
degree_with_titles = deg.join(df_nodes, on=deg['id'] == df_nodes['page_id'])
(
    degree_with_titles
    .select('degree', 'page_id', 'page_title')
    ## order again after the join
    .orderBy(F.col('degree').desc())
    .show()
)

+------+-------+--------------------+
|degree|page_id|          page_title|
+------+-------+--------------------+
| 10429|   1191|            Tanzania|
|  9762|  14068|                Maji|
|  8063|   5594|               Mwaka|
|  7834|  93640|       Mito_ya_Kenya|
|  6959|  93654|            Mto_Suam|
|  6711|  12444|     Orodha_ya_Miaka|
|  6443| 110205|Orodha_ya_mito_ya...|
|  6388|   1676|               Kenya|
|  6166|   3033|                 Mto|
|  6047|  87732|          Umba_(mto)|
|  6028|  92131|            Mto_Mara|
|  6011|  92070|            Mto_Lumi|
|  5942|  92229|      Ruvu_(Pangani)|
|  5386|     65|              Madola|
|  5327|   4023|            Marekani|
|  5255|   1991|          Mto_Kagera|
|  5092|   2184|                Nile|
|  4860|   2185|Bahari_ya_Mediter...|
|  4652|   2048|              Uganda|
|  4360|   3736|Orodha_ya_Watakat...|
+------+-------+--------------------+
only showing top 20 rows



In [None]:
## run pagerank on the graph and list the nodes from high to low pagerank
pr = g.pageRank(resetProbability=0.15, tol=0.01)
pr.vertices.orderBy(F.col('pagerank').desc()).show()