## Get all redirects (title_from, title_to) for a wiki

In [1]:
# Standard library and general-purpose data libraries.
import os, sys
import numpy as np
import pandas as pd
import wmfdata
import json
import bz2
import datetime


# Spark access: wmfdata's Spark helper is used to create the session in the next cell.
import wmfdata.spark as wmfspark
import findspark
# findspark.init is called with the Spark install path before the pyspark imports below.
# NOTE(review): presumably this is what makes pyspark importable here; keep this order.
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
# F = column functions, T = SQL types (aliases used in the query cells).
from pyspark.sql import functions as F, types as T, Window

You are using wmfdata v1.0.1, but v1.0.3 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


In [2]:
# Set the PySpark interpreter environment variables before the session is requested.
os.environ.update({
    'PYSPARK_DRIVER_PYTHON': 'notebook',
    'PYSPARK_PYTHON': '/usr/bin/python3.5',
})

# No extra Spark settings are passed; add key/value overrides to this dict if needed.
spark_config = dict()
spark = wmfspark.get_session(app_name='Pyspark notebook', extra_settings=spark_config)

# Display the session object as the cell output.
spark

## Get all redirects

In [3]:
## analysis parameters: which wiki and which snapshot to read
wiki = 'swwiki'
snapshot = '2020-05'

## main-namespace pages of the chosen wiki/snapshot (redirect pages are NOT filtered out)
# columns kept: page_id, page_title, page_is_redirect
df_pages = (
    spark.read.table('wmf_raw.mediawiki_page')
    .where(
        (F.col('wiki_db') == wiki)            # wiki project
        & (F.col('snapshot') == snapshot)     # snapshot
        & (F.col('page_namespace') == 0)      # main namespace only
    )
    .select('page_id', 'page_title', 'page_is_redirect')
)

## redirect pairs as (page_title_from, page_title_to)
## The redirect table gives the source page as an id (rd_from) and the target as a
## title (rd_title). Joining df_pages on the source id attaches the title of the
## redirecting page; the inner join drops redirects whose source is not in df_pages.
df_redirect = (
    spark.read.table('wmf_raw.mediawiki_redirect')
    .where(
        (F.col('wiki_db') == wiki)            # wiki project
        & (F.col('snapshot') == snapshot)     # snapshot
        # NOTE(review): rd_title is used as the target title, so rd_namespace is
        # presumably the namespace of the redirect target; confirm against the schema
        & (F.col('rd_namespace') == 0)
    )
    .select(
        F.col('rd_from').alias('page_id_from'),
        F.col('rd_title').alias('page_title_to'),
    )
    .join(
        df_pages.withColumnRenamed('page_id', 'page_id_from'),
        on='page_id_from',
        how='inner',
    )
    # keep only the two titles, naming the joined page title as the redirect source
    .select(
        F.col('page_title').alias('page_title_from'),
        F.col('page_title_to'),
    )
)
df_redirect.show(10)

+--------------------+-----------------+
|     page_title_from|    page_title_to|
+--------------------+-----------------+
|     Pius_XII_(Papa)|    Papa_Pius_XII|
|          Mwaka_1936|             1936|
|        Musoma_mjini|     Musoma_(mji)|
|              Babado|         Barbados|
|               Cheki|           Ucheki|
|    Mobotu_Sese_Seko| Mobutu_Sese_Seko|
|   Lugha_ya_Kiisanzu|         Kiisanzu|
|Giacomo_della_Chiesa|Papa_Benedikto_XV|
|             Tolkien|   J.R.R._Tolkien|
|             Aviceda|    Kipanga-kekeo|
+--------------------+-----------------+
only showing top 10 rows

