In [1]:
import calendar
import datetime
import os
import random
import shutil
import string
import subprocess
import sys
import time

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

## defining the spark session
spark_config = {}
# regular
spark_config = {
    "spark.driver.memory": "2g",
    "spark.dynamicAllocation.maxExecutors": 64,
    "spark.executor.memory": "8g",
    "spark.executor.cores": 4,
    "spark.sql.shuffle.partitions": 256
}
# ## big
# spark_config = {
#     "spark.driver.memory": "4g",
#     "spark.dynamicAllocation.maxExecutors": 128,
#     "spark.executor.memory": "8g",
#     "spark.executor.cores": 4,
#     "spark.sql.shuffle.partitions": 512,
#     "spark.sql.broadcastTimeout": 3600
# }
# spark_config = {
#     "spark.dynamicAllocation.maxExecutors": 128,
#     "spark.executor.memory": "16g",
#     "spark.driver.memory": "12g",
#     "spark.executor.memoryOverhead":"4g",
#     "spark.num.executors":50,
#     "spark.driver.maxResultSize":"32g",
# }
os.environ['PYSPARK_DRIVER_PYTHON'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.7'

spark = wmfspark.get_session(
    app_name='Pyspark notebook', 
    extra_settings=spark_config
)
spark

In [22]:
## define wiki and snapshot for analysis
wiki = 'simplewiki'
wiki = 'enwiki'
snapshot = '2020-07'

In [23]:
## all pages in the main namespace (incl redirects)
# page_id, page_title, page_is_redirect
df_pages = (
    ## select table
    spark.read.table('wmf_raw.mediawiki_page')
    ## select wiki project
    .where( F.col('wiki_db') == wiki )
    .where( F.col('snapshot') == snapshot )
    ## main namespace
    .where(F.col('page_namespace') == 0 )
    .select(
        'page_id',
        'page_title',
        'page_is_redirect'
    )
)

In [24]:
## redirects table with page_ids from/to
## we join the pages table to get page_id for the redirected-to page
df_redirect = (
    ## select table
    spark.read.table('wmf_raw.mediawiki_redirect')
    ## select wiki project
    .where( F.col('wiki_db') == wiki )
    .where( F.col('snapshot') == snapshot )
    .where(F.col('rd_namespace') == 0 )
    .select(
        F.col('rd_from').alias('page_id_from'),
        F.col('rd_title').alias('page_title')
    )
    
    ## get the page-ids for the redirected-to pages
    .join(df_pages,on='page_title',how='inner')
    
    ## select only page-ids
    .select(
        F.col('page_id_from').alias('rd_from'),
        F.col('page_id').alias('rd_to')
    )
)
# df_redirect.show(2)

In [25]:
## get the pagelinks table with page_id_from and page_id_to
## only keep links starting from non-redirect pages
## join pages-table to get page-ids for redirect-to pages
df_links = (
    ## select table
    spark.read.table('wmf_raw.mediawiki_pagelinks')
    ## select wiki project
    .where( F.col('wiki_db') == wiki )
    .where( F.col('snapshot') == snapshot )
    
    ## namespace of source and target page
    .where(F.col('pl_from_namespace') == 0 )
    .where(F.col('pl_namespace') == 0 )
    
    .withColumnRenamed('pl_from','page_id_from')
    .withColumnRenamed('pl_title','page_title')
    
    ## only keep links that originate from a page that is not a redirect 
    ## by joining the pages-table with the non-redirect pages
    .join(
        df_redirect.withColumnRenamed('rd_from','page_id_from'),
        on = 'page_id_from',
        how = 'left_anti'
    )
    ## map page_title_to page_id_to by joining the pages-df
    .join(
        df_pages,
        on='page_title',
        how='inner'
    )
    .withColumnRenamed('page_id','page_id_to')
    .select('page_id_from','page_id_to')
)

In [26]:
## resolve the redirects in the links-table by joining the redirect table
df_links_resolved = (
    df_links
    ## join in the redirects
    .join(
        df_redirect,
        df_links['page_id_to'] == df_redirect['rd_from'],
        how = 'left'
    )
    ## select the redirected link (otherwise keep the old)
    .withColumn('page_id_to_resolved', F.coalesce(F.col('rd_to'),F.col('page_id_to')) )
    .select(
        F.col('page_id_from').alias('page_id_from'),
        F.col('page_id_to_resolved').alias('page_id_to')
    )
    ## remove duplicate links
    .distinct()
    .select(
        'page_id_from',
        'page_id_to'
    )
#     .orderBy('page_id_from','page_id_to')
)

In [27]:
# FILE_out = '/user/mgerlach/graph/test_%s_%s'%(wiki,snapshot)
# df_links_resolved.write.mode('overwrite').csv(path=FILE_out, compression="gzip", header=True, sep="\t")

In [28]:
# ## converting to a local file
base_dir_hdfs = '/user/mgerlach/graph'
PATH_OUT = '/home/mgerlach/REPOS/reader-embedding/output/graph/'
# filename_save = 'graph_%s_%s_nodes'%(wiki,snapshot)
filename_save = 'test_%s_%s_links.csv.gz'%(wiki,snapshot)
output_hdfs_dir = os.path.join(base_dir_hdfs,filename_save)
# os.system('hadoop fs -rm -r %s'%output_hdfs_dir)
df_links_resolved.write.mode('overwrite').csv(path=output_hdfs_dir, compression="gzip", header=False, sep="\t")
base_dir_local =  PATH_OUT
output_local_dir_tmp = os.path.join(base_dir_local,'tmp',filename_save)
output_local_file = os.path.join(base_dir_local,filename_save)


os.system('hadoop fs -copyToLocal %s %s'%(output_hdfs_dir,output_local_dir_tmp))
## concatenate and unzip into single file
# os.system('cat %s/* | gunzip > %s'%(output_local_dir_tmp,output_local_file))
os.system('cat %s/* > %s'%(output_local_dir_tmp,output_local_file))
## remove set of tmp-dirs
os.system('rm -rf %s'%output_local_dir_tmp)
## remove hadoop data
os.system('hadoop fs -rm -r %s'%output_hdfs_dir)

0

In [29]:
## join the wikidata-item to each pageview
## we keep only pageviews for which we have a correpsionding wikidata-item id

## table with mapping wikidata-ids to page-ids
## partition wikidb and page-id ordered by snapshot
w_wd = Window.partitionBy(F.col('wiki_db'),F.col('page_id')).orderBy(F.col('snapshot').desc())
df_wd = (
    spark.read.table('wmf.wikidata_item_page_link')
    ## snapshot: this is a partition!
    .where(F.col('snapshot') >= '2020-07-01') ## resolve issues with non-mathcing wikidata-items
    ## only wikis (enwiki, ... not: wikisource)
    .where(F.col('wiki_db')==wiki)
    .withColumn('item_id_latest',F.first(F.col('item_id')).over(w_wd))
    .select(
        'page_id',
        F.col('item_id_latest').alias('item_id')
    )
    .drop_duplicates()
)

In [30]:
df_from = df_links_resolved.select('page_id_from').distinct().withColumnRenamed('page_id_from','page_id')
df_to = df_links_resolved.select('page_id_to').distinct().withColumnRenamed('page_id_to','page_id')
df_nodes_sel = df_from.join(df_to,on='page_id',how='outer')


In [31]:
# all nodes from the pages-table which appear in the links_resolved-table (from/to)
df_nodes = (
    df_pages
    .join(
        df_nodes_sel,
        on = 'page_id',
        how = 'left_semi'
    )
    .join(df_wd,on='page_id',how='left')
    .select(
        'page_id',
        'page_title',
        'item_id'
    )
)

base_dir_hdfs = '/user/mgerlach/graph'
PATH_OUT = '/home/mgerlach/REPOS/reader-embedding/output/graph/'
filename_save = 'test_%s_%s_nodes.csv.gz'%(wiki,snapshot)
output_hdfs_dir = os.path.join(base_dir_hdfs,filename_save)
df_nodes.write.mode('overwrite').csv(path=output_hdfs_dir, compression="gzip", header=False, sep="\t")
base_dir_local =  PATH_OUT
output_local_dir_tmp = os.path.join(base_dir_local,'tmp',filename_save)
output_local_file = os.path.join(base_dir_local,filename_save)
os.system('hadoop fs -copyToLocal %s %s'%(output_hdfs_dir,output_local_dir_tmp))
## concatenate and unzip into single file
# os.system('cat %s/* | gunzip > %s'%(output_local_dir_tmp,output_local_file))
os.system('cat %s/*  > %s'%(output_local_dir_tmp,output_local_file))
## remove set of tmp-dirs
os.system('rm -rf %s'%output_local_dir_tmp)
## remove hadoop data
os.system('hadoop fs -rm -r %s'%output_hdfs_dir)

0