# This notebook tries to extract the plain text from the wikitext of all articles in a wiki

In [1]:
import os, sys
import string
import pickle
import pandas as pd
import mwparserfromhell
import re
import urllib
import pyarrow.parquet as pq

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

# Spark session used by every cell below; the trailing bare name displays it
# as the cell output.
session_options = {'type': 'yarn-large', 'app_name': 'Pyspark notebook'}
spark = wmfspark.get_session(**session_options)
spark

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [2]:
# directory prefix under which the resulting parquet file is written
PATH_hadoop = "/user/mgerlach/test/"
# snapshot of the wikitext table to read
snapshot = '2021-01'
# wiki project to process (compared against the wiki_db column)
wiki_id = 'enwiki'

In [3]:
## patterns used to extract the plain text from wikitext
# the regexes are taken from here:
# https://github.com/epfl-dlab/WikiPDA/blob/master/PaperAndCode/TopicsExtractionPipeline/GenerateDataframes.py

# [[link]] or [[link|anchor]]: the link target is at most 256 characters and
# may not contain a newline or any of | ] [ < > { }; the optional anchor may
# not contain "[".
LINKS_PATTERN = r"\[\[(?P<link>[^\n\|\]\[\<\>\{\}]{0,256})(?:\|(?P<anchor>[^\[]*?))?\]\]"
# <ref ...>content</ref> where the content is non-empty and contains no "<"
REFERENCES_PATTERN = r"<ref[^>]*>[^<]+<\/ref>"

links_regex = re.compile(LINKS_PATTERN)
references_regex = re.compile(REFERENCES_PATTERN)
def get_plain_text_without_links(row):
    """Turn one article's wikitext into lower-cased plain text.

    Wikilinks and ``<ref>...</ref>`` references are each replaced by a single
    "." (which interrupts the sentence) before the remaining markup is
    stripped with mwparserfromhell.

    Args:
        row: record exposing ``pid``, ``title`` and ``wikitext`` attributes.

    Returns:
        A Row with the fields ``pid``, ``title`` and ``text``, where ``text``
        is the lower-cased plain text of the article.
    """
    # swap links and references for a dot using the module-level patterns
    wikicode_without_links = links_regex.sub(".", row.wikitext)
    wikicode_without_links = references_regex.sub(".", wikicode_without_links)
    # Best effort: if mwparserfromhell cannot handle the page, keep the
    # regex-cleaned wikitext. Catch Exception instead of using a bare except
    # so that KeyboardInterrupt / SystemExit are not silently swallowed.
    try:
        text = mwparserfromhell.parse(wikicode_without_links).strip_code()
    except Exception:
        text = wikicode_without_links
    return T.Row(pid=row.pid, title=row.title, text=text.lower())

In [4]:
## all articles in main namespace: pid, title, and wikitext

# restrict to the configured wiki project and snapshot
in_selected_dump = (F.col('wiki_db') == wiki_id) & (F.col('snapshot') == snapshot)
# main namespace only, and no redirect-pages
is_main_article = (F.col('page_namespace') == 0) & (F.col('page_redirect_title') == '')
# the revision must carry non-empty text
has_wikitext = (
    F.col('revision_text').isNotNull()
    & (F.length(F.col('revision_text')) > 0)
)

wikitext_current = spark.read.table('wmf.mediawiki_wikitext_current')
articles = (
    wikitext_current
    .filter(in_selected_dump & is_main_article & has_wikitext)
    .select(
        F.col("page_id").alias("pid"),
        F.col("page_title").alias("title"),
        F.col("revision_text").alias("wikitext"),
    )
)
articles.show()

+--------+--------------------+--------------------+
|     pid|               title|            wikitext|
+--------+--------------------+--------------------+
|60972024|            Inubaran|{{Infobox prepare...|
|66448144|Warekena Velha la...|{{distinguish|War...|
|49721649|              VPS35L|{{Infobox_gene}}
...|
|34644923|Panagiotis Vosniadis|{{short descripti...|
|  345759|   Guarani languages|{{short descripti...|
|41498124|Balón de Oro (Mex...|{{About|the Mexic...|
|   18306|      Latin alphabet|{{pp-protected|sm...|
|44671427|      Vasiliy Averin|{{short descripti...|
| 6183007|Beaver River (Okl...|{{Infobox river
|...|
|53233810|John Weston (1651...|'''John Weston'''...|
|51502469|Ross Creek (Misso...|'''Ross Creek''' ...|
|  782332|         Vernon Hill|{{other people}}
...|
|47548598|Garth Hudson (foo...|{{short descripti...|
|  516838| Micro-g environment|[[File:STS-130 En...|
|65708528|South Carolina Hi...|'''South Carolina...|
|59558748|South Pudong Road...|{{Infobox stati

In [5]:
## apply the filtering of the text
# clean every article on the underlying RDD, then build a DataFrame from the
# resulting rows
plain_text_rows = articles.rdd.map(get_plain_text_without_links)
plain_text_without_links = spark.createDataFrame(plain_text_rows)
plain_text_without_links.show()

+--------+--------------------+--------------------+
|     pid|                text|               title|
+--------+--------------------+--------------------+
|60972024|inubaran, is a . ...|            Inubaran|
|66448144|warekena velha (g...|Warekena Velha la...|
|49721649|vps35l is a . enc...|              VPS35L|
|34644923|panagiotis vosnia...|Panagiotis Vosniadis|
|  345759|the guarani langu...|   Guarani languages|
|41498124|balón de oro, als...|Balón de Oro (Mex...|
|   18306|the latin alphabe...|      Latin alphabet|
|44671427|vasiliy kuzmich a...|      Vasiliy Averin|
| 6183007|the beaver river ...|Beaver River (Okl...|
|53233810|john weston (1651...|John Weston (1651...|
|51502469|ross creek is a ....|Ross Creek (Misso...|
|  782332|. 
vernon w. hill...|         Vernon Hill|
|47548598|george "garth" wi...|Garth Hudson (foo...|
|  516838|thumb|300px|the ....| Micro-g environment|
|65708528|south carolina hi...|South Carolina Hi...|
|59558748|south pudong road...|South Pudong Ro

In [6]:
## try to write the table to parquet-file
# NOTE(review): the recorded run of this cell was aborted because YARN killed
# an executor for exceeding memory limits (see the error output below); the
# message suggests boosting spark.yarn.executor.memoryOverhead.
parquet_name = "%s.plain_text_without_links.parquet" % wiki_id
FILE_hadoop = PATH_hadoop + parquet_name
overwriting_writer = plain_text_without_links.write.mode("overwrite")
overwriting_writer.parquet(FILE_hadoop)

Py4JJavaError: An error occurred while calling o181.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:566)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 189 in stage 3.0 failed 4 times, most recent failure: Lost task 189.3 in stage 3.0 (TID 995, an-worker1098.eqiad.wmnet, executor 240): ExecutorLostFailure (executor 240 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  55.3 GB of 45.9 GB virtual memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 33 more
