# Wiktionary-dumps

### Starting a Spark session

In [11]:
import os, sys
import datetime
import calendar
import time
import string
import random
import mwparserfromhell

import findspark
findspark.init('/usr/lib/spark2')
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

## defining the spark session
# Extra Spark settings forwarded to the session, e.g. {"spark.driver.memory": "4g"}.
# Left empty here, so only the settings implied by the session type are used.
spark_config = {}
spark = wmfspark.get_session(
    app_name='Pyspark notebook',
    type='regular',
    extra_settings=spark_config,
)
# last expression of the cell: display the session summary
spark

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


# Abbreviations: page titles in Wiktionary ending in a full stop

### Defining a snapshot

In [5]:
## define a snapshot
# This value is compared against the `snapshot` column of every table read in this
# notebook, so that all queries look at the same dump.
# Presumably a monthly label in "YYYY-MM" form -- TODO confirm against the table docs.
snapshot = "2022-09"

### We first get a list with all relevant projects (in this case: wiktionary)

In [6]:
## wiki_db names of all Wiktionary projects present in the chosen snapshot
df_projects = (
    spark.read.table('wmf_raw.mediawiki_project_namespace_map')
    .filter(
        # same snapshot as the rest of the notebook
        (F.col("snapshot") == snapshot)
        # keep only projects whose hostname mentions "wiktionary"
        & F.col("hostname").contains("wiktionary")
    )
    .select(F.col("dbname").alias("wiki_db"))
    # each database name should be listed only once
    .dropDuplicates()
)
# print the first rows in alphabetical order
df_projects.sort("wiki_db").show()

+-------------+
|      wiki_db|
+-------------+
| afwiktionary|
| amwiktionary|
|angwiktionary|
| anwiktionary|
| arwiktionary|
|astwiktionary|
| aywiktionary|
| azwiktionary|
|bclwiktionary|
| bewiktionary|
| bgwiktionary|
|bjnwiktionary|
| bnwiktionary|
| brwiktionary|
| bswiktionary|
| cawiktionary|
|chrwiktionary|
| cowiktionary|
|csbwiktionary|
| cswiktionary|
+-------------+
only showing top 20 rows



In [9]:
## English Wiktionary pages in the main namespace whose title ends in a full stop
## columns kept: wiki_db, page_title
df_pages = (
    spark.read.table('wmf_raw.mediawiki_page')
    .filter(
        # restrict to the chosen snapshot
        (F.col('snapshot') == snapshot)
        # English Wiktionary only; to cover every Wiktionary, drop this condition
        # and inner-join with df_projects on "wiki_db" instead
        & (F.col("wiki_db") == "enwiktionary")
        # main namespace
        & (F.col('page_namespace') == 0)
        # TODO: add other punctuation symbols
        & F.col("page_title").endswith(".")
    )
    .select("wiki_db", "page_title")
)
# show some examples
df_pages.show(n=20, truncate=False)

+------------+----------+
|wiki_db     |page_title|
+------------+----------+
|enwiktionary|A.M.      |
|enwiktionary|i.e.      |
|enwiktionary|e.g.      |
|enwiktionary|imp.      |
|enwiktionary|gram.     |
|enwiktionary|cf.       |
|enwiktionary|p._pr.    |
|enwiktionary|vb._n.    |
|enwiktionary|chem.     |
|enwiktionary|pl.       |
|enwiktionary|compar.   |
|enwiktionary|superl.   |
|enwiktionary|et_al.    |
|enwiktionary|уст.      |
|enwiktionary|A.K.A.    |
|enwiktionary|inc.      |
|enwiktionary|Ltd.      |
|enwiktionary|assoc.    |
|enwiktionary|ave.      |
|enwiktionary|st.       |
+------------+----------+
only showing top 20 rows



In [10]:
# number of page titles matched by the filters that define df_pages
df_pages.count()

4832

# Parsing the wikitext

A page in Wiktionary can contain entries for several different languages.

For example, [уст.](https://en.wiktionary.org/wiki/%D1%83%D1%81%D1%82.) is a page in English Wiktionary, but its entry belongs to Russian.

To find out which languages a page covers, we need to extract the level-2 section titles from the wikitext.

In [39]:
@F.udf(returnType=T.ArrayType(T.StringType()))
def get_languages(wikitext):
    """Return the titles of all level-2 headings found in ``wikitext``.

    Parameters
    ----------
    wikitext : str or None
        Raw wikitext of a page revision.

    Returns
    -------
    list of str
        One entry per top-level ``==Title==`` heading, in document order,
        with surrounding whitespace removed. Empty when the text is missing,
        empty, or contains no level-2 heading.
    """
    # Nothing to parse: report "no languages" instead of handing None to the parser.
    if not wikitext:
        return []
    wikicode = mwparserfromhell.parse(wikitext)
    # Read the heading titles directly. Going through
    # get_sections(levels=[2], flat=True) yields heading *plus* section body, so
    # any text placed between the level-2 heading and the first sub-heading
    # would end up inside the returned title.
    # recursive=False keeps the same scope as get_sections: only headings at
    # the top level of the document are considered.
    return [
        str(heading.title).strip()
        for heading in wikicode.filter_headings(recursive=False)
        if heading.level == 2
    ]

In [41]:
## English Wiktionary pages in the main namespace whose title ends in a full stop,
## together with the section titles (languages) parsed from their current wikitext
## columns kept: wiki_db, page_title, languages
df_pages = (
    spark.read.table('wmf.mediawiki_wikitext_current')
    .filter(
        # restrict to the chosen snapshot
        (F.col('snapshot') == snapshot)
        # English Wiktionary only; to cover every Wiktionary, drop this condition
        # and inner-join with df_projects on "wiki_db" instead
        & (F.col("wiki_db") == "enwiktionary")
        # main namespace
        & (F.col('page_namespace') == 0)
        # TODO: add other punctuation symbols
        & F.col("page_title").endswith(".")
        # only rows that actually carry wikitext
        & F.col("revision_text").isNotNull()
        & (F.length(F.col("revision_text")) > 0)
    )
    .select(
        "wiki_db",
        "page_title",
        # extract section titles (languages)
        get_languages(F.col("revision_text")).alias("languages"),
    )
)
# show some examples
df_pages.show(n=20, truncate=False)

+------------+----------+-----------------------+
|wiki_db     |page_title|languages              |
+------------+----------+-----------------------+
|enwiktionary|Subcomm.  |[English]              |
|enwiktionary|astron.   |[English]              |
|enwiktionary|ism.      |[Hungarian]            |
|enwiktionary|add.      |[English]              |
|enwiktionary|prisl.    |[Slovene]              |
|enwiktionary|comp.     |[English]              |
|enwiktionary|非...才...|[]                     |
|enwiktionary|aschwed.  |[German]               |
|enwiktionary|d.o.f.    |[English]              |
|enwiktionary|urgerm.   |[German]               |
|enwiktionary|C.Bssn.   |[English, French]      |
|enwiktionary|Admin.    |[English]              |
|enwiktionary|Gr.Tr.    |[German]               |
|enwiktionary|Fg.       |[German, Italian]      |
|enwiktionary|Z.O.Z.    |[]                     |
|enwiktionary|พฤ.       |[Thai]                 |
|enwiktionary|vars.     |[Translingual, Finnish]|
|e