In [1]:
import findspark

findspark.init("/usr/lib/spark2")
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import sum as _sum
import os.path

import wmfdata.spark as wmfspark
# Create the Spark session through the wmfdata helper.
# NOTE: spark_config is defined here but is not handed to get_session();
# add `extra_settings=spark_config` to the call to apply custom settings.
spark_config = {}
spark = wmfspark.get_session(app_name='Pyspark notebook', type='regular')
spark

import mwparserfromhell
import pandas as pd
import urllib
from pyspark.sql import types as T
import datetime
import dateutil.relativedelta
import pyarrow.parquet as pq

You are using wmfdata v1.3.2, but v2.0.0 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release --ignore-installed`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


# abbreviations

In [11]:
# Candidate abbreviations come from the "abbreviations" column of the CSV.
abbr_df = pd.read_csv("abbr_refined.csv")

# Keep entries of at least two characters, so that something is left once the
# final character has been removed.
abbr_list = [abbr for abbr in abbr_df["abbreviations"].tolist() if len(abbr) > 1]

# Same entries with their last character dropped.
# NOTE(review): the last character is removed unconditionally, whether or not
# it is a punctuation mark -- confirm that every CSV entry ends in punctuation.
abbr_list_without_punct = [abbr[:-1] for abbr in abbr_list]

# Lookup set holding both spellings; used to spot matching tokens in articles.
set_words = set(abbr_list).union(abbr_list_without_punct)



In [12]:
def normalise_title(title):
    """
    Normalise a page title or link target.

    Steps, in order:
    - percent-decode the title (e.g. "%C3%A9" -> "é")
    - '_' --> ' '
    - drop everything from the first '#' onwards (section anchor)
    - strip() surrounding whitespace
    - capitalise the first character

    Parameters
    ----------
    title : str or None
        Raw title. None is treated like an empty title.

    Returns
    -------
    str
        The normalised title; "" when the input is None or empty.
    """
    # A bare `import urllib` does not guarantee that the `parse` submodule is
    # loaded, so import the function that is needed explicitly.
    from urllib.parse import unquote

    if title is None:
        return ""

    n_title = unquote(title).replace("_", " ")
    # Remove the section anchor before stripping, so that whitespace left in
    # front of the '#' (or produced by leading/trailing underscores) is removed
    # and the capitalisation below hits the first real character.
    n_title = n_title.split("#", 1)[0].strip()
    if n_title:
        n_title = n_title[0].upper() + n_title[1:]
    return n_title


def extract_article(row):
    """Build a compact article row from one row of the wikitext table.

    Reads page_id, page_title, page_redirect_title and revision_text from
    `row` and returns a Row with the fields:
    - pid:      the page id
    - title:    the normalised page title
    - title_rd: the normalised redirect title ("" when there is none)
    - wikitext: the revision text, unchanged

    A missing (None) redirect title is mapped to "" instead of being passed
    to normalise_title, so such pages are handled the same way as pages whose
    redirect title is already the empty string.
    """
    redirect_title = row.page_redirect_title
    if redirect_title is None:
        redirect_title = ""
    return T.Row(
        pid=row.page_id,
        title=normalise_title(row.page_title),
        title_rd=normalise_title(redirect_title),
        wikitext=row.revision_text,
    )


def get_plain_text_without_links(row):
    """Turn an article's wikitext into plain text.

    The wikitext is parsed with mwparserfromhell and reduced to text with
    strip_code(). Returns a Row(pid, title, text) where the title is passed
    through normalise_title once more.
    """
    parsed = mwparserfromhell.parse(row.wikitext)
    plain_text = parsed.strip_code()
    clean_title = normalise_title(row.title)
    return T.Row(pid=row.pid, title=clean_title, text=plain_text)


def get_valid_ngrams(row):
    """List the known abbreviations that occur in one article.

    row.text is split on whitespace and every token that is a member of the
    module-level `set_words` yields one Row(pid, w). Repeated tokens yield
    repeated rows, in the order they appear in the text.
    """
    return [
        T.Row(pid=row.pid, w=token)
        for token in row.text.split()
        if token in set_words
    ]

In [13]:
snapshot = "2022-11" # snapshot of the wikitext table to read (matched against its `snapshot` column)
wiki_id = "simplewiki" # the single wiki project to process (matched against the `wiki_db` column)

In [14]:
# load the wikitext-table
wikipedia_all = (
    ## select table
    spark.read.table("wmf.mediawiki_wikitext_current")
    ## select wiki project and snapshot
    .where(F.col("wiki_db") == wiki_id)
    .where(F.col("snapshot") == snapshot)
    ## main namespace (0 is the main namespace)
    .where(F.col("page_namespace") == 0)
    ## remove empty articles
    .where(F.col("revision_text").isNotNull())
    .where(F.length(F.col("revision_text")) > 0)
)
# extract article from the wikitext-table, as rows
temp_df = wikipedia_all.rdd.map(extract_article).filter(lambda r: r is not None)
wikipedia = spark.createDataFrame(temp_df)
articles = wikipedia.where(F.col("title_rd") == "").select("pid", "title", "wikitext")
chunks = articles.rdd.map(get_plain_text_without_links) # wikitext to plaintext
rows = chunks.flatMap(get_valid_ngrams) #for each article, identify the abbreviations with & without termninating punctuation and their corresponding frequencies

rows_agg = (
    spark.createDataFrame(rows)
    .groupBy("w")
    .agg(F.count("*").alias("occ"))
).cache()


In [15]:
# Preview the per-token occurrence counts computed above.
rows_agg.show()

+-------+-----+
|      w|  occ|
+-------+-----+
|      .|15439|
|    Far|  730|
|    17.|  386|
|    del| 4564|
|    Apr|  241|
|    mit|  227|
|    Ur.|   17|
|   vale|   10|
|     им|    3|
|   rat.|   28|
|   chin|   72|
|    nm.|   12|
|    .no|    2|
|   mar.|    6|
|generál|    1|
|    د.ك|    1|
|   Kos.|    4|
|    S.P|    1|
|    .tm|    1|
| Diener|    2|
+-------+-----+
only showing top 20 rows



# Aligning the counts of abbreviations with and without terminating punctuation

In [16]:
# One row per abbreviation: the original spelling next to the spelling with
# its last character removed. The occurrence counts of both spellings are
# attached with left joins; a spelling that never occurs gets a count of 0.
df_final = (
    spark.createDataFrame(
        list(zip(abbr_list, abbr_list_without_punct)),
        ["w_with_punct", "w_without_punct"],
    )
    # counts of the spelling that keeps its last character
    .join(
        rows_agg.select(
            F.col("w").alias("w_with_punct"),
            F.col("occ").alias("n_with_punct"),
        ),
        on="w_with_punct",
        how="left",
    )
    # counts of the spelling without its last character
    .join(
        rows_agg.select(
            F.col("w").alias("w_without_punct"),
            F.col("occ").alias("n_without_punct"),
        ),
        on="w_without_punct",
        how="left",
    )
    .fillna(0)
)
df_final.show()

+-------------------+--------------------+------------+---------------+
|    w_without_punct|        w_with_punct|n_with_punct|n_without_punct|
+-------------------+--------------------+------------+---------------+
|               K.22|               K.227|           0|              0|
|              མི་སྐ|              མི་སྐུ|           0|              0|
|               прел|               прел.|           0|              0|
|                 .s|                 .sg|           3|              0|
|               арап|               арап.|           0|              0|
|            нефте..|            нефте...|           0|              0|
|                tab|                tab.|           2|             49|
|མཉམ་འབྲེལ་རྒྱལ་ཚོགས|མཉམ་འབྲེལ་རྒྱལ་ཚོགས།|           0|              0|
|                 ᩃᩯ|                 ᩃᩯ᩵|           0|              0|
|                  བ|                  བྷ|           0|              0|
|              Miälk|              Miälk.|           0|         

In [19]:
# Show the abbreviation pairs ordered by how often the spelling without its
# last character occurs, most frequent first.
df_final.orderBy("n_without_punct",ascending=False).show()

+---------------+------------+------------+---------------+
|w_without_punct|w_with_punct|n_with_punct|n_without_punct|
+---------------+------------+------------+---------------+
|            the|        the?|           0|        1878032|
|             of|         of?|           2|        1274015|
|             in|         in.|         940|         997342|
|             in|         in?|           0|         997342|
|            and|        and.|           6|         902533|
|              a|          a!|           1|         694214|
|              a|          a.|         142|         694214|
|              a|          a?|           0|         694214|
|             is|         is.|         716|         571086|
|             to|         to.|         557|         539690|
|             to|         to?|           3|         539690|
|            was|        was.|         319|         431578|
|           from|       from?|          15|         254425|
|            for|        for?|          