In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pyspark.sql.functions as funcs
from pyspark.sql.window import Window

In [None]:
%matplotlib inline
mpl.style.use({
    "font.size": 16,
    "figure.figsize": (14, 7),
    "axes.grid": True,
    "axes.autolimit_mode": "data",
    "axes.xmargin": 0,
    "axes.ymargin": 0
})

In [3]:
# Snapshot shared by both datasets below. Keeping it in one place guarantees
# the history and the namespace map are always read from the same snapshot.
SNAPSHOT = "2019-04"

# MediaWiki history events for the chosen snapshot (Parquet).
mwh = spark.read.parquet(
    "/wmf/data/wmf/mediawiki/history/snapshot={}".format(SNAPSHOT)
)

# The namespace map is read as CSV, so its column names and types are supplied
# explicitly as a DDL schema string.
ns_map_schema = """
    `hostname` string COMMENT 'Canonical URL for the project, for example ja.wikipedia.org', 
    `dbname` string COMMENT 'Database name for the project, for example jawiki', 
    `namespace` int COMMENT 'for example 0, 100, etc.', 
    `namespace_canonical_name` string COMMENT 'the english prefix if exists, otherwise the localized prefix', 
    `namespace_localized_name` string COMMENT 'the localized prefix', 
    `namespace_is_content` int COMMENT 'Whether this namespace is a content namespace'
"""
ns_map = spark.read.csv(
    "/wmf/data/raw/mediawiki/project_namespace_map/snapshot={}".format(SNAPSHOT),
    schema=ns_map_schema
)

In [None]:
# A history row matches a namespace-map row when both the wiki database name
# and the namespace number agree.
same_wiki = mwh.wiki_db == ns_map.dbname
same_namespace = mwh.page_namespace == ns_map.namespace
join_conds = [same_wiki, same_namespace]

# Revision creations timestamped between "2019-04" and "2019-05" whose user id
# is not 0 (presumably this drops anonymous edits -- confirm against the
# history schema).
apr_revision_creates = """
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp between "2019-04" and "2019-05" and
        event_user_id != 0
    """

# Keep only the edits made in namespaces flagged as content namespaces.
apr_content_edits = (
    mwh
    .filter(apr_revision_creates)
    .join(ns_map, on=join_conds, how="inner")
    .filter("namespace_is_content = 1")
)

# User names with at least 5 content edits in the window, exposed as a
# single `user_name` column.
apr_active_eds = (
    apr_content_edits
    .groupBy("event_user_text")
    .agg(funcs.count(funcs.lit(1)).alias("content_edits"))
    .filter("content_edits >= 5")
    .selectExpr("event_user_text as user_name")
)

In [None]:
# Every revision-creation event in the history, with no time, user or
# namespace restriction.
edits = mwh.filter("""
    event_entity = "revision" and 
    event_type = "create"
""")

# Total number of revisions ever created under each April-active user name.
#
# The previous version left-joined the small editor list (on the left) against
# the full revision history. A left outer join cannot broadcast its left side,
# so Spark had to shuffle the entire history before aggregating. Here the
# history stays on the left and the editor list is broadcast to the executors;
# the left-semi join then acts as a map-side filter, and only the rows of
# active editors reach the groupBy.
#
# Output columns are unchanged: `event_user_text`, `edits`.
# NOTE(review): this assumes the active-editor list is small enough to
# broadcast -- confirm. The only behavioural difference from the left join is
# that a NULL `user_name` (if one exists) no longer yields a NULL group.
edit_counts = (
    edits
    .join(
        funcs.broadcast(apr_active_eds),
        edits.event_user_text == apr_active_eds.user_name,
        how="left_semi"
    )
    .groupBy("event_user_text")
    .count()
    .withColumnRenamed("count", "edits")
)

# The stray `edit_counts()` call was removed: a DataFrame is not callable, so
# it raised TypeError. Nothing is computed until an action such as `.count()`
# is invoked on `edit_counts`.

In [None]:
# `count()` is a Spark action: it runs the job and returns the number of rows
# in `edit_counts` (one row per grouped user name).
n_edit_count_rows = edit_counts.count()
n_edit_count_rows

In [None]:
# `edit_counts` is a Spark DataFrame, and `edit_counts["edits"]` is a Spark
# Column with no pandas-style `.plot` accessor. Collect the single numeric
# column (one row per grouped user name) to the driver as a pandas Series
# first, then draw the box plot from that.
edits_per_editor = edit_counts.select("edits").toPandas()["edits"]

ax = edits_per_editor.plot.box()
ax.set_ylabel("Edits per editor");