In [1]:
import wmfdata as wmf

In [2]:
spark = wmf.spark.create_session(type="yarn-large")

SPARK_HOME: /usr/lib/spark3
Using Hadoop client lib jars at 3.2.0, provided by Spark.
PYSPARK_PYTHON=/opt/conda-analytics/bin/python3


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/18 23:06:32 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/08/18 23:06:41 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!


In [5]:
MWH_SNAPSHOT = "2023-07"
END_TIME = "2023-08-01"

editor_month_query = f"""
    SELECT
        TO_DATE(DATE_TRUNC("MONTH", event_timestamp)) AS month,
        wiki_db AS wiki,
        event_user_text AS user_name,
        COUNT(1) AS edit_count,
        SUM(CAST(page_namespace_is_content AS INT)) AS content_edit_count,
        MAX(SIZE(event_user_is_bot_by_historical) > 0) AS user_is_bot,
        MAX(event_user_is_anonymous) AS user_is_anonymous,
        TO_TIMESTAMP(MIN(
            LEAST(
                event_user_registration_timestamp,
                event_user_creation_timestamp,
                event_user_first_edit_timestamp
            )
        )) AS user_registration_time
    FROM wmf.mediawiki_history mwh
    INNER JOIN canonical_data.wikis cdw
        ON wiki_db = database_code
    WHERE
        event_timestamp < "{END_TIME}"
        AND event_entity = "revision"
        AND event_type = "create"
        AND snapshot = "{MWH_SNAPSHOT}"
        AND database_group IN (
            "commons",
            "incubator",
            "foundation",
            "mediawiki",
            "meta",
            "sources",
            "species",
            "wikibooks",
            "wikidata",
            "wikifunctions",
            "wikinews",
            "wikipedia",
            "wikiquote",
            "wikisource",
            "wikiversity",
            "wikivoyage",
            "wiktionary"
        )
    GROUP BY
        TO_DATE(DATE_TRUNC("MONTH", event_timestamp)),
        wiki_db,
        event_user_text
"""

editor_month = spark.sql(editor_month_query)

In [None]:
(
    editor_month
    .repartition(1)
    .write
    .csv(
        "2023-07_editor_month",
        mode="overwrite",
        compression="gzip",
        sep="\\t",
        header=True,
        timestampFormat="yyyy-MM-dd'T'HH:mm:ss'Z'"
    )
)

In [7]:
!hdfs dfs -du -h 2023-07_editor_month

0      2023-07_editor_month/_SUCCESS
3.4 G  2023-07_editor_month/part-00000-c209dc6c-d061-4c28-8fd2-86ccd1b2a62a-c000.csv.gz


In [8]:
em = spark.read.csv(
    "2023-07_editor_month",
    sep="\\t",
    header=True
)

                                                                                

In [9]:
em_gu = em.filter("wiki = 'guwiki'").toPandas()

                                                                                

In [17]:
em_gu

Unnamed: 0,month,wiki,user_name,edit_count,content_edit_count,user_is_bot,user_is_anonymous,user_registration_time
0,2005-06-01,guwiki,en>Mzajac,1,0,false,true,
1,2005-08-01,guwiki,en>Who,1,0,false,true,
2,2005-11-01,guwiki,217.136.50.143,5,0,false,true,
3,2005-11-01,guwiki,imported>Sikon,1,0,false,true,
4,2005-12-01,guwiki,en>Pinworm,2,0,false,true,
...,...,...,...,...,...,...,...,...
87047,2022-07-01,guwiki,en>TheDJ,1,0,false,true,
87048,2022-08-01,guwiki,106.213.253.31,5,5,false,true,
87049,2022-11-01,guwiki,2405:205:C94C:8C4:0:0:8F3:18A5,1,1,false,true,
87050,2023-04-01,guwiki,શૈલેષભાઈ ગીરજાશંકરભાઈ મહેતા,3,0,false,false,2023-04-20T04:16:57Z


In [45]:
em_gu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87052 entries, 0 to 87051
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   month              87052 non-null  object
 1   wiki               87052 non-null  object
 2   user_name          87041 non-null  object
 3   edits              87052 non-null  object
 4   content_edits      86038 non-null  object
 5   user_is_bot        87052 non-null  object
 6   user_is_anonymous  87042 non-null  object
 7   user_registration  19128 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


In [19]:
!hdfs dfs -get \
    2023-07_editor_month/part-00000-c209dc6c-d061-4c28-8fd2-86ccd1b2a62a-c000.csv.gz \
    ~/2023-07_editor_month.tsv.gz