# Data Gathering of Non-Tagged Edits

[T266982](https://phabricator.wikimedia.org/T266982) aims to understand the effect of the Homepage with Newcomer Tasks on edits other than those made through the Newcomer Tasks module.

This notebook modifies the data gathering query used for the productivity analysis to only gather non-tagged edits, then stores that in a TSV for reading into R later.

In [1]:
import datetime as dt

import numpy as np
import pandas as pd

from wmfdata import spark, mariadb
from growth import utils

In [2]:
## Configuration variables

## Value matched against the `snapshot` column of wmf.mediawiki_history in the query
wmf_snapshot = '2020-10'

## Start and end timestamps of the data gathering window.
## NOTE(review): the original comment said these correspond to dates/times "listed above",
## but this notebook has no such list — presumably it lives in the original data gathering
## notebook (T230174.ipynb); confirm. Only the date part of cs_start_ts is passed to the
## query, and cs_end_ts is not referenced by any cell visible in this notebook.
cs_start_ts = dt.datetime(year = 2019, month = 11, day = 21,
                          hour = 0, minute = 24, second = 32)
cs_end_ts = dt.datetime(year = 2020, month = 5, day = 14)

## Fully qualified name of the canonical user table that the edit counts are joined onto
canonical_user_table = 'nettrom_growth.newcomer_task_exp_users'

## Path of the TSV file that the resulting dataset is written to
nontagged_edit_data_output_filename = 'datasets/newcomer_tasks_nontagged_edits_nov2020.tsv'

In [3]:
## This is the same query as "edit_data_query" in the original data gathering (T230174.ipynb)
## except it excludes any edit tagged with "newcomer task".
##
## Placeholders filled in via str.format(): {snapshot}, {start_date}, {exp_user_table}.
##
## The tag filter is wrapped in COALESCE because Spark SQL's array_contains() returns NULL
## when the array itself is NULL. Comparing that NULL with "= false" is never true, which
## would silently drop every revision whose revision_tags is NULL — and such a revision
## cannot carry the "newcomer task" tag, so it is exactly the kind of edit we want to count.

nontagged_edit_data_query = '''
WITH edits AS (
    SELECT wiki_db, event_user_id AS user_id,
    -- ns 0 & 1 edits on the first day
    SUM(IF(page_namespace IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_article_edits_24hrs,
    -- ns 0 & 1 edits on the first day that were reverted
    SUM(IF(page_namespace IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_article_reverts_24hrs,
    --  other namespace edits on the first day
    SUM(IF(page_namespace NOT IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_other_edits_24hrs,
    -- other namespace reverts on the first day
    SUM(IF(page_namespace NOT IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_other_reverts_24hrs,
    -- ns 0 & 1 edits on days 1–15
    SUM(IF(page_namespace IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_article_edits_2w,
    -- ns 0 & 1 edits on days 1–15 that were reverted
    SUM(IF(page_namespace IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_article_reverts_2w,
    -- other namespace edits on days 1–15
    SUM(IF(page_namespace NOT IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_other_edits_2w,
    -- other namespace reverts on days 1–15
    SUM(IF(page_namespace NOT IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_other_reverts_2w
    FROM wmf.mediawiki_history
    WHERE snapshot = "{snapshot}"
    AND event_entity = "revision"
    AND event_type = "create"
    AND wiki_db IN ("cswiki", "kowiki", "arwiki", "viwiki")
    AND event_timestamp > "{start_date}"
    -- NULL-safe exclusion: a NULL tag array counts as "not tagged" and is kept
    AND NOT COALESCE(array_contains(revision_tags, "newcomer task"), false)
    GROUP BY wiki_db, event_user_id
),
users AS (
    SELECT wiki_db, user_id, user_registration_timestamp, reg_on_mobile, is_treatment
    FROM {exp_user_table}
)
-- LEFT JOIN keeps every user in the experiment table; users without any matching
-- edits get NULL counts, which COALESCE turns into 0.
SELECT users.wiki_db, users.user_id, users.user_registration_timestamp, reg_on_mobile, is_treatment,
    COALESCE(num_article_edits_24hrs, 0) AS num_article_edits_24hrs,
    COALESCE(num_article_reverts_24hrs, 0) AS num_article_reverts_24hrs,
    COALESCE(num_other_edits_24hrs, 0) AS num_other_edits_24hrs,
    COALESCE(num_other_reverts_24hrs, 0) AS num_other_reverts_24hrs,
    COALESCE(num_article_edits_2w, 0) AS num_article_edits_2w,
    COALESCE(num_article_reverts_2w, 0) AS num_article_reverts_2w,
    COALESCE(num_other_edits_2w, 0) AS num_other_edits_2w,
    COALESCE(num_other_reverts_2w, 0) AS num_other_reverts_2w
FROM users
LEFT JOIN edits
ON users.wiki_db = edits.wiki_db
AND users.user_id = edits.user_id
'''

In [4]:
## Fill in the query's three placeholders from the configuration variables and run it
## through Spark. Only the date part of cs_start_ts is used as the lower bound on
## event_timestamp. The result is used as a pandas DataFrame by the cells below.
nontagged_edit_data = spark.run(
    nontagged_edit_data_query.format(
        **{
            'snapshot': wmf_snapshot,
            'start_date': cs_start_ts.date().isoformat(),
            'exp_user_table': canonical_user_table,
        }
    )
)

In [5]:
## Number of rows in the result (the query LEFT JOINs edits onto the user table)
nontagged_edit_data.shape[0]

97755

In [None]:
## Sanity check: peek at the first few rows with at least one non-tagged ns 0/1 edit
## in the first 24 hours
nontagged_edit_data[nontagged_edit_data['num_article_edits_24hrs'].gt(0)].head()

Write out the non-tagged edit dataset as a TSV for importing into R.

In [7]:
## Tab-separated output with a header row and without the DataFrame index
nontagged_edit_data.to_csv(
    nontagged_edit_data_output_filename,
    sep = '\t',
    index = False,
    header = True,
)