# Feature Impact on Portuguese Wikipedia

We're interested in understanding the impact of the Growth features on newcomers on Portuguese Wikipedia because, shortly after the Growth team deployed its features to that wiki, the community turned off editing by unregistered users.

The Phabricator task for this work is [T284705](https://phabricator.wikimedia.org/T284705).

In [2]:
import datetime as dt

import numpy as np
import pandas as pd

from wmfdata import spark

# Active Editors

This query is modified from the active editors query listed in the phab task.

In [1]:
# Analysis window and data snapshot. The dates are 'YYYY-MM-DD' strings because
# they are substituted verbatim into the SQL template via str.format().
wmh_snapshot = '2021-05'   # value for the query's `snapshot = ...` filter
start_date = '2020-11-01'  # first month of the analysis window
end_date = '2021-05-01'    # last month of the analysis window

# Exclusive upper bound for the edit-timestamp filter: the first day of the month
# following `end_date`, so that edits up to the end of that month are included.
# Derived from `end_date` instead of hard-coded so the two cannot drift apart.
end_date_plus_one = (
    pd.Timestamp(end_date) + pd.offsets.MonthBegin(1)
).strftime('%Y-%m-%d')

In [30]:
# Spark SQL template for the monthly share of ptwiki users who edited.
#
# Placeholders (filled with str.format() when the query is run):
#   {START_YYYY_MM_DD}    - first month of the sequence; inclusive lower bound
#                           of the edit-timestamp filter
#   {END_YYYY_MM_DD}      - end of the month sequence
#   {PLUS_ONE_YYYY_MM_DD} - exclusive upper bound of the edit-timestamp filter
#   {SNAPSHOT}            - wmf.mediawiki_history snapshot to read
#
# CTEs:
#   data_months - one row per month, stepping from start to end by 1 month.
#   editors     - user_id and hp_enabled (aliased has_growth_features) for
#                 ptwiki rows of nettrom_growth.hp_variant_test2.
#   edits       - per (month, user) count of revision-create events on ptwiki
#                 by non-anonymous users in content namespaces. num_edits is
#                 computed but not referenced by the final SELECT.
#
# Final SELECT: every month is cross-joined with every editor, then left-joined
# to that editor's edits for that month. Per (month, has_growth_features):
#   num_registrations - COUNT(1) over the joined rows
#   num_editors       - rows with a matching edits record
#   prop_editors      - num_editors / num_registrations as a DOUBLE
#
# NOTE(review): data_months.month is a DATE while edits.y_m is a string built
# by date_format(); the join relies on an implicit cast - confirm this holds on
# the Spark version in use.
# NOTE(review): the counts assume user_id is unique within `editors`; duplicate
# rows in the source table would inflate both counts - TODO confirm.
editors_query = '''
WITH data_months AS (
    SELECT
        explode(sequence(to_date("{START_YYYY_MM_DD}"),
                         to_date("{END_YYYY_MM_DD}"), interval 1 month))
        AS month
),
editors AS (
    SELECT
        user_id,
        hp_enabled AS has_growth_features
    FROM nettrom_growth.hp_variant_test2
    WHERE wiki_db = "ptwiki"
),
edits AS (
    SELECT
        date_format(event_timestamp,'yyyy-MM-01') AS y_m,
        event_user_id,
        COUNT(1) AS num_edits
    FROM wmf.mediawiki_history
    WHERE
        snapshot = '{SNAPSHOT}'
        AND wiki_db='ptwiki'
        AND event_entity = 'revision'
        AND event_type = 'create'
        AND DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) < '{PLUS_ONE_YYYY_MM_DD}'
        AND event_user_is_anonymous = false
        AND page_namespace_is_content
    GROUP BY   
        date_format(event_timestamp,'yyyy-MM-01'),
        event_user_id
)
SELECT
    data_months.month,
    editors.has_growth_features,
    COUNT(1) AS num_registrations,
    COUNT(edits.event_user_id) AS num_editors,
    CAST(COUNT(edits.event_user_id) AS DOUBLE) / COUNT(1) AS prop_editors
FROM data_months
CROSS JOIN editors
LEFT JOIN edits
ON data_months.month = edits.y_m
AND editors.user_id = edits.event_user_id
GROUP BY month, has_growth_features
ORDER BY month, has_growth_features
'''

In [43]:
# Substitute the snapshot and date bounds from the configuration cell into the
# SQL template, then execute it with wmfdata's Spark helper.
editors_df = spark.run(
    editors_query.format(
        SNAPSHOT=wmh_snapshot,
        START_YYYY_MM_DD=start_date,
        END_YYYY_MM_DD=end_date,
        PLUS_ONE_YYYY_MM_DD=end_date_plus_one,
    )
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [44]:
# Store the editor proportion as a percentage on the frame itself.
editors_df['perc_editors'] = editors_df['prop_editors'].mul(100)

# Show the table with the raw proportion hidden and values rounded to 2 decimals.
editors_df.drop(columns=['prop_editors']).round(2)

Unnamed: 0,month,has_growth_features,num_registrations,num_editors,perc_editors
0,2020-11-01,0,2104,771,36.64
1,2020-11-01,1,8297,3069,36.99
2,2020-12-01,0,2104,67,3.18
3,2020-12-01,1,8297,277,3.34
4,2021-01-01,0,2104,44,2.09
5,2021-01-01,1,8297,210,2.53
6,2021-02-01,0,2104,34,1.62
7,2021-02-01,1,8297,153,1.84
8,2021-03-01,0,2104,43,2.04
9,2021-03-01,1,8297,142,1.71
