# User Dataset

Like we've done for other experiments, I want a canonical dataset of users that will be included in the analysis. This makes writing queries easier, and ensures that we use the same user base for all analysis.

In [3]:
import json
import datetime as dt

from collections import defaultdict

import numpy as np
import pandas as pd

from wmfdata import hive, spark, mariadb
from growth import utils

In [4]:
## The Variant C/D experiment starts at the moment the edit tag bug fix went into place
exp_start_ts = dt.datetime(year = 2020, month = 10, day = 28,
                           hour = 18, minute = 40, second = 2)

## The experiment window is four weeks long, counted from the start timestamp
exp_end_ts = exp_start_ts + dt.timedelta(weeks = 4)

## Ordered list of the wikis that we'll be gathering data for.
## euwiki is left out because of its small number of registrations.
wikis = [
    'cswiki', 'kowiki', 'viwiki', 'arwiki',
    'ukwiki', 'huwiki', 'srwiki', 'hywiki',
    'frwiki', 'fawiki', 'hewiki', 'ruwiki',
    'plwiki', 'ptwiki', 'svwiki', 'trwiki',
]

## Lists of known users to ignore (e.g. test accounts and experienced users)
## `known_users` maps a wiki name to a set of user IDs on that wiki. Because it is a
## `defaultdict(set)`, looking up a wiki that has no hand-curated IDs below yields
## (and stores) an empty set rather than raising a KeyError.
known_users = defaultdict(set)
known_users['cswiki'].update([14, 127629, 303170, 342147, 349875, 44133, 100304, 307410, 439792, 444907,
                              454862, 456272, 454003, 454846, 92295, 387915, 398470, 416764, 44751, 132801,
                              137787, 138342, 268033, 275298, 317739, 320225, 328302, 339583, 341191,
                              357559, 392634, 398626, 404765, 420805, 429109, 443890, 448195, 448438,
                              453220, 453628, 453645, 453662, 453663, 453664, 440694, 427497, 272273,
                              458025, 458487, 458049, 59563, 118067, 188859, 191908, 314640, 390445,
                              451069, 459434, 460802, 460885, 79895, 448735, 453176, 467557, 467745,
                              468502, 468583, 468603, 474052, 475184, 475185, 475187, 475188, 294174,
                              402906, 298011])

known_users['kowiki'].update([303170, 342147, 349875, 189097, 362732, 384066, 416362, 38759, 495265,
                              515553, 537326, 566963, 567409, 416360, 414929, 470932, 472019, 485036,
                              532123, 558423, 571587, 575553, 576758, 360703, 561281, 595100, 595105,
                              595610, 596025, 596651, 596652, 596653, 596654, 596655, 596993, 942,
                              13810, 536529])

known_users['viwiki'].update([451842, 628512, 628513, 680081, 680083, 680084, 680085, 680086, 355424,
                              387563, 443216, 682713, 659235, 700934, 705406, 707272, 707303, 707681, 585762])

known_users['arwiki'].update([237660, 272774, 775023, 1175449, 1186377, 1506091, 1515147, 1538902,
                              1568858, 1681813, 1683215, 1699418, 1699419, 1699425, 1740419, 1759328, 1763990])

## Grab the user IDs of known test accounts so they can be added to the exclusion list

def get_known_users(wiki):
    '''
    Look up the known test accounts on `wiki` and return their user IDs as a set.

    An account counts as known when its user name starts with one of the
    prefixes listed below; one query is run against `wiki` per prefix.
    '''

    id_query = '''
SELECT user_id
FROM user
WHERE user_name LIKE "{name_pattern}%"
    '''

    name_prefixes = ("MMiller", "Zilant", "Roan", "KHarlan", "MWang", "SBtest",
                     "Cloud", "Rho2019", "Test")

    ## Accumulate the IDs across all prefixes; a set keeps each ID only once
    matched_ids = set()
    for prefix in name_prefixes:
        matches = mariadb.run(id_query.format(name_pattern = prefix), wiki)
        matched_ids.update(matches['user_id'])

    return(matched_ids)
        
## Add the test accounts found in each wiki's database to that wiki's exclusion set
for wiki in wikis:
    known_users[wiki].update(get_known_users(wiki))

In [60]:
## Filename of where the canonical datasets are stored.
tsv_canonical_user_file = '/home/nettrom/src/Growth-homepage-2019/datasets/variant-test-2-canonical-users.tsv'
tsv_datalake_user_file = '/home/nettrom/src/Growth-homepage-2019/datasets/variant-test-2-datalake-users.tsv'
canonical_user_table = 'nettrom_growth.hp_variant_test2'

## Helper Functions

In [6]:
def make_known_users_sql(kd, wiki_column, user_column):
    '''
    Based on the dictionary `kd` mapping wiki names to sets of user IDs of known users,
    create a SQL expression that keeps a row when `wiki_column` matches one of the wikis
    in `kd` and `user_column` is not one of that wiki's known user IDs.

    The result is one parenthesised clause per wiki, joined with OR:

        (wiki = 'a' AND user NOT IN (1,2)) OR (wiki = 'b' AND user NOT IN (3))

    Note that rows from a wiki that has no entry in `kd` match none of the clauses.

    :param kd: mapping of wiki name to an iterable of user IDs to exclude on that wiki
    :type kd: dict

    :param wiki_column: name of the SQL column holding the wiki name
    :type wiki_column: str

    :param user_column: name of the SQL column holding the user ID
    :type user_column: str

    :returns: the SQL expression as a string; `1 = 1` if `kd` is empty, since there
              is then nothing to exclude
    '''

    wiki_exp = '''({w_column} = '{wiki}' AND {u_column} NOT IN ({id_list}))'''

    ## A wiki without any known users only needs the wiki match: `NOT IN ()` with an
    ## empty list is a syntax error, and there are no users to exclude anyway.
    wiki_only_exp = '''({w_column} = '{wiki}')'''

    expressions = list()

    ## Iteratively build the expression for each wiki
    for wiki_name, wiki_users in kd.items():
        if wiki_users:
            expressions.append(wiki_exp.format(
                w_column = wiki_column,
                wiki = wiki_name,
                u_column = user_column,
                ## sorted so the generated SQL does not depend on set iteration order
                id_list = ','.join([str(u) for u in sorted(wiki_users)])
            ))
        else:
            expressions.append(wiki_only_exp.format(
                w_column = wiki_column,
                wiki = wiki_name
            ))

    ## Without any wikis there is nothing to exclude; return an always-true expression
    ## so the caller's `AND (...)` stays valid SQL.
    if not expressions:
        return('1 = 1')

    ## We then join all the expressions with an OR, and we're done.
    return(' OR '.join(expressions))
    

In [7]:
def make_when_then(wiki_list, wiki_column):
    '''
    Build the WHEN/THEN branches of a SQL CASE expression that maps each wiki
    in `wiki_list` to its zero-padded, two-digit position in that list, e.g.
    `WHEN wiki_column = "cswiki" THEN "00"`. The positions can then be used
    to order results the same way as `wiki_list`.

    Returns the branches as a single string, one branch per line.
    '''

    branch = 'WHEN {column} = "{wiki}" THEN "{position:02}"'

    return('\n'.join(
        branch.format(column = wiki_column, wiki = name, position = idx)
        for idx, name in enumerate(wiki_list)
    ))


## User Registrations

We get user registrations for users registered between the start and the end of the experiment, on all wikis, with known test accounts excluded.

In [8]:
## Query for account creations logged in `event_sanitized.serversideaccountcreation`,
## limited to self-made, non-API registrations.
## Placeholders to fill in with `str.format()`:
##   wiki_list                - comma-separated list of quoted wiki names
##   known_userid_expression  - SQL expression excluding known users per wiki
##   exp_start_timestamp,
##   exp_end_timestamp        - registration window; note that BETWEEN includes both ends
## The year/month filter is hard-coded to October and November 2020, so it has to be
## changed by hand if the experiment window changes.
user_registrations_query = '''
SELECT wiki, event.userid AS user_id,
       dt AS reg_timestamp,
       CAST(event.displaymobile AS INT) AS reg_on_mobile
FROM event_sanitized.serversideaccountcreation
WHERE year = 2020 AND month IN (10, 11)
  AND wiki IN ({wiki_list})
  AND ({known_userid_expression})
  AND event.isselfmade = true
  AND event.isapi = false
  AND dt BETWEEN "{exp_start_timestamp}" AND "{exp_end_timestamp}"
'''

In [9]:
user_registrations = spark.run(
    user_registrations_query.format(
        wiki_list = ','.join(['"{}"'.format(w) for w in wikis]),
        known_userid_expression = make_known_users_sql(known_users, 'wiki', 'event.userid'),
        exp_start_timestamp = exp_start_ts.strftime(utils.hive_format),
        exp_end_timestamp = exp_end_ts.strftime(utils.hive_format)
    )
)

In [10]:
## verify first registration in the dataset
user_registrations['reg_timestamp'].min()

'2020-10-28T18:40:26Z'

In [11]:
## verify last registration in the dataset
user_registrations['reg_timestamp'].max()

'2020-11-25T18:37:56Z'

Those timestamps fit our requirements: the first one is about half a minute after the edit tag bug fix was deployed, and the last one is a few minutes before the four-week cutoff.

In [12]:
def get_mw_regs(wikis, start_timestamp, end_timestamp):
    '''
    Get self-registrations from the MediaWiki database of each wiki in `wikis`
    for accounts registered at or after `start_timestamp` and before `end_timestamp`.

    Each wiki is queried separately and the results are concatenated. Per account,
    the result has the wiki, user ID, user name, registration timestamp, whether
    the user name matches the bot name pattern (`bot_by_name`), and whether the
    account is in the "bot" user group (`bot_by_group`).
    '''

    ## Bot user name pattern, from Analytics Engineering:
    ## https://gerrit.wikimedia.org/r/#/c/analytics/refinery/source/+/504025/
    bot_name_regex = r"^.*bot([^a-z].*$|$)"

    ## Using "CONVERT" to make the regexp case-insensitive
    registration_sql = '''
    SELECT "{wiki}" AS wiki,
           user_id, user_name, user_registration,
           IF(CONVERT (user_name USING utf8) REGEXP "{bot_regex}", 1, 0) AS bot_by_name,
           IF(ug_user IS NOT NULL, 1, 0) AS bot_by_group
    FROM user
    JOIN actor
    ON user_id = actor_user
    JOIN logging
    ON log_actor = actor_id
    LEFT JOIN (
        SELECT ug_user
        FROM user_groups
        WHERE ug_group = "bot"
    ) AS ug
    ON user_id = ug_user
    WHERE user_registration >= "{start_ts}"
    AND user_registration < "{end_ts}"
    AND log_type = "newusers"
    AND log_action = "create" -- only self-creations
    '''

    def fetch_registrations(wiki_name):
        ## Fill in the query for this wiki and run it against that wiki's database
        return(mariadb.run(
            registration_sql.format(
                wiki = wiki_name,
                bot_regex = bot_name_regex,
                start_ts = start_timestamp.strftime(utils.mw_format),
                end_ts = end_timestamp.strftime(utils.mw_format)
            ), wiki_name
        ))

    return(pd.concat([fetch_registrations(wiki_name) for wiki_name in wikis]))

In [13]:
mw_regs = get_mw_regs(user_registrations['wiki'].unique(), exp_start_ts, exp_end_ts)

In [14]:
mw_regs.groupby('wiki').agg({'user_id' : 'size'})

Unnamed: 0_level_0,user_id
wiki,Unnamed: 1_level_1
arwiki,8666
cswiki,1572
fawiki,7279
frwiki,15042
hewiki,1728
huwiki,970
hywiki,269
kowiki,1710
plwiki,2751
ptwiki,11122


In [15]:
## What's the number of registrations?

user_registrations.groupby('wiki').agg({'user_id' : 'size'})

Unnamed: 0_level_0,user_id
wiki,Unnamed: 1_level_1
arwiki,7929
cswiki,1466
fawiki,6893
frwiki,14021
hewiki,1629
huwiki,930
hywiki,260
kowiki,1524
plwiki,2593
ptwiki,10412


Given that we can't filter out app registrations, it's difficult to compare these numbers and be sure that they're correct. However, they're not too far off. Can we find all the SSAC users in the MediaWiki database?

In [16]:
all_users = user_registrations.merge(mw_regs, on = ['wiki', 'user_id'])

In [17]:
all_users.groupby('wiki').agg({'user_id' : 'size'})

Unnamed: 0_level_0,user_id
wiki,Unnamed: 1_level_1
arwiki,7929
cswiki,1466
fawiki,6893
frwiki,14021
hewiki,1629
huwiki,930
hywiki,260
kowiki,1524
plwiki,2593
ptwiki,10412


Yeah, looks like they all exist, so let's go with that.

In [18]:
## Dropping the user name column, it's no longer needed.
all_users.drop('user_name', axis = 'columns', inplace = True)

In [19]:
## Removing all bots by name or group membership
all_users = all_users.loc[(all_users['bot_by_name'] == 0) & (all_users['bot_by_group'] == 0)]

In [20]:
all_users.groupby('wiki').agg({'user_id' : 'size'})

Unnamed: 0_level_0,user_id
wiki,Unnamed: 1_level_1
arwiki,7925
cswiki,1465
fawiki,6890
frwiki,14011
hewiki,1627
huwiki,930
hywiki,260
kowiki,1522
plwiki,2591
ptwiki,10401


## Get treatment/control assignments

This involves two operations.

1. Get all users who have the Homepage turned on in their preferences.
2. Get the variant setting for all users.

Users who don't have the Homepage turned on are candidates for the control group, and users who do have it turned on are candidates for the experiment group. Secondly, users are randomly assigned to variants, which we'll use later in the analysis.

In [30]:
def get_prop_settings(wiki, prop, col_name, cast_type, users=None):
    '''
    Query the `user_properties` table of the given wiki for all users who have the given
    property set in their preferences, and return the result of `mariadb.run()`
    (a `pandas.DataFrame`) with columns `wiki`, `user_id`, and `col_name`, the latter
    holding the property value cast to `cast_type`.
    
    :param wiki: database code of the wiki we're querying
    :type wiki: str
    
    :param prop: the user preference we're querying for
    :type prop: str
    
    :param col_name: name that the column with preference value should have in the
                     resulting DataFrame (e.g. "is_treatment")
    :type col_name: str
    
    :param cast_type: SQL type to cast the property to (in the database this is a BLOB)
    :type cast_type: str
    
    :param users: user IDs of the users we are interested in. This is optional;
                  `None` means no restriction on users, while an empty collection
                  means no users and therefore matches no rows.
    :type users: list

    :returns: one row per user with the property set (restricted to `users` if given)
    '''
    
    prop_query = '''
    SELECT "{wiki}" AS wiki, up_user AS user_id,
           CAST(up_value AS {cast_type}) AS {col_name}
    FROM user_properties
    WHERE up_property = "{prop}"
    '''.format(wiki = wiki, prop = prop, cast_type = cast_type, col_name = col_name)
    
    if users is not None:
        ## Materialise the IDs first so that emptiness can be checked for any iterable
        user_ids = [str(uid) for uid in users]

        if user_ids:
            prop_query += '''
        AND up_user IN ({})
        '''.format(','.join(user_ids))
        else:
            ## `IN ()` with an empty list is a SQL syntax error. Still run the query,
            ## with a condition that matches nothing, so the result has the usual columns.
            prop_query += '''
        AND 1 = 0
        '''
        
    return(mariadb.run(prop_query, wiki))

In [31]:
## Get treatment/control assignments from the MW databases

hp_prefs = pd.concat(
    [get_prop_settings(wiki,
                       'growthexperiments-homepage-enable',
                       'hp_enabled',
                       'UNSIGNED INTEGER') for wiki in wikis]
)

In [24]:
## Left join, so users without a stored `growthexperiments-homepage-enable` preference
## are kept; their missing `hp_enabled` value is then set to 0 by `fillna(0)`.
## NOTE(review): `fillna(0)` is applied to the whole merged frame, so a missing value
## in any other column (e.g. `reg_on_mobile`) also becomes 0 -- confirm that is intended.
all_users = all_users.merge(hp_prefs, on = ['wiki', 'user_id'], how = 'left').fillna(0)

In [None]:
all_users.groupby(['wiki', 'hp_enabled']).agg({'user_id': 'count'})

I used the above aggregation to check that the Control/Homepage split is 20/80. This is just routine, the random assignment hasn't failed in any of our experiments so far.

In [38]:
## Get variant settings from the MW database

variant_prefs = pd.concat(
    [get_prop_settings(wiki,
                       'growthexperiments-homepage-variant',
                       'hp_variant',
                       'CHAR CHARACTER SET utf8') for wiki in wikis]
)

In [39]:
## Drop any `hp_variant` column left over from an earlier run of the merge with
## `variant_prefs`, so that re-running that merge doesn't produce suffixed duplicate columns.
## On a fresh top-to-bottom run the column doesn't exist yet, hence `errors = 'ignore'`
## (without it this cell raises a KeyError).
all_users.drop('hp_variant', axis = 'columns', inplace = True, errors = 'ignore')

In [40]:
## Left join the variant setting onto the user dataset, keeping users without one.
## NOTE(review): `fillna(0)` gives those users the integer 0 in `hp_variant`, a column
## that otherwise holds the variant name as a string; it also applies to all other columns.
all_users = all_users.merge(variant_prefs, on = ['wiki', 'user_id'], how = 'left').fillna(0)

In [41]:
all_users.groupby(['hp_enabled', 'hp_variant']).agg({'user_id': 'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
hp_enabled,hp_variant,Unnamed: 2_level_1
0.0,0,13587
0.0,C,19
0.0,D,23
1.0,0,45
1.0,C,27203
1.0,D,27054


## Users who turned the Homepage on/off in their preferences

Lastly, we identify all users who turned the Homepage on or off in their preferences, as that means they self-selected into or out of our group assignments. These users can therefore not be part of the analysis.

In [42]:
## Second, identify all users who either turned the Homepage on themselves, or at some point
## turned the preference off.

switch_query = '''
SELECT wiki, event.userid AS user_id, event.value
FROM event.prefupdate
WHERE year = 2020 AND month IN (10, 11)
AND wiki IN ({wiki_list})
AND event.property = "{prop}"
'''

In [43]:
switched_users = spark.run(
    switch_query.format(
        wiki_list = ','.join(['"{}"'.format(w) for w in wikis]),
        prop = 'growthexperiments-homepage-enable'
    )
)

How many users switched?

In [44]:
len(switched_users)

31250

That number might be greatly affected by [T260867](https://phabricator.wikimedia.org/T260867), which was deployed some time in October. It might therefore be more meaningful to look at how many switched out of the users in the experiment.

In [None]:
switched_users.groupby(['wiki', 'value']).agg({'user_id' : 'size'})

Ok, so largely users are turning the Homepage *on*, not off. Given the number of users in our dataset, I don't think we're looking at a significant proportion turning it off.

In [46]:
## Left-join with switched users
## NOTE(review): `switched_users` has one row per logged preference change and is not
## deduplicated, so a user with several changes ends up with several rows in `all_users`.
## The recorded outputs are consistent with that: the group counts shown before this merge
## add up to 67,931 rows, while `len(all_users)` after it is 67,984. The later filter on
## `value` being NA removes every row of such users, so this only affects the counts and
## percentages reported between this merge and that filter.

all_users = all_users.merge(switched_users,
                            on = ['wiki', 'user_id'], how = 'left')

Now, for each wiki and registration method, aggregate how many users turned it on or off, and how many didn't change it.

In [None]:
all_users.groupby(['wiki', 'reg_on_mobile', 'value']).agg({'user_id' : 'size'})

In [50]:
len(all_users)

67984

In [51]:
len(all_users.loc[~all_users['value'].isna()])

385

In [52]:
round(100 * len(all_users.loc[~all_users['value'].isna()]) / len(all_users), 1)

0.6

So, at the time of writing (Dec 7, 2020), we've had 385 out of 67,984 users in the experiment change their user preference setting, most of them to turn the Homepage *on*. This is in line with previous experiments: in both the first Homepage experiment and NEWTEA it was 0.4%, and mainly users who turned it on. In other words, everything appears to be normal.

In [53]:
all_users = all_users.loc[all_users['value'].isna()].copy()

In [54]:
## Drop the 'value' column, it's no longer needed
all_users.drop('value', axis = 'columns', inplace = True)

In [55]:
## Drop the 'reg_timestamp' column, it's also no longer needed
all_users.drop('reg_timestamp', axis = 'columns', inplace = True)

In [None]:
all_users.head()

In [57]:
## Turn hp_enabled into an integer

all_users['hp_enabled'] = all_users['hp_enabled'].astype(int)

In [None]:
all_users.head()

Rename the `wiki` column to `wiki_db`, since that's what's used in `mediawiki_history`.

In [62]:
all_users.rename(columns = {'wiki' : 'wiki_db'}, inplace = True)

Export the dataset to two TSV files: one with a header line for reading into R, and one without a header line for importing into the Data Lake.

In [63]:
## Export users and usage data to TSVs for reading into R for analysis

all_users.to_csv(tsv_canonical_user_file, sep = '\t', header = True, index = False)

In [64]:
## Write the dataset out into a TSV so it can be imported into the Data Lake.
## NOTE: this file cannot contain a header line.

all_users.to_csv(tsv_datalake_user_file, sep = '\t',
                 columns = ['wiki_db', 'user_id', 'user_registration',
                            'reg_on_mobile', 'hp_enabled', 'hp_variant'],
                 header = False, index = False)

In [65]:
## Query to create the canonical user dataset table in Hive

create_table_query = '''
CREATE TABLE {table_name}
(wiki_db STRING COMMENT "wiki this user registered on",
 user_id BIGINT COMMENT "user_id of this user on the given wiki",
 user_registration STRING COMMENT "UTC timestamp of user registration",
 reg_on_mobile INT COMMENT "whether the user registered on the mobile site",
 hp_enabled INT COMMENT "whether the user has the Homepage enabled or not",
 hp_variant STRING COMMENT "which variant the user was in"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
'''

In [66]:
hive.run(create_table_query.format(
    table_name = canonical_user_table
))

Importing the data was done manually through the command line in order to have a meaningful reference to the actual local file.

In [67]:
import_data_query = '''
LOAD DATA LOCAL INPATH "{local_path}"
OVERWRITE INTO TABLE {table_name}
'''

In [70]:
print(import_data_query.format(
    local_path = tsv_datalake_user_file,
    table_name = canonical_user_table
))


LOAD DATA LOCAL INPATH "/home/nettrom/src/Growth-homepage-2019/datasets/variant-test-2-datalake-users.tsv"
OVERWRITE INTO TABLE nettrom_growth.hp_variant_test2

