# Welcome Survey Activation Rate

Investigation of first day activation rate for the A/B test of the Welcome Survey.

In [1]:
import datetime as dt
import pandas as pd
import wmfdata as wmf

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
%load_ext rpy2.ipython

In [3]:
%%R
library(data.table)




In [4]:
## Czech Wikipedia
## User IDs of known users to exclude (Stephane, Elena, and Marshall's accounts)
cs_known_users = set([322106, 339583, 341191, 341611, 433381, 433382, 433511, 404765, 421667,
                      427625, 437386])

cs_start_timestamp = '20181119195401'

## We set the end timestamp to be the end of the first Monday in January, which is the most
## recent whole week period since the survey was deployed.
cs_end_timestamp = dt.datetime(2019, 1, 1, 0, 0, 0).strftime('%Y%m%d%H%M%S')

## Korean Wikipedia
## User IDs of known users to exclude
ko_known_users = set([384066, 539296, 539299, 539302, 539303, 539304, 539305, 539306, 539307,
                      539298, 416361, 416360, 413162, 495265, 518393, 518394, 518396, 530285,
                      531579, 531785, 536786, 536787, 542720, 542721, 542722, 543192, 543193,
                      544145, 544283])

## Timestamp of the user registration of the first user who reasonably saw the survey
ko_start_timestamp = '20181119201024'
ko_end_timestamp = dt.datetime(2019, 1, 1, 0, 0, 0).strftime('%Y%m%d%H%M%S')

In [5]:
## Grab the user IDs of all of Elena's and Marshall's accounts so they can be added to the
## exclusion lists.

known_user_query = '''
SELECT user_id
FROM {wiki}.user
WHERE user_registration >= "{start_timestamp}"
AND user_name LIKE "{name_pattern}%"
'''

cs_known = wmf.mariadb.run(known_user_query.format(
  wiki = 'cswiki',
  start_timestamp = cs_start_timestamp,
  name_pattern = "MMiller"),
                          fmt = 'raw')

## Union the two sets
cs_known_users = cs_known_users | set([r[0] for r in cs_known])

cs_known = wmf.mariadb.run(known_user_query.format(
  wiki = 'cswiki',
  start_timestamp = cs_start_timestamp,
  name_pattern = "Zilant"),
                          fmt = 'raw')

cs_known_users = cs_known_users | set([r[0] for r in cs_known])

ko_known = wmf.mariadb.run(known_user_query.format(
  wiki = 'kowiki',
  start_timestamp = cs_start_timestamp,
  name_pattern = "MMiller"),
                          fmt = 'raw')

## Union the two sets
ko_known_users = ko_known_users | set([r[0] for r in ko_known])

ko_known = wmf.mariadb.run(known_user_query.format(
  wiki = 'kowiki',
  start_timestamp = cs_start_timestamp,
  name_pattern = "Zilant"),
                          fmt = 'raw')

ko_known_users = ko_known_users | set([r[0] for r in ko_known])

In [7]:
## Step 1: get a list of accounts with information about their creation from the
## ServerSideAccountCreation schema, filtering out API-created accounts.

## Name of the schema we're querying
table_name = 'log.ServerSideAccountCreation_17719237'

## Query to get accounts created through the API with information about mobile/desktop
mob_query = '''SELECT event_userid, event_displayMobile
FROM {ssac_table}
WHERE timestamp >= "{start_timestamp}"
AND timestamp < "{end_timestamp}"
AND event_isApi = 0
AND event_isSelfMade = 1
AND wiki = "{wiki}"
'''

cs_mob_users = wmf.mariadb.run(mob_query.format(ssac_table = table_name,
                                                    start_timestamp = cs_start_timestamp,
                                                    end_timestamp = cs_end_timestamp,
                                                    wiki = 'cswiki'),
                              host = "logs")
ko_mob_users = wmf.mariadb.run(mob_query.format(ssac_table = table_name,
                                                    start_timestamp = ko_start_timestamp,
                                                    end_timestamp = ko_end_timestamp,
                                                    wiki = 'kowiki'),
                              host = "logs")

In [8]:
## Step 2: Get data on the group assignment for these users.

group_query = '''
SELECT up_user,
  CASE
    WHEN up_value LIKE "%exp1_group2%" THEN "control"
    WHEN up_value LIKE "%exp1_group1%" THEN "target"
  END AS exp_group
FROM {wiki}.user
JOIN {wiki}.user_properties
ON user_id = up_user
JOIN {wiki}.logging
ON up_user = log_user
WHERE user_registration >= "{start_timestamp}"
AND user_registration < "{end_timestamp}"
AND up_property = "welcomesurvey-responses"
AND up_user NOT IN ({id_list})
AND up_value <> ""
AND log_type = "newusers"
AND log_action != "autocreate"
'''

cs_groups_raw = wmf.mariadb.run(group_query.format(
    wiki = 'cswiki',
    start_timestamp = cs_start_timestamp,
    end_timestamp = cs_end_timestamp,
    id_list = ",".join([str(id) for id in cs_known_users])))

ko_groups_raw = wmf.mariadb.run(group_query.format(
    wiki = 'kowiki',
    start_timestamp = ko_start_timestamp,
    end_timestamp = ko_end_timestamp,
    id_list = ",".join([str(id) for id in ko_known_users])))

In [9]:
## Now join cs_mob_users and cs_groups on user_id and up_user
cs_groups = pd.merge(cs_mob_users, cs_groups_raw, left_on='event_userid', right_on='up_user')
ko_groups = pd.merge(ko_mob_users, ko_groups_raw, left_on='event_userid', right_on='up_user')

In [29]:
## Step 3: Count the number of edits within the first 24 hours.

## FIXME: also check the archive table!

editcount_query = '''
SELECT user_id, SUM(num_edits) AS num_edits
FROM ((SELECT user_id, COUNT(rev_id) AS num_edits
  FROM {wiki}.user
  JOIN {wiki}.revision
  ON user_id=rev_user
  WHERE rev_timestamp > user_registration
  AND rev_timestamp < DATE_FORMAT(
    DATE_ADD(
        STR_TO_DATE(user_registration,
                    "%Y%m%d%H%i%S"),
        INTERVAL 1 DAY),
    "%Y%m%d%H%i%S")
  AND user_id IN ({id_list})
  GROUP BY user_id)
UNION ALL
(SELECT user_id, COUNT(ar_id) AS num_edits
  FROM {wiki}.user
  JOIN {wiki}.archive
  ON user_id=ar_user
  WHERE ar_timestamp > user_registration
  AND ar_timestamp < DATE_FORMAT(
    DATE_ADD(
        STR_TO_DATE(user_registration,
                    "%Y%m%d%H%i%S"),
        INTERVAL 1 DAY),
    "%Y%m%d%H%i%S")
  AND user_id IN ({id_list})
  GROUP BY user_id)) AS users_edits
GROUP BY user_id
'''

cs_counts_raw = wmf.mariadb.run(editcount_query.format(
    wiki = 'cswiki',
    id_list = ",".join([str(id) for id in cs_groups['event_userid']])))

ko_counts_raw = wmf.mariadb.run(editcount_query.format(
    wiki = 'kowiki',
    id_list = ",".join([str(id) for id in ko_groups['event_userid']])))


In [30]:
## Left join the group assignment frame with the counts frame

cs_counts = pd.merge(cs_groups, cs_counts_raw, left_on='event_userid', right_on='user_id',
                    how='left').fillna(0)
ko_counts = pd.merge(ko_groups, ko_counts_raw, left_on='event_userid', right_on='user_id',
                    how='left').fillna(0)


In the tables below: exp_group "True" means the user was in the survey group, and num_edits "True" means the user made at least one edit within 24 hours after registration.

In [31]:
pd.crosstab(cs_counts.exp_group == 'target', cs_counts.num_edits > 0,
            margins=True, margins_name="Total")

num_edits,False,True,Total
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,526,406,932
True,508,418,926
Total,1034,824,1858


In [32]:
pd.crosstab(cs_counts.exp_group == 'target', cs_counts.num_edits > 0,
            margins=True, margins_name="Total", normalize = 'index').round(3)*100

num_edits,False,True
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1
False,56.4,43.6
True,54.9,45.1
Total,55.7,44.3


In [33]:
pd.crosstab(ko_counts.exp_group == 'target', ko_counts.num_edits > 0,
           margins=True, margins_name="Total")

num_edits,False,True,Total
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,1556,519,2075
True,1611,473,2084
Total,3167,992,4159


In [34]:
pd.crosstab(ko_counts.exp_group == 'target', ko_counts.num_edits > 0,
           margins=True, margins_name="Total", normalize = 'index').round(3)*100

num_edits,False,True
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1
False,75.0,25.0
True,77.3,22.7
Total,76.1,23.9


In [35]:
%%R -i cs_counts

cs_counts = data.table(cs_counts)

n_edited_control = length(cs_counts[exp_group == 'control' & num_edits > 0]$user_id);
n_edited_survey = length(cs_counts[exp_group == 'target' & num_edits > 0]$user_id);

n_users_control = length(cs_counts[exp_group == 'control']$user_id);
n_users_survey = length(cs_counts[exp_group == 'target']$user_id);

prop.test(
    c(n_edited_control, n_edited_survey),
    c(n_users_control, n_users_survey)  
);

  res = PandasDataFrame.from_items(items)



	2-sample test for equality of proportions with continuity correction

data:  c(n_edited_control, n_edited_survey) out of c(n_users_control, n_users_survey)
X-squared = 0.40697, df = 1, p-value = 0.5235
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.06203174  0.03046860
sample estimates:
   prop 1    prop 2 
0.4356223 0.4514039 



In [36]:
%%R -i ko_counts

ko_counts = data.table(ko_counts)

n_edited_control = length(ko_counts[exp_group == 'control' & num_edits > 0]$user_id);
n_edited_survey = length(ko_counts[exp_group == 'target' & num_edits > 0]$user_id);

n_users_control = length(ko_counts[exp_group == 'control']$user_id);
n_users_survey = length(ko_counts[exp_group == 'target']$user_id);

prop.test(
    c(n_edited_control, n_edited_survey),
    c(n_users_control, n_users_survey)  
);


	2-sample test for equality of proportions with continuity correction

data:  c(n_edited_control, n_edited_survey) out of c(n_users_control, n_users_survey)
X-squared = 2.9426, df = 1, p-value = 0.08627
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.003224608  0.049530831
sample estimates:
   prop 1    prop 2 
0.2501205 0.2269674 



# Conclusion

For both Wikipedias we find a small but not statistically significant difference in activation rate between users who saw the survey and those who did not.

The number of registrations on Korean Wikipedia during this six week period is much higher than expected. Based on our analysis of historical data, we would expect it to be lower by about 1,000 users. We also find that the activation rate (average of 23.2%) is much lower than our historical analysis suggested. It seems that Korean Wikipedia has attracted a lot of registrations, but ones that do not make any edits.

# Side quest

In [18]:
## Side quest: Grab the number of edits since registration for all users.

totaledits_query = '''
SELECT user_id, count(rev_id) AS num_edits
FROM {wiki}.user
JOIN {wiki}.revision
ON user_id=rev_user
WHERE user_id IN ({id_list})
GROUP BY user_id
'''

cs_totalcounts_raw = wmf.mariadb.run(totaledits_query.format(
    wiki = 'cswiki',
    id_list = ",".join([str(id) for id in cs_groups['event_userid']])))

ko_totalcounts_raw = wmf.mariadb.run(editcount_query.format(
    wiki = 'kowiki',
    id_list = ",".join([str(id) for id in ko_groups['event_userid']])))

In [19]:
## Left join the group assignment frame with the counts frame

cs_totalcounts = pd.merge(cs_groups, cs_totalcounts_raw, left_on='event_userid', right_on='user_id',
                    how='left').fillna(0)
ko_totalcounts = pd.merge(ko_groups, ko_totalcounts_raw, left_on='event_userid', right_on='user_id',
                    how='left').fillna(0)

In [20]:
cs_totalcounts.sort_values('num_edits').groupby('exp_group')['num_edits'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
control,932.0,1.344421,3.410737,0.0,0.0,0.0,1.0,34.0
target,926.0,1.37041,4.436939,0.0,0.0,0.0,1.0,97.0


In [21]:
ko_totalcounts.sort_values('num_edits').groupby('exp_group')['num_edits'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
control,2075.0,0.95759,3.562384,0.0,0.0,0.0,0.0,57.0
target,2084.0,0.930422,4.048607,0.0,0.0,0.0,0.0,88.0


In [22]:
cs_totalcounts.groupby('exp_group')['num_edits'].sum()

exp_group
control    1253.0
target     1269.0
Name: num_edits, dtype: float64

In [23]:
ko_totalcounts.groupby('exp_group')['num_edits'].sum()

exp_group
control    1987.0
target     1939.0
Name: num_edits, dtype: float64