In [22]:
## Load Python libraries

import pymysql

import datetime as dt
import pandas as pd

import phpserialize as ps

from wmfdata import hive, mariadb

In [23]:
## Load the rpy2 IPython extension so we can use R (via %%R cells) for graphs

%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [24]:
%%R
library(ggplot2);
library(data.table);
library(zoo);
library(tidyr);
library(RColorBrewer);

In [66]:
## Notebook-wide configuration.

## Time window for the log queries. Bounding the range keeps the database
## queries fast.
start_time = dt.datetime(2019, 1, 1)
end_time = dt.datetime(2019, 3, 22)

## Database names of the wikis that we are interested in studying
wikis = ['itwiki', 'arwiki', 'fawiki', 'metawiki', 'mediawikiwiki']

## Lookup tables keyed by wiki DB name, filled in by later cells:
## host/port information ...
dbhost_map = {}
## ... and the open database connection
dbconn_map = {}

## strftime/strptime format strings:
## timestamps as stored in the MediaWiki database (e.g. 20190101000000)
mw_format = "%Y%m%d%H%M%S"
## ISO-like timestamps with a "T" separator (e.g. 2019-01-01T00:00:00)
hive_format = "%Y-%m-%dT%H:%M:%S"


In [67]:
# The second function needs dnspython to work
import dns.resolver
import glob

def get_mediawiki_section_dbname_mapping(mw_config_path, use_x1):
    '''
    Build a mapping from wiki database name to the name of the dblist file
    (without its `.dblist` extension) that lists it.

    :param mw_config_path: path to a checkout of the MediaWiki configuration;
                           the dblist files are read from its `dblists/`
                           subdirectory. A trailing slash is ignored.
    :param use_x1: if true, read only `all.dblist` (every database then maps
                   to `all`); otherwise read every file matching
                   `s[0-9]*.dblist`.
    :returns: dict mapping database name to section name (e.g. `s2`).
    :raises OSError: if a dblist file cannot be opened.
    '''
    dblist_dir = mw_config_path.rstrip('/') + '/dblists'
    if use_x1:
        dblist_section_paths = [dblist_dir + '/all.dblist']
    else:
        # Sorted so that the result is deterministic should a database be
        # listed in more than one section file (the last file wins).
        dblist_section_paths = sorted(glob.glob(dblist_dir + '/s[0-9]*.dblist'))

    extension = '.dblist'
    db_mapping = {}
    for dblist_section_path in dblist_section_paths:
        # Remove the extension as a suffix. str.rstrip('.dblist') strips any
        # run of the characters ".dblist" instead, which turns "all.dblist"
        # into "a".
        section = dblist_section_path.strip().split('/')[-1]
        if section.endswith(extension):
            section = section[:-len(extension)]

        with open(dblist_section_path, 'r') as f:
            for line in f:
                db = line.strip()
                # Skip blank lines and "#" lines; the latter are presumably
                # comments in dblist files and never database names.
                if not db or db.startswith('#'):
                    continue
                db_mapping[db] = section

    return db_mapping


def get_dbstore_host_port(db_mapping, use_x1, dbname):
    '''
    Look up the host and port serving the given database by resolving the
    DNS SRV record `_<section>-analytics._tcp.eqiad.wmnet`.

    The section is `staging` for the `staging` database, `x1` when `use_x1`
    is true, and otherwise whatever `db_mapping` lists for `dbname`.

    :param db_mapping: dict mapping database name to section name
    :param use_x1: whether to use the `x1` section instead of the mapping
    :param dbname: name of the database to locate
    :returns: `(host, port)` taken from the first SRV answer
    :raises RuntimeError: if the section has to come from `db_mapping` and
                          `dbname` is not in it
    '''
    if dbname == 'staging':
        section = 'staging'
    elif use_x1:
        section = 'x1'
    else:
        try:
            section = db_mapping[dbname]
        except KeyError:
            message = ("The database {} is not listed among the dblist files of the supported sections."
                       .format(dbname))
            raise RuntimeError(message)

    srv_name = '_' + section + '-analytics._tcp.eqiad.wmnet'
    srv_answers = dns.resolver.query(srv_name, 'SRV')

    # Only the first answer is used
    first_answer = srv_answers[0]
    return (str(first_answer.target), first_answer.port)

## Mapping from wiki DB name to section name, read from the sN dblist files
wikidb_map = get_mediawiki_section_dbname_mapping(
    mw_config_path = '/srv/mediawiki-config',
    use_x1 = False)

In [68]:
## Look up the (host, port) pair for each wiki, using the section mapping
## rather than x1
for wiki in wikis:
    dbhost_map[wiki] = get_dbstore_host_port(
        db_mapping = wikidb_map,
        use_x1 = False,
        dbname = wiki)


In [80]:
## Open one database connection per wiki, using the host/port looked up above.
## Credentials are read from the research client configuration file.
for wiki in wikis:
    db_host, db_port = dbhost_map[wiki]
    dbconn_map[wiki] = pymysql.connect(
        host = db_host,
        port = db_port,
        database = wiki,
        read_default_file = '/etc/mysql/conf.d/research-client.cnf',
        charset = 'utf8'
    )

In [11]:
## Code from wmfdata to decode UTF-8 bytestrings returned from the database into strings

def try_decode(cell):
    '''
    Return `cell` decoded as UTF-8 if it supports `.decode()`; any value
    without a `decode` attribute is returned unchanged.
    '''
    try:
        decoded = cell.decode(encoding = "utf-8")
    except AttributeError:
        # Not a bytestring, so there is nothing to decode
        decoded = cell
    return decoded


In [49]:
def extract_params(row):
    '''
    Extract relevant block parameters from the given row of partial block log
    data, and return a new `pandas.Series` that can be used to update a data frame
    with columns for those parameters.

    `row['log_params']` must be a string holding a PHP-serialized array. The
    returned Series has three positional values:

    0. the value of `5::duration`, or None if absent
    1. the value of `6::flags`, or None if absent
    2. the number of pages the block is restricted to, or None if
       `7::restrictions` is absent
    '''
    params = ps.loads(row['log_params'].encode('utf-8'), decode_strings=True)

    duration = params.get('5::duration')
    flags = params.get('6::flags')

    restrictions = params.get('7::restrictions')
    if restrictions is None:
        num_pages = None
    elif (isinstance(restrictions, dict)
          and any(isinstance(key, str) for key in restrictions)):
        ## String keys mean the restrictions are grouped by type (e.g. 'pages',
        ## 'namespaces'). Taking len() of that mapping would count the types,
        ## not the pages, so count the entries under 'pages' instead.
        num_pages = len(restrictions.get('pages') or ())
    else:
        ## A flat list of restrictions (integer keys once unserialized)
        num_pages = len(restrictions)

    return(pd.Series([duration, flags, num_pages]))

In [84]:
## SQL query to get data on partial blocks, adapted from
## https://github.com/dayllanmaza/wikireplicas-reports/blob/master/generators/partial_blocks.py

def get_partial_blocks(wikis, dbconns, start_timestamp, end_timestamp):
    '''
    Query the block log of each wiki for partial (non-sitewide) blocks created
    in the given time window, and return them all in one `pandas.DataFrame`.

    :param wikis: iterable of wiki database names to query
    :param dbconns: dict mapping wiki database name to an open database
                    connection for that wiki
    :param start_timestamp: `datetime`, inclusive start of the window
    :param end_timestamp: `datetime`, exclusive end of the window
    :returns: the per-wiki results concatenated, with the columns of the
              query plus `log_date`, `block_duration`, `block_flags` and
              `block_num_pages`. The per-wiki row indexes are kept, so index
              values repeat across wikis.
    '''
    ## NOTE: the wiki name and timestamps are interpolated straight into the
    ## SQL, so they must come from trusted configuration, never user input.
    pb_query = '''
    SELECT DATABASE() AS wiki,
           log_timestamp,
           log_params,
           log_user_text AS blocker,
           log_title AS blockee,
           comment_text AS reason
    FROM {wiki}.logging
    LEFT JOIN {wiki}.comment
    ON log_comment_id=comment_id
    WHERE log_timestamp >= "{start_timestamp}"
    AND log_timestamp < "{end_timestamp}"
    AND log_type = "block"
    AND log_action = "block" -- only interested in initial blocks created
    AND log_params LIKE '%"sitewide";b:0;%'
    '''

    ## Columns filled from the unserialized log parameters
    param_columns = ['block_duration', 'block_flags', 'block_num_pages']

    pbs = []
    for wiki in wikis:
        df = pd.read_sql_query(
            pb_query.format(
                wiki = wiki,
                start_timestamp = start_timestamp.strftime(mw_format),
                end_timestamp = end_timestamp.strftime(mw_format)),
            dbconns[wiki])
        ## The database may return bytestrings for both values and column names
        df = df.applymap(try_decode).rename(columns = try_decode)

        ## Turn the timestamps into datetime objects, and add a log_date string for convenience
        df['log_timestamp'] = pd.to_datetime(df['log_timestamp'], format=mw_format, utc=True)
        df['log_date'] = df['log_timestamp'].apply(lambda x: str(x.date()))

        if df.empty:
            ## A row-wise apply on an empty frame does not produce the three
            ## parameter columns, so the assignment below would fail for a wiki
            ## with no partial blocks in the window. Add empty columns instead.
            for column in param_columns:
                df[column] = None
        else:
            df[param_columns] = df.apply(extract_params, axis=1)

        pbs.append(df)

    return(pd.concat(pbs))

## Partial blocks created on the configured wikis within the configured time window
partial_blocks = get_partial_blocks(
    wikis = wikis,
    dbconns = dbconn_map,
    start_timestamp = start_time,
    end_timestamp = end_time)


In [85]:
partial_blocks.head()

Unnamed: 0,wiki,log_timestamp,log_params,blocker,blockee,reason,log_date,block_duration,block_flags,block_num_pages
0,itwiki,2019-01-16 12:15:11+00:00,"a:4:{s:11:""5::duration"";s:8:""infinite"";s:8:""6:...",Daimona Eaytoy,Eaytoy_Daimona,Test blocchi parziali,2019-01-16,infinite,nocreate,1.0
1,itwiki,2019-01-16 12:36:56+00:00,"a:4:{s:11:""5::duration"";s:8:""infinite"";s:8:""6:...",Daimona Eaytoy,Eaytoy_Daimona,,2019-01-16,infinite,nocreate,1.0
2,itwiki,2019-01-16 12:52:53+00:00,"a:4:{s:11:""5::duration"";s:7:""2 hours"";s:8:""6::...",Ruthven,151.20.139.29,[[WP:Vandalismo|Vandalismi]],2019-01-16,2 hours,"anononly,nocreate",1.0
3,itwiki,2019-01-16 13:01:55+00:00,"a:4:{s:11:""5::duration"";s:7:""8 hours"";s:8:""6::...",Ruthven,93.55.168.167,[[WP:Vandalismo|Vandalismi]],2019-01-16,8 hours,"anononly,nocreate",1.0
4,itwiki,2019-01-16 13:26:10+00:00,"a:3:{s:11:""5::duration"";s:9:""5 minutes"";s:8:""6...",Buggia,Wolframio,,2019-01-16,5 minutes,,


In [51]:
%%R -i it_pbs

## Count blocks per day for the data frame `it_pbs`, which is imported from
## the Python namespace.
## NOTE(review): no cell shown in this notebook defines `it_pbs`; presumably it
## is the itwiki subset of `partial_blocks` left over from an earlier kernel
## session -- confirm, and define it explicitly so Restart & Run All works.

it_pbs = data.table(it_pbs);
it_pbs[, log_date := as.Date(log_date)];

## One row per calendar day between the first and the last block
dates = seq.Date(min(it_pbs$log_date), max(it_pbs$log_date), by='day');
dates = data.table(log_date = dates);

## Count blocks per day, then join onto the full date sequence so that days
## without blocks show up (as NA, which is replaced by 0)
it_pbs_per_day = it_pbs[, list(num_blocks=sum(.N)), by=log_date];
it_pbs_per_day = it_pbs_per_day[dates, on = 'log_date'];
it_pbs_per_day[is.na(num_blocks), num_blocks := 0];
head(it_pbs_per_day);

     log_date num_blocks
1: 2019-01-16          7
2: 2019-01-17          1
3: 2019-01-18          0
4: 2019-01-19          1
5: 2019-01-20          0
6: 2019-01-21          2


In [91]:
%%R -i partial_blocks

## Work on a data.table copy of the imported data frame, with log_date as a Date
partial_blocks <- data.table(partial_blocks)
partial_blocks[, log_date := as.Date(log_date)]

## Where the graphs are written: <graph_dir><prefix><wiki><suffix>
graph_dir <- 'graphs/'
pb_graph_prefix <- 'partial_blocks_per_day_'
pb_graph_suffix <- '.png'

make_pb_graphs = function(pbs, graph_dir, prefix, suffix) {
    ## For every wiki in `pbs`, plot the number of partial blocks created per
    ## day together with 1-week and 2-week moving averages, and save the plot
    ## to <graph_dir><prefix><wiki><suffix>.
    ##
    ## pbs:       data.table with (at least) the columns `wiki` and `log_date`
    ##            (a Date), one row per block
    ## graph_dir: output directory, including the trailing slash
    ## prefix:    file name prefix, placed before the wiki name
    ## suffix:    file name suffix (extension), placed after the wiki name

    ## Make sure the output directory exists, otherwise ggsave() fails.
    ## Nothing happens (and no warning is shown) if it is already there.
    dir.create(graph_dir, showWarnings = FALSE, recursive = TRUE);

    ## Choose blues with some contrast, with the raw data getting the strongest color
    b_palette = brewer.pal('Blues', n = 7)[c(3,5,7)];

    wikis = unique(pbs$wiki);

    for(w in wikis) {
        ## Grab the subset for this wiki, as we have different date ranges for each
        wiki_blocks = pbs[wiki == w];

        ## Make a date sequence from the first to the last date, and left join against
        ## the data to fill in any dates with 0 blocks.

        dates = seq.Date(min(wiki_blocks$log_date), max(wiki_blocks$log_date), by='day');
        dates = data.table(log_date = dates);

        blocks_per_day = wiki_blocks[, list(num_blocks=.N), by=log_date];
        blocks_per_day = blocks_per_day[dates, on = 'log_date'];
        blocks_per_day[is.na(num_blocks), num_blocks := 0];

        ## Add the moving averages. The windows are right-aligned, and days
        ## without a complete window (the first width - 1 days) get the fill
        ## value 0 instead of an average.
        blocks_per_day[
            , num_blocks_1wma := rollapply(
                num_blocks,
                width = 7,
                FUN = mean,
                na.rm = TRUE,
                fill = 0,
                align = 'right')];
        blocks_per_day[
            , num_blocks_2wma := rollapply(
                num_blocks,
                width = 14,
                FUN = mean,
                na.rm = TRUE,
                fill = 0,
                align = 'right')];

        ## Tidy up and make the plot. Columns 2:4 are the raw counts and the
        ## two moving averages; they become one row per (date, measure).
        blocks_per_day_long = blocks_per_day %>% gather(measure, num_blocks, 2:4);
        blocks_per_day_long = data.table(blocks_per_day_long);
        blocks_per_day_long[measure == 'num_blocks', measure := 'raw data'];
        blocks_per_day_long[measure == 'num_blocks_1wma', measure := '1-week MA'];
        blocks_per_day_long[measure == 'num_blocks_2wma', measure := '2-week MA'];
        ## Reversed level order, so that the raw data is matched with the last
        ## (darkest) palette colour
        blocks_per_day_long[
            , measure := ordered(measure, rev(c('raw data', '1-week MA', '2-week MA')))];

        block_day_plot = ggplot(blocks_per_day_long,
                                aes(x=log_date, y=num_blocks, color=measure)) +
        scale_x_date(date_breaks = "1 week", date_labels = "%d %b") +
        scale_y_continuous() +
        scale_colour_manual(values = b_palette) +
        expand_limits(y = 0) +
        labs(title = paste0('Partial blocks created per day - ', w),
             x = 'Date',
             y = 'Number of blocks') +
        theme_light(base_size = 14) +
        geom_line();

        ggsave(paste0(graph_dir, prefix, w, suffix),
           plot = block_day_plot, width = 30, height = 20, units = "cm", dpi = "screen");
    }
}

## Write one graph per wiki into graph_dir
make_pb_graphs(pbs = partial_blocks,
               graph_dir = graph_dir,
               prefix = pb_graph_prefix,
               suffix = pb_graph_suffix);