In [1]:
import pandas as pd
import itertools
import collections

We import a cleaned version of manually annotated edit filters:

In [2]:
# Manually tagged edit filters (tab-separated file).
df = pd.read_csv(
    "20190106115600_filters-sorted-by-hits-manual-tags.csv",
    sep='\t',
)
# Original quarry export of all filters (comma-separated file).
df_origin = pd.read_csv(
    "quarry-32518-all-filters-sorted-num-hits.csv",
    sep=',',
)
# Quarry export with abuse filter hits per month (comma-separated file).
df_hits = pd.read_csv(
    "quarry-34014-how-many-abuse-filter-hits-have-been-there-per-month-en-wiki-run346197.csv",
    sep=',',
)

## General stats

In [90]:
# Number of filters, i.e. the number of rows in the tagged filter table
df.shape[0]

954

In [26]:
# Print how many filters match each state, one count per line, in this order:
#   1. active (enabled) filters
#   2. disabled filters
#   3. deleted filters
#   4. active public filters
#   5. filters that are both deleted and enabled
state_queries = [
    'af_enabled==1',
    'af_enabled==0',
    'af_deleted==1',
    'af_hidden==0 and af_enabled==1',
    'af_deleted==1 and af_enabled==1',
]

for state_query in state_queries:
    print (len(df.query(state_query)))

201
753
600
110
0


In [101]:
# First line: all hidden filters; second line: hidden filters that are also enabled
for hidden_query in ('af_hidden==1', 'af_hidden==1 and af_enabled==1'):
    matching_filters = df.query(hidden_query)
    print (matching_filters.shape[0])

593
91


In [105]:
# Count the filters whose af_global flag is 0, i.e. the filters that are NOT global.
# If this number equals the total number of filters, there are no global filters at all.
non_global_filters = df.query('af_global==0')
print (non_global_filters.shape[0])

954


In [5]:
# Filter counts by af_throttled flag: first the filters with flag 0, then those with flag 1
not_throttled = df.query('af_throttled==0')
throttled = df.query('af_throttled==1')

print (len(not_throttled))
print (len(throttled))

948
6


In [9]:
# Filter groups: count the filters in the "default" group, then print every filter outside of it
default_group = df.query('af_group=="default"')
other_groups = df.query('af_group!="default"')
print (len(default_group))
print (other_groups)

# --> so available groups are "default" and "feedback"
# TODO: question: what do they mean?

947
     Unnamed: 0  af_id  af_hidden  af_global  af_enabled  af_deleted  \
168         168    497          0          0           0           1   
173         173    494          0          0           0           1   
174         174    502          0          0           0           1   
187         187    495          0          0           0           1   
190         190    496          0          0           0           1   
227         227    475          0          0           0           1   
349         349    461          0          0           0           1   

     af_throttled  af_group    af_timestamp af_actions  af_hit_count  \
168             0  feedback  20130108151106   disallow          3660   
173             0  feedback  20130108151035   disallow          3325   
174             0  feedback  20130424011002   disallow          3280   
187             0  feedback  20130108151045   disallow          2697   
190             0  feedback  20130108151054   disallow     

In [3]:
# Filter hits per month (all filters).
# The quarry data was pulled on 05.03.2019, which is why the hit count for March 2019 is so small.
df_hits

Unnamed: 0,LogMonth,Freq
0,201903,34309
1,201902,236606
2,201901,252668
3,201812,226287
4,201811,253233
5,201810,256438
6,201809,230354
7,201808,216045
8,201807,205477
9,201806,209374


## Helper functions

In [28]:
def flatten(x):
    """Flatten one level of nesting.

    Parameters
    ----------
    x : iterable of iterables
        E.g. a list of lists; every element must itself be iterable.

    Returns
    -------
    list
        The items of all elements of ``x``, concatenated in order.
        Only the outermost level is flattened; deeper nesting is kept as is.
    """
    return list(itertools.chain.from_iterable(x))

## Edit filter actions

In [19]:
# af_actions holds a comma-separated string per filter; missing values are replaced by ''
# (and therefore show up in the tally under the '' key).
actions = df['af_actions'].fillna('')
actions_list = [action_string.split(",") for action_string in actions]
all_actions = flatten(actions_list)

# Tally each individual action over all filters, most frequent first
action_counts = collections.Counter(all_actions)
print(action_counts.most_common())

[('', 413), ('disallow', 406), ('warn', 122), ('tag', 70), ('throttle', 52), ('blockautopromote', 4)]


In [23]:
# What are the actions of active hidden filters
# (each filter's complete action combination is counted as one key; missing values count as '')
active_hidden = df.query('af_hidden==1 and af_enabled==1')
hidden_action_combos = active_hidden['af_actions'].fillna('').tolist()
print(collections.Counter(hidden_action_combos).most_common())

[('disallow', 51), ('', 19), ('throttle,disallow', 7), ('throttle', 4), ('tag', 3), ('warn,tag', 2), ('throttle,warn', 2), ('warn', 1), ('disallow,tag', 1), ('warn,disallow', 1)]


In [3]:
# What are the actions of active public filters
# (each filter's complete action combination is counted as one key; missing values count as '')
active_public = df.query('af_hidden==0 and af_enabled==1')
public_action_combos = active_public['af_actions'].fillna('').tolist()
print(collections.Counter(public_action_combos).most_common())

[('tag', 25), ('warn,tag', 25), ('disallow', 22), ('', 20), ('warn', 12), ('throttle,tag', 2), ('warn,disallow', 2), ('throttle,warn,tag', 1), ('throttle,disallow', 1)]


In [4]:
# TODO: all ids of active public filters set to disallow (for now the complete rows are displayed)
# Substring match on af_actions, so combined actions such as 'warn,disallow' are included.
disallow_mask = active_public['af_actions'].fillna('').str.contains('disallow')
active_public[disallow_mask]

Unnamed: 0.1,Unnamed: 0,af_id,af_hidden,af_global,af_enabled,af_deleted,af_throttled,af_group,af_timestamp,af_actions,af_hit_count,af_public_comments,manual_tags,notes
3,3,384,0,0,1,0,0,default,20181018171832,disallow,1159239,Addition of bad words or other vandalism,vandalism,
12,12,225,0,0,1,0,0,default,20180807154519,disallow,482872,Vandalism in all caps,vandalism,
19,19,46,0,0,1,0,0,default,20181023193500,disallow,356945,"""Poop"" vandalism",vandalism,
22,22,260,0,0,1,0,0,default,20181130204255,disallow,286852,Common vandal phrases,vandalism,
37,37,320,0,0,1,0,0,default,20181023193636,disallow,152994,"""Your mom"" Vandalism",vandalism,
41,41,12,0,0,1,0,0,default,20170606215509,disallow,122756,Replacing a page with obscenities,vandalism,
47,47,680,0,0,1,0,0,default,20181023195329,disallow,95242,Adding emoji unicode characters,good_faith,
54,54,365,0,0,1,0,0,default,20181023193820,disallow,85470,Unusual changes to featured or good content,vandalism,Unusual == unusually big (edit_delta > 15000 |...
67,67,803,0,0,1,0,0,default,20181023223916,disallow,46756,Prevent new users from editing other's user pages,"vandalism, good_faith",
99,99,782,0,0,1,0,0,default,20181023223745,disallow,16702,Content Translation Edits,misc,why is this not allowed? Seems to be translati...


## Explore Manual Tags

In [39]:
manual_tags = df['manual_tags']
# Split every filter's tag string into clean, individual tags:
#   * untagged filters (NaN) are treated as having no tags instead of raising on .split
#   * surrounding whitespace is stripped, so that e.g. 'maintenance? ' and 'maintenance?'
#     are counted as one and the same tag
manual_tags_list = [
    [tag.strip() for tag in tag_string.split(",") if tag.strip()]
    for tag_string in manual_tags.fillna('')
]
all_tags = flatten(manual_tags_list)

# Frequency of each manual tag over all filters, most frequent first
print(collections.Counter(all_tags).most_common())

[('vandalism', 263), ('vandalism?', 162), ('unknown', 71), ('good_faith?', 63), ('misc', 59), ('sockpuppetry', 59), ('good_faith', 48), ('test', 43), ('spam?', 41), ('long_term_abuse', 35), ('sockpuppetry?', 35), ('harassment?', 31), ('harassment', 24), ('abuse?', 21), ('biased_pov', 17), ('spam', 17), ('biased_pov?', 15), ('unclear', 14), ('bad_style', 13), ('bad_style?', 12), ('bug?', 10), ('wiki_policy?', 9), ('long_term_abuse?', 9), ('misc?', 8), ('seo', 8), ('politically_motivated?', 8), ('maintenance', 7), ('trolling?', 7), ('maintenance?', 6), ('personal_attacks', 6), ('bug', 5), ('vandalbot', 5), ('page_move_vandalism', 5), ('silly_vandalism', 5), ('lazyness', 4), ('seo?', 4), ('test?', 4), ('hoaxing?', 4), ('personal_attacks?', 4), ('edit_warring?', 3), ('copyright', 3), ('image_vandalism', 3), ('talk_page_vandalism', 3), ('page_move_vandalism?', 3), ('conflict_of_interest', 3), ('stockbrocker_vandalism', 3), ('copyright?', 2), ('vandalbot?', 2), ('religious_vandalism?', 2), (

('vandalism', 263),
('vandalism?', 162),
  ('spam?', 41),
  ('spam', 17),
  ('vandalbot', 5),
  ('vandalbot?', 2),
  ('page_move_vandalism', 5),
  ('page_move_vandalism?', 3),
  ('silly_vandalism', 5),
  ('silly_vandalism?', 2),
  ('trolling?', 7),
  ('hoaxing?', 4),
  ('hoaxing', 2),
  ('copyright', 3),
  ('copyright?', 2),
  ('image_vandalism', 3),
  ('talk_page_vandalism', 3),
  ('template_vandalism?', 1),
  ('template_vandalism', 1),
  ('template_spam', 2),
  ('link_vandalism?', 1),
  ('abuse_of_tags_vandalism?', 1),
  ('avoidant_vandalism', 1),
  ('avoidant_vandalism?', 1),
  ('username_vandalism?', 1),

('prank', 1)

('phishing?', 1),
('malware?', 1),
('malware', 1),

('guideline_vio?', 1),

('religious_vandalism?', 3),
('politically_motivated?', 8),
('politically_motivated', 2),

('sockpuppetry', 59),
('sockpuppetry?', 35),
('long_term_abuse', 35),
('long_term_abuse?', 9),
('abuse', 1),
('abuse?', 21),
('harassment?', 31),
('harassment', 24),
('doxxing?', 2),
('personal_attacks', 6),
('personal_attacks?', 4),
('impersonation', 1),
('not_polite', 1),

('biased_pov', 17),
('biased_pov?', 15),

('conflict_of_interest', 3),
('stockbrocker_vandalism', 3),
('self_promotion?', 2),
('conflict_of_interest?', 1),
('self_promotion', 1),

('seo', 8),
('seo?', 4),

('bad_style', 13),
('bad_style?', 12),
('edit_warring?', 3),

('good_faith?', 63),
('good_faith', 48),

('lazyness', 4),

('maintenance', 7),
('maintenance?', 5),
('maintenance? ', 1),

('bug', 5),
('bug?', 10),
('wiki_policy?', 9),

('test', 43),
('test?', 4),

('unknown', 71),
('misc', 59),
('misc?', 8),
('unclear', 14),

## Combine manual tags with filter actions

In [18]:
# What are the actions and tags of active public filters (rows ordered by af_actions)
active_public = (
    df.query('af_hidden==0 and af_enabled==1')
      .sort_values(by=['af_actions'])
)

# Blank out missing values and lift the row/column display limits so the whole table is printed
actions_and_tags = active_public[['af_id', 'af_actions', 'manual_tags']].fillna('')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(actions_and_tags)


     af_id         af_actions                            manual_tags
653    897           disallow                        spam, vandalbot
67     803           disallow                  vandalism, good_faith
41      12           disallow                              vandalism
37     320           disallow                              vandalism
499    694           disallow                             good_faith
99     782           disallow                                   misc
22     260           disallow                              vandalism
54     365           disallow                              vandalism
130    784           disallow                              vandalism
19      46           disallow                              vandalism
171    860           disallow                              vandalism
110    554           disallow                seo?, vandalism?, spam?
47     680           disallow                             good_faith
470    843           disallow     

In [5]:
# All active public filters set to disallow and labeled 'good_faith'.
# Both conditions are substring matches, so e.g. 'warn,disallow' and 'good_faith?' qualify as well.
is_disallow = active_public['af_actions'].fillna('').str.contains('disallow')
is_good_faith = active_public['manual_tags'].fillna('').str.contains('good_faith')
active_public[is_disallow & is_good_faith]

Unnamed: 0.1,Unnamed: 0,af_id,af_hidden,af_global,af_enabled,af_deleted,af_throttled,af_group,af_timestamp,af_actions,af_hit_count,af_public_comments,manual_tags,notes
47,47,680,0,0,1,0,0,default,20181023195329,disallow,95242,Adding emoji unicode characters,good_faith,
67,67,803,0,0,1,0,0,default,20181023223916,disallow,46756,Prevent new users from editing other's user pages,"vandalism, good_faith",
271,271,642,0,0,1,0,0,default,20181023194832,disallow,712,OTRS template added by non-OTRS member (global),good_faith?,from comments: “This filter is easy to subvert...
499,499,694,0,0,1,0,0,default,20181023223653,disallow,74,Moves to or from the Module namespace,good_faith,“These types of moves don't do what users expe...


Upon second inspection (looking at https://en.wikipedia.org/wiki/Special:AbuseLog), edits targeted by filter with id 680 (adding unicode emojis) seem to be vandalism in their vast majority.

Filter 803 seems to be labeled accurately on the other hand. There are a lot of personal insults in these edits, but there are also some that look as if the user themselves wanted to edit their own page but forgot to log in (so they're editing from an IP).

The other two filters seem to be labeled somewhat reasonably and they are set to "disallow" since the intended/expected effect of the action differs from the real one and since they're disruptive.

## Hit count

In [69]:
 # Summary statistics (count, mean, std, min, quartiles, max) of the af_hit_count column
 df['af_hit_count'].describe()

count    9.540000e+02
mean     2.401892e+04
std      1.205649e+05
min      0.000000e+00
25%      7.000000e+00
50%      9.050000e+01
75%      1.185250e+03
max      1.611956e+06
Name: af_hit_count, dtype: float64

## Edit filter editors

In [89]:
# Load the raw quarry export and inspect the user names stored in af_user_text
raw_df = pd.read_csv("quarry-32518-all-filters-sorted-num-hits.csv", sep=',')
editors = raw_df['af_user_text']

# Distinct editor names, followed by how many there are
unique_editors = editors.unique()
print (unique_editors)
print (len(unique_editors))

# Number of filters per editor name, most frequent first
print (editors.value_counts())

['Zzuuzz' 'Dragons flight' 'This, that and the other' 'MusikAnimal' 'Crow'
 'Samtar' 'Xaosflux' 'King of Hearts' 'Amorymeltzer' 'Samwalton9'
 'Biblioworm' 'NawlinWiki' 'MER-C' 'Rich Farmbrough' 'Galobtter'
 'Cenarium' 'Ruslik0' 'Legoktm' 'Od Mishehu' 'BU Rob13' 'Prodego'
 'Timotheus Canens' 'Oshwah' 'The Earwig' 'The Anome' 'Kww' 'Beetstra'
 'Reaper Eternal' 'BethNaught' 'Mlitn' 'Cyp' "There'sNoTime" 'Kuru'
 'Shirik' 'Xeno' 'Kaldari' 'Kingpin13' 'DoRD' 'Elockid' 'Ritchie333'
 'Maxim' 'Ryan Kaldari (WMF)' 'Cyberpower678' 'GB fan' 'Jackmcbarn' 'L235'
 'Smalljim' 'Materialscientist' 'Someguy1221' 'Billinghurst' 'Tedder'
 'Gogo Dodo' 'Triplestop' 'Darkwind' 'Amalthea' 'Slakr' 'Scottywong'
 'Mr.Z-man' 'SQL' 'Avraham' 'NuclearWarfare' 'OverlordQ' 'Nihiltres'
 'Hersfold' 'Mifter' 'Chris G' 'EdoDodo' 'Nakon' 'Werdna' 'Wknight94'
 'DMacks' 'East718' 'Georgewilliamherbert' 'Mindmatrix' 'Rschen7754'
 'Lustiger seth' "Chris G's Test Account"]
77
MusikAnimal                 249
King of Hearts      

## Vandalism

We may be interested in how the notion of vandalism changed over the years. For this an inquiry into which filters have "vandalism" in their public description (and were tagged as "vandalism") and what they do may be interesting.

Archive for now, the question is not very cs-y

## Potential harassment

In [9]:
# Filters whose manual tags mention 'harassment' (substring match, so 'harassment?' is included)
harassment_mask = df['manual_tags'].fillna('').str.contains('harassment')
df_harassment_tagged = df[harassment_mask]

harassment_columns = ['af_id', 'af_hidden', 'af_public_comments', 'manual_tags']
print(df_harassment_tagged[harassment_columns])

     af_id  af_hidden                                 af_public_comments  \
14     189          0                             BLP vandalism or libel   
16     380          1                               Multiple obscenities   
23     686          0    IP adding possibly unreferenced material to BLP   
42     247          1                          Adding emails in articles   
45      11          0                                You/He/She/It sucks   
53     339          0  Claims of homosexuality, bisexuality, or trans...   
72       9          0       Personal attacks by unregistered or new user   
74     466          1                     Userspace & talk page spamming   
93     460          0                               Feedback: Foul words   
136    478          1                                    Talk page abuse   
148     97          0                       Personal attacks by new user   
150    294          1                                   Personal attacks   
154    463  

Another idea would be to classify filters according to the namespaces they cover. A filter targeting the talk/user namespaces may be indicative of dealing with personal attacks or harassment.

In [29]:
# Filters whose pattern mentions 'namespace' at all
namespaces = df_origin[df_origin['af_pattern'].fillna('').str.contains('namespace')]

# Of those, drop every filter whose pattern compares the namespace with 0
# in any of the three spellings checked below.
namespace_patterns = namespaces['af_pattern'].fillna('')
compares_with_zero = (
    namespace_patterns.str.contains('namespace == 0')
    | namespace_patterns.str.contains('namespace = 0')
    | namespace_patterns.str.contains('namespace==0')
)
non_article_spaces = namespaces[~compares_with_zero]

# Lift the display limits so that every remaining filter is printed
id_and_comment = non_article_spaces[['af_id', 'af_public_comments']]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (id_and_comment)

     af_id                                 af_public_comments
37     320                               "Your mom" Vandalism
40     631                          Extraneous toolbar markup
55      65                               Excessive whitespace
61      33        Talk page blanking by unregistered/new user
67     803  Prevent new users from editing other's user pages
70     368  Making large changes when marking the edit as ...
81     167       Botched submissions to Articles for creation
83     613                                 Signing in article
85     627  Promotional text added by user to draft in own...
94      59   New user removing templates on image description
95     174                     New user removing XfD template
101    602         Arbitration discretionary sanctions alerts
107    420           Large removal of talk page content by IP
109    733  New user creating a page in someone else's use...
134    878                     New user removing COI template
157    8

The following filters seem to be potentially targeting harassment (all filters that obviously do not have anything to do with harassment were kicked out manually):

    af_id                                 af_public_comments
    67     803  Prevent new users from editing other's user pages
    101    602         Arbitration discretionary sanctions alerts
    109    733  New user creating a page in someone else's use...
    274     99                 Edits to an other user's userspace
    285    123                New users moving other users' pages
    329    212  New user placing comments without a header on ...
    424    168          Non-admins responding to unblock requests
    619      6           Users editing editnotices of other users
    643     15                          Discussion page vandalism
    730    207              Non-admins reviewing unblock requests
    863     67                    Sockpuppetry at AfD discussions
    866    329                                     SPI disruption
    921    427                  Possible Emergency Reponse Needed

## Code snippets that may come in handy

In [36]:
# Make a data frame out of the tag lists of the first ten filters.
# n=1 splits at the first ", " only, so there are at most two tag columns.
first_ten_split = manual_tags.head(10).str.split(", ", n = 1, expand = True)
ten_tags = first_ten_split.apply(pd.Series)

# Rename the positional column labels to tag_<label>
tag_column_names = {label: 'tag_' + str(label) for label in ten_tags.columns}
ten_tags = ten_tags.rename(columns = tag_column_names)
ten_tags

0               good_faith
1                vandalism
2                vandalism
3                vandalism
4               good_faith
5               good_faith
6     good_faith, lazyness
7    vandalism, good_faith
8               good_faith
9               good_faith
Name: manual_tags, dtype: object


Unnamed: 0,tag_0,tag_1
0,good_faith,
1,vandalism,
2,vandalism,
3,vandalism,
4,good_faith,
5,good_faith,
6,good_faith,lazyness
7,vandalism,good_faith
8,good_faith,
9,good_faith,


In [88]:
# Group the raw filter table by af_user_text and count, per group, the non-missing values of every other column
raw_df.groupby('af_user_text').count()

MusikAnimal                 249
King of Hearts               91
Zzuuzz                       81
Rich Farmbrough              61
Ruslik0                      59
Prodego                      45
Samwalton9                   34
Cenarium                     32
NawlinWiki                   28
Xaosflux                     27
Reaper Eternal               25
Shirik                       23
Beetstra                     16
Dragons flight               15
Crow                         13
Legoktm                      11
Samtar                        9
The Anome                     9
Cyp                           7
BethNaught                    6
Ryan Kaldari (WMF)            5
BU Rob13                      5
Oshwah                        5
Kww                           5
Od Mishehu                    5
There'sNoTime                 5
Elockid                       4
Kuru                          4
Materialscientist             4
Mlitn                         4
                           ... 
This, th