In the persistence API:

- **adds**: adds(non-stopwords) + adds(stopwords)

- **adds_surv_48h**: adds (stopwords included) that survived for 48 hours

- **adds_stopword_count**: adds(stopwords)

The **dels** and **reins** columns follow the same naming scheme (e.g. **dels_surv_48h**, **reins_stopword_count**).

In [1]:
# Import modules.

## For display.
from IPython.display import clear_output, display

## Data Processing
import pandas as pd

## APIs
from wikiwho_wrapper import WikiWho
from external.wikipedia import WikipediaDV, WikipediaAPI

## Managers
from metrics.conflict import ConflictManager
from metrics.token import TokensManager

In [2]:
# Parameters.
# Id of the Wikipedia article to analyse (alternative used so far: 5544751).
page_id = 23507036
# Language edition of Wikipedia the article belongs to: 'en' or 'de'.
language = 'en'

In [3]:
# Persistence API
# Build `total_actions`: the edit-persistence records of the article, extended
# with adds+dels+reins totals and the editors' names.
wikiwho_api = WikiWho(lng=language)
total_actions = wikiwho_api.dv.edit_persistence(page_id)

## Eliminate columns containing 'persistent'
cols_no_persi = total_actions.columns[~total_actions.columns.str.contains('persistent')]
total_actions = total_actions[cols_no_persi]

## Names of the aggregated columns: all actions, all actions surviving 48h and
## all actions on stopwords.
total_columns_names = ['total', 'total_surv_48h', 'total_stopword_count']

## Add adds + dels + reins together, one total per column family. The columns are
## addressed by name, so the result does not depend on their order in the frame.
action_kinds = ['adds', 'dels', 'reins']
column_suffixes = ['', '_surv_48h', '_stopword_count']
total_actions = total_actions.assign(**{
    total_name: sum(total_actions[kind + suffix] for kind in action_kinds)
    for total_name, suffix in zip(total_columns_names, column_suffixes)
})

## Only editors id? No, we still want to know their names. Use WikipediaAPI to achieve ##
## this. More details see:                                                             ##
## https://github.com/gesiscss/wikiwho_demo/blob/master/external/wikipedia.py          ##
wikipediadv_instance = WikipediaDV(WikipediaAPI(lng=language))

# Unique editor ids, so every name is requested only once.
unique_ids = total_actions['editor_id'].unique()

# Grab editors' names; 'userid' is renamed so both frames share the merge key.
grabbed_names = wikipediadv_instance.get_editors(unique_ids).rename(columns={'userid': 'editor_id'})

## Merge the names of editors to the total actions dataframe ##

# Left merge on the editor ids: every action row is kept even when no name was
# returned for its editor. (The default inner merge would silently drop such rows,
# so they could never be labelled 'Unregistered' below.)
total_actions = total_actions.merge(
    grabbed_names[['editor_id', 'name']], on='editor_id', how='left')

# Move the name next to the editor id and label editors without a name.
total_actions.insert(3, 'editor', total_actions['name'])
total_actions = total_actions.drop(columns=['name'])
total_actions['editor'] = total_actions['editor'].fillna("Unregistered")

## Convert the month column to datetime so it can be grouped by date later on ##
total_actions['year_month'] = pd.to_datetime(total_actions['year_month'])

total_actions

Unnamed: 0,year_month,page_id,editor_id,editor,adds,adds_surv_48h,adds_stopword_count,dels,dels_surv_48h,dels_stopword_count,reins,reins_surv_48h,reins_stopword_count,conflict,elegibles,conflicts,revisions,total,total_surv_48h,total_stopword_count
0,2009-07-01,23507036,381506,Rhallanger,6,6,3,0,0,0,0,0,0,0.0,0,0,1,6,6,3
1,2009-08-01,23507036,3117090,MatP,9,9,4,0,0,0,0,0,0,0.0,0,0,1,9,9,4
2,2009-08-01,23507036,1094964,Zarex,1062,1062,546,2,2,1,0,0,0,0.0,0,0,1,1064,1064,547
3,2009-10-01,23507036,1094964,Zarex,103,103,88,30,30,18,0,0,0,0.0,0,0,1,133,133,106
4,2010-10-01,23507036,1094964,Zarex,4,4,2,0,0,0,0,0,0,0.0,0,0,1,4,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,2019-03-01,23507036,29182876,Lewis C,50,30,18,10,10,0,0,0,0,0.0,0,0,1,60,40,18
436,2019-03-01,23507036,26021349,MB,2,2,1,38,38,18,0,0,0,0.0,0,0,1,40,40,19
437,2019-03-01,23507036,13127810,Spicemix,5,5,4,3,3,3,0,0,0,0.0,0,0,1,8,8,7
438,2019-04-01,23507036,36324024,Urigamer,13,12,6,1,1,0,0,0,0,0.0,0,0,2,14,13,6


In [4]:
# Standard wikiwho api.
# Fetch the article's content and revision ids, then derive the adds / dels /
# reins frames twice: once without stopwords and once including them.
all_content = wikiwho_api.dv.all_content(page_id)
revisions = wikiwho_api.dv.rev_ids_of_article(page_id)

## Conflict managers: identical inputs, differing only in `include_stopwords`.
conflict_manager = ConflictManager(
    all_content, revisions, lng=language, include_stopwords=False)
conflict_manager_inc_sw = ConflictManager(
    all_content, revisions, lng=language, include_stopwords=True)

# Run both calculations (same order as they were created) ...
for manager in (conflict_manager, conflict_manager_inc_sw):
    manager.calculate()

# ... and wipe whatever has been written to the cell output so far.
clear_output()

## Token managers, one per conflict manager.
all_actions = conflict_manager.all_actions
all_actions_inc_sw = conflict_manager_inc_sw.all_actions

token_manager = TokensManager(all_actions, maxwords=100000)
token_manager_inc_sw = TokensManager(all_actions_inc_sw, maxwords=100000)

# Each call yields three results, unpacked as (adds, dels, reins).
adds, dels, reins = token_manager.token_survive()
adds_sw, dels_sw, reins_sw = token_manager_inc_sw.token_survive()

In [5]:
# Compare

## Filter date
def filter_date(df, start, end='2020-04-15'):
    """Return the rows of `df` whose `rev_time` lies strictly between two bounds.

    The index of `df` is first turned into ordinary columns, so `rev_time` may
    be either a column or (a level of) the index. `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing `rev_time` as a column or as its index.
    start : value comparable with `rev_time`
        Exclusive lower bound (`rev_time > start`).
    end : value comparable with `rev_time`, default '2020-04-15'
        Exclusive upper bound (`rev_time < end`).

    Returns
    -------
    pandas.DataFrame
        The matching rows of the index-reset frame.
    """
    flat = df.reset_index()
    rev_time = flat['rev_time']

    # Both comparisons are strict: rows exactly on a bound are left out.
    in_window = (rev_time > start) & (rev_time < end)
    return flat[in_window]

## Actions statistic from persistence api
# Sum the actions per calendar month ('MS' = month-start frequency).
# `numeric_only=True` keeps the string column `editor` out of the aggregation;
# without it, recent pandas versions concatenate the names instead of skipping
# the column. Note that `page_id` and `editor_id` are summed as well, so their
# values in `group_total` carry no meaning.
group_total = (
    total_actions
    .groupby(pd.Grouper(key='year_month', freq='MS'))
    .sum(numeric_only=True)
    .reset_index()
)

## An example for EN, from persistence api.
# The month is turned into text so that it can be matched against a plain string.
group_total['year_month'] = group_total['year_month'].astype(str)
print('Use data from Persistence API:')
display(group_total[group_total['year_month'] == '2012-08-01'])

Use data from Persistence API:


Unnamed: 0,year_month,page_id,editor_id,adds,adds_surv_48h,adds_stopword_count,dels,dels_surv_48h,dels_stopword_count,reins,reins_surv_48h,reins_stopword_count,conflict,elegibles,conflicts,revisions,total,total_surv_48h,total_stopword_count
37,2012-08-01,94028144,173687,47,42,21,30,30,6,0,0,0,0.948147,2,2,6,77,72,27


In [6]:
## Standard WikiWho api.
# Window that is compared with the 2012-08 row of the persistence API.
# NOTE(review): filter_date compares strictly (start < rev_time < end). If
# rev_time carries a time of day, '2012-07-31' also lets revisions made during
# 31 July through - confirm that this is intended.
start_date = '2012-07-31'
end_date = '2012-09-01'

### adds action
# Same window applied to the adds without and with stopwords.
df_adds = filter_date(adds, start_date, end_date)
df_adds_sw = filter_date(adds_sw, start_date, end_date)

print('Use data from standard WikiWho API:')
print('Adds including stopwords:', len(df_adds_sw))
# Counting the True entries of the mask equals counting the selected rows.
print('Adds including stopwords survived:', (df_adds_sw['survive'] == 1).sum())
print('Adds excluding stopwords:', len(df_adds))
# Stopword adds = adds with stopwords minus adds without them.
print('Adds stopwords count:', len(df_adds_sw) - len(df_adds))

Use data from standard WikiWho API:
Adds including stopwords: 47
Adds including stopwords survived: 42
Adds excluding stopwords: 26
Adds stopwords count: 21


In [7]:
### dels action
# Deletions inside the comparison window, without and with stopwords.
df_dels = filter_date(dels, start_date, end_date)
df_dels_sw = filter_date(dels_sw, start_date, end_date)

print('Use data from standard WikiWho API:')
print('Dels including stopwords:', len(df_dels_sw))
# Counting the True entries of the mask equals counting the selected rows.
print('Dels including stopwords survived:', (df_dels_sw['survive'] == 1).sum())
print('Dels excluding stopwords:', len(df_dels))
# Stopword dels = dels with stopwords minus dels without them.
print('Dels stopwords count:', len(df_dels_sw) - len(df_dels))

Use data from standard WikiWho API:
Dels including stopwords: 30
Dels including stopwords survived: 30
Dels excluding stopwords: 24
Dels stopwords count: 6


In [8]:
### reins action
# Reinsertions inside the comparison window, without and with stopwords.
df_reins = filter_date(reins, start_date, end_date)
df_reins_sw = filter_date(reins_sw, start_date, end_date)

print('Use data from standard WikiWho API:')
print('Reins including stopwords:', len(df_reins_sw))
# Counting the True entries of the mask equals counting the selected rows.
print('Reins including stopwords survived:', (df_reins_sw['survive'] == 1).sum())
print('Reins excluding stopwords:', len(df_reins))
# Stopword reins = reins with stopwords minus reins without them.
print('Reins stopwords count:', len(df_reins_sw) - len(df_reins))

Use data from standard WikiWho API:
Reins including stopwords: 0
Reins including stopwords survived: 0
Reins excluding stopwords: 0
Reins stopwords count: 0
