In [1]:
import pandas as pd
from urllib.parse import unquote
import wikitoolkit as wt
import string
import pickle

# User-Agent string handed to every wikitoolkit session in this notebook
# (the WTSession(..., user_agent=my_agent) calls below).
my_agent = 'wikireddit <p.gildersleve@exeter.ac.uk>'


In [2]:
# Link tables. `in_title` records which of the two post-level tables a row came
# from, so the origin survives the concatenation further down.
bodylinks = pd.read_hdf('data/bodylinks.h5', key='df')
titlelinks = pd.read_hdf('data/titlelinks.h5', key='df')
bodylinks['in_title'] = False
titlelinks['in_title'] = True
commentlinks = pd.read_hdf('data/commentlinks.h5', key='df')

# Posts come from one file; comments are stored as four numbered parts.
posts = pd.read_hdf('data/posts.h5', key='df')
comments = pd.concat(
    [pd.read_hdf(f'data/comments_{part}.h5') for part in range(1, 5)]
).reset_index(drop=True)

# Attach the links to their post/comment via `id`; the left merge leaves rows
# without any link with a NaN final_url, which are dropped straight away.
postlinks = posts.merge(
    pd.concat([bodylinks, titlelinks], ignore_index=True),
    on='id',
    how='left',
).dropna(subset=['final_url'])
commentlinks = comments.merge(
    commentlinks,
    on='id',
    how='left',
).dropna(subset=['final_url'])

In [3]:
# TODO: redo this collection


def _link_dates(links, modified_col):
    """Add day-level date columns to ``links`` and list its distinct link/date rows.

    ``links`` is modified in place: the 1970-01-01 timestamp in ``modified_col``
    is replaced by NaT, and ``created_date`` / ``updated_date`` are added as the
    day-floored ``created_at`` / ``modified_col`` values.

    Returns a pair ``(valid, distinct)``: the rows whose ``final_valid`` is true,
    and the de-duplicated ``final_url`` / ``created_date`` / ``updated_date``
    combinations among them.
    """
    epoch = pd.Timestamp('1970-01-01')
    links[modified_col] = links[modified_col].replace(epoch, pd.NaT)
    links['created_date'] = links['created_at'].dt.floor('D')
    links['updated_date'] = links[modified_col].dt.floor('D')
    valid = links[links['final_valid']]
    distinct = valid[['final_url', 'created_date', 'updated_date']].drop_duplicates().copy()
    return valid, distinct


postlinks, postlinks_unique = _link_dates(postlinks, 'updated_at')
print(len(postlinks_unique))

commentlinks, commentlinks_unique = _link_dates(commentlinks, 'last_modified_at')
print(len(commentlinks_unique))

# reset_index() (without drop=True) keeps the pre-reset row labels as an
# 'index' column of all_links.
all_links = (
    pd.concat([postlinks_unique, commentlinks_unique], ignore_index=True)
    .drop_duplicates()
    .reset_index()
)

586676
9962439


In [15]:
def url_parse(url):
    """Parse the query string of ``url`` into a ``{name: value}`` dict.

    Parameters
    ----------
    url : str
        The URL to inspect. Values are returned as they appear in the URL; no
        percent-decoding is done here.

    Returns
    -------
    dict or None
        ``None`` when the URL contains no ``?``. Otherwise one entry per
        ``name=value`` pair (the last occurrence wins for repeated names);
        pieces without an ``=`` are ignored, so the dict may be empty.
    """
    if '?' not in url:
        return None
    # Split on the FIRST '?' only: a later '?' belongs to a value
    # (e.g. a title ending in a question mark), not to a new query string.
    query = url.split('?', 1)[1]
    # Anything after '#' is the fragment, not part of the last value.
    query = query.split('#', 1)[0]
    query_dict = {}
    for pair in query.split('&'):
        # partition keeps everything after the first '=' as the value, so
        # values that themselves contain '=' are no longer dropped.
        name, separator, value = pair.partition('=')
        if separator:
            query_dict[name] = value
    return query_dict

def _clean_numeric_id(raw_value):
    """Return ``raw_value`` decoded and trimmed if it is a plain decimal id, else ``None``.

    The value is percent-decoded and stripped of surrounding punctuation and
    whitespace. Navigation keywords such as ``next`` / ``prev`` / ``cur`` (valid
    for ``diff=`` and ``oldid=``) are not ids and yield ``None``.
    """
    cleaned = unquote(raw_value).strip(string.punctuation + string.whitespace)
    # isdecimal() alone also accepts non-ASCII digit characters; require ASCII
    # so that only ordinary ids are sent on.
    if cleaned.isascii() and cleaned.isdecimal():
        return cleaned
    return None


async def resolve_ids(links_df):
    """Fill missing ``raw_title`` values using the URL's query parameters.

    For every row whose ``raw_title`` is missing, the query string of
    ``final_url`` is inspected (via ``url_parse``), in this order of preference:

    * ``title=...``  -> used directly (``+`` replaced by a space);
    * ``curid=...``  -> page id, looked up through ``wt.basic_info``;
    * ``oldid=...`` / ``diff=...`` -> revision id, looked up the same way
      (the first of the two that is numeric is used).

    Ids that are not numeric, or that the lookup does not return, leave the
    title missing instead of raising.

    Parameters
    ----------
    links_df : pandas.DataFrame
        Must contain ``lang``, ``final_url`` and ``raw_title`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``links_df`` object, with ``raw_title`` filled where possible.
    """
    lookup_params = {'prop': 'revisions', 'rvprop': 'ids'}
    missing = links_df.loc[links_df['raw_title'].isna(), ['lang', 'final_url']]

    # Rows without a language cannot be matched to a wiki; skip them rather
    # than opening a session for a NaN "language".
    for l in missing['lang'].dropna().unique():
        urls = missing.loc[missing['lang'] == l, 'final_url'].unique()

        titlemaps = {}    # url -> title taken from the URL itself
        revmaps = {}      # url -> revision id (str)
        pageidmaps = {}   # url -> page id (str)
        for url in urls:
            query = url_parse(url)
            if query is None:
                continue
            if 'title' in query:
                titlemaps[url] = query['title'].replace('+', ' ')
                continue
            if 'curid' in query:
                page_id = _clean_numeric_id(query['curid'])
                if page_id is not None:
                    pageidmaps[url] = page_id
                continue
            for key in ('oldid', 'diff'):
                rev_id = _clean_numeric_id(query.get(key, ''))
                if rev_id is not None:
                    revmaps[url] = rev_id
                    break

        revinfo = []
        pageidinfo = []
        if revmaps or pageidmaps:
            wtsession = wt.WTSession(f'{l}.wikipedia', user_agent=my_agent)
            try:
                pagemaps = wt.PageMaps()
                if revmaps:
                    revinfo = await wt.basic_info(
                        wtsession, revids=list(dict.fromkeys(revmaps.values())),
                        pagemaps=pagemaps, params=lookup_params)
                if pageidmaps:
                    pageidinfo = await wt.basic_info(
                        wtsession, pageids=list(dict.fromkeys(pageidmaps.values())),
                        pagemaps=pagemaps, params=lookup_params)
            finally:
                # Close the session even if a lookup fails.
                await wtsession.close()

        # Each result is iterated as batches of page records, as in the
        # subscripting pattern below. Records lacking the expected keys (for
        # example ids the wiki does not know) are skipped.
        revtitledict = {
            rev['revid']: page['title']
            for batch in revinfo
            for page in batch
            if 'title' in page
            for rev in page.get('revisions', [])
        }
        pageidtitledict = {
            page['pageid']: page['title']
            for batch in pageidinfo
            for page in batch
            if 'pageid' in page and 'title' in page
        }

        titledict = {
            **titlemaps,
            **{url: revtitledict.get(int(rev_id)) for url, rev_id in revmaps.items()},
            **{url: pageidtitledict.get(int(page_id)) for url, page_id in pageidmaps.items()},
        }
        urltitledict = {url: titledict.get(url, None) for url in urls}

        unresolved = (links_df['lang'] == l) & links_df['raw_title'].isna()
        links_df.loc[unresolved, 'raw_title'] = links_df.loc[unresolved, 'final_url'].map(urltitledict)

    return links_df

In [40]:
links_df = all_links.copy()

langvars = ['zh-hans', 'zh-tw', 'zh-hk', 'zh-cn', 'zh-hant', 'zh', 'sr-ec', 'sr-el', 'zh-sg', 'zh-my', 'zh-mo', 'sr']
links_df.loc[:, 'lang_subdomain'] = links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org')[0]
links_df.loc[:, 'lang'] = links_df['lang_subdomain'].str.split('.').str[0]
links_df.loc[:, 'mobile'] = links_df['lang_subdomain'].str.split('.').str[1] == 'm'
links_df['final_url'] = links_df['final_url'].apply(unquote)
links_df.loc[:, 'raw_title'] = links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/+wiki/+(.+)'
                                )[1].str.split('?').str[0]
links_df.loc[:, 'raw_title'] = links_df['raw_title'].fillna(
    links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/api/rest_v1/page/mobile-html/(.+)')[1].str.split('?').str[0])

# raise
for lv in langvars:
    links_df.loc[:, 'raw_title'] = links_df['raw_title'].fillna(
        links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/+%s/+([^/]+)' %lv)[1].str.split('?').str[0])

links_df = await resolve_ids(links_df)
links_df['raw_title'] = links_df['raw_title'].str.replace('_', ' ')

RuntimeError: No active exception to reraise

In [58]:
# Links for which no title could be determined.
lna = links_df.loc[links_df['raw_title'].isna()]

In [59]:
# Splitting 'https://host/segment/...' on '/' gives ['https:', '', 'host', 'segment', ...],
# so element 3 is the first path segment. vc lists those segments by frequency.
vc = lna['final_url'].str.split('/').str.get(3).value_counts().index

In [60]:
# How often each first path segment occurs among the unresolved links.
lna['final_url'].str.split('/').str.get(3).value_counts()

final_url
                   4135
w                   156
api                  47
wiki                 19
static                5
robots.txt            3
favicon.ico           3
wiki\                 2
wikipedia             2
?search=              1
?url=https:           1
?uselang=en           1
?uselang=ja           1
portal                1
zh-cn                 1
?useskin=vector       1
?wiki                 1
503.html              1
Name: count, dtype: int64

In [64]:
# Fifth entry of vc (positions follow the value_counts() ordering).
vc[4]

'static'

In [None]:
# TODO: check how the /w/index/ URLs are handled!

In [68]:
# Print every unresolved URL whose first path segment is not one of the five
# most frequent ones (vc[5:] is everything after the top five).
for i in lna.loc[lna['final_url'].str.split('/').str.get(3).isin(vc[5:]), 'final_url'].value_counts().index:
    print(i)

https://en.wikipedia.org/robots.txt
https://en.wikipedia.org/favicon.ico
https://en.wikipedia.org/wiki\/DemoSat","video_link":"https:\/\/www.youtube.com\/watch?v=0a_00nJ_Y88","youtube_id":"0a_00nJ_Y88","flickr_images"
https://en.wikipedia.org/?search=
https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg
https://en.wikipedia.org/?url=https:/%/wiki/Robert_C._Martin
https://ja.wikipedia.org/?uselang=en
https://ja.wikipedia.org/?uselang=ja
https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2@2x.png
https://zh.wikipedia.org/zh-cn/
https://en.wikipedia.org/?useskin=vector
https://upload.wikimedia.org/wikipedia/commons/2/23/US_one_dollar_bill,_obverse,_series_2009.jpg
https://en.wikipedia.org/?wiki/Persecution_of_Christians_in_the_Soviet_Union
https://en.wikipedia.org/503.html


In [7]:
# Resolved titles with underscores shown as spaces.
links_df['raw_title'].dropna().str.replace('_', ' ')

0                                                    Unit 731
1                                                 Shanghaiing
2           List of public corporations by market capitali...
3                                                  Space suit
4                                                 Captain Ron
                                  ...                        
10538723                                        Surplus value
10538724                                     Romani Holocaust
10538725                   Bombing of Bangkok in World War II
10538726                                     Historical Jesus
10538727                   Shipborne rolling vertical landing
Name: raw_title, Length: 10512371, dtype: object

In [73]:
# One row per (link, date): the created and updated dates are stacked into a
# single `date` column, and `is_updated_date` records which of the two a row
# came from. Links without an updated date contribute only their created date.
articles_long = (
    links_df
    .melt(
        id_vars=['lang', 'raw_title'],
        value_vars=['created_date', 'updated_date'],
        var_name='is_updated_date',
        value_name='date',
    )
    .dropna(subset=['date'])
    .reset_index(drop=True)
    .assign(is_updated_date=lambda frame: frame['is_updated_date'] == 'updated_date')
)

# Distinct (language, title, day) combinations.
article_dates_unique = (
    articles_long[['lang', 'raw_title', 'date']]
    .drop_duplicates()
    .reset_index(drop=True)
)
article_dates_unique

Unnamed: 0,lang,raw_title,date
0,en,Unit 731,2020-01-29
1,en,Shanghaiing,2020-09-06
2,en,List of public corporations by market capitali...,2020-04-25
3,en,Space suit,2020-07-18
4,en,Captain Ron,2020-04-27
...,...,...,...
13164572,nl,Zaak-Arcopar,2023-12-23
13164573,de,Audi A4 B9,2023-12-14
13164574,de,Audi Q4 e-tron,2023-12-14
13164575,en,Bombing of Bangkok in World War II,2023-12-23


In [74]:
# Checkpoint before the (slow) redirect resolution. `key` is passed by keyword:
# positional arguments other than the path are deprecated for to_hdf.
article_dates_unique.to_hdf('data/article_dates_unique.h5', key='df', mode='w')

  article_dates_unique.to_hdf('data/article_dates_unique.h5', 'df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['lang', 'raw_title'], dtype='object')]

  article_dates_unique.to_hdf('data/article_dates_unique.h5', 'df', mode='w')


In [75]:
pagemapsdict = {}
for lang in article_dates_unique['lang'].unique():
    print(lang)
    lang_articles = article_dates_unique[article_dates_unique['lang'] == lang]
    wtsession = wt.WTSession(f'{lang}.wikipedia', user_agent=my_agent)
    pagemapsdict[lang] = wt.PageMaps()
    groupsize = 1000
    ua = list(lang_articles['raw_title'].unique())
    groups = [ua[i:i+groupsize] for i in range(0, len(ua), groupsize)]
    for g in groups:
        ingroupsize = groupsize
        done = 0
        while done < len(g):
            try:
                await pagemapsdict[lang].fix_redirects(wtsession, titles=g[done:done+ingroupsize])
                done += ingroupsize
            except ValueError:
                ingroupsize = ingroupsize//2
                print(f'Error, reducing group size to {ingroupsize}')

    await wtsession.close()

    article_dates_unique.loc[article_dates_unique['lang'] == lang, 'redirected_title'] = article_dates_unique['raw_title'].map(pagemapsdict[lang].norm_map).fillna(article_dates_unique['raw_title'])   
    article_dates_unique.loc[article_dates_unique['lang'] == lang, 'redirected_title'] = (
        article_dates_unique.loc[article_dates_unique['lang'] == lang, 'redirected_title']
        .map(pagemapsdict[lang].titles_redirect_map)
        .fillna(article_dates_unique.loc[article_dates_unique['lang'] == lang, 'redirected_title'])
    )
    article_dates_unique.loc[article_dates_unique['lang'] == lang, 'pageid'] = article_dates_unique['redirected_title'].map(pagemapsdict[lang].id_map)
    
with open('data/langpagemaps.pkl', 'wb') as f:
    pickle.dump(pagemapsdict, f)
article_dates_unique = article_dates_unique.drop_duplicates(subset=['lang', 'date', 'redirected_title']).reset_index(drop=True)
article_dates_unique.to_hdf('data/article_dates_unique.h5', 'df', mode='w')

en




ru
pt
it
ro
nl
es
fr
no
de
simple
eu
ja
zh
tr
hi
sl
ar
cs
csb
mk
szl
hr
uk
sr
sk
bg
cu
dsb
bs
hsb
sh
pl
rue
be
ml
el
fa
sw
ko
id
www
hu
fi
vi
eo
th
Error, reducing group size to 500
Error, reducing group size to 250
Error, reducing group size to 125
sv
ta
lt
sco
he
ga
zh-classical
hy
da
ur
ast
ca
als
et
nan
tl
arz
ka
az
sq
is
la
ms
got
fy
stq
lb
sa
rmy
sd
bar
af
nds
vo
ang
yi
pcd
cy
kw
lv
ltg
pap
so
vec
qu
nap
si
my
nds-nl
zh-yue
nostalgia
ie
tk
bn
nah
lmo
nn
ku
mr
ba
kk
zh-min-nan
gl
avk
vep
ary
ia
hif
ps
skr
pnb
ks
bpy
pa
or
te
kn
gu
jam
ceb
mn
shi
kab
azb
uz
ky
mni
ne
wuu
nv
pam
cbk-zam
bh
tg
am
be-tarask
ty
vls
co
br
frp
nrm
gcr
oc
mwl
wa
li
pfl
gd
rm
fur
pdc
tpi
ksh
fo
os
an
pi
se
bat-smg
gom
olo
gn
mg
mt
arc
test
war
roa-rup
crh
ace
tyv
gv
eml
scn
pnt
zea
lad
tt
sc
jbo
km
gan
ha
frr
ckb
pih
lfn
yo
bo
min
nov
io
ht
tn
pms
kv
ab
aa
bcl
thankyou
ug
sat
nqo
kl
xh
jv
om
zu
bi
ff
haw
lij
mai
ext
wo
tay
trv
cdo
hak
dv
ss
sm
rw
cv
chy
mi
cr
xmf
su
bug
hyw
bjn
roa-tara
kaa
diq
atj
ban
fiu

  article_dates_unique.to_hdf('data/article_dates_unique.h5', 'df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['lang', 'raw_title', 'redirected_title'], dtype='object')]

  article_dates_unique.to_hdf('data/article_dates_unique.h5', 'df', mode='w')


In [3]:
# Reload the table written by the redirect-resolution cell.
article_dates_unique = pd.read_hdf('data/article_dates_unique.h5', key='df')

In [4]:
# Resolved titles for which no page id was found at all.
article_dates_unique.loc[article_dates_unique['pageid'].isna(), 'redirected_title'].value_counts()

redirected_title
                     84
Augustów roundup     2
Name: count, dtype: int64

In [79]:
# Most frequent resolved titles whose page id came back as -1.
article_dates_unique.loc[article_dates_unique['pageid'] == -1, 'redirected_title'].value_counts().head(50)

redirected_title
Special:BookSources                                                                                      1001
Special:RecentChangesLinked                                                                               331
Special:Search                                                                                            294
File:Graham's Hierarchy of Disagreement.svg                                                               254
Attacks on the United States                                                                              155
Pol/                                                                                                      113
Special:UserLogin                                                                                          86
Special:Log                                                                                                82
Did Jesus Exist                                                                                        

In [4]:
# Keep only rows with a usable page id: drop rows whose pageid is missing or -1,
# then store the ids as integers. Filtering and converting in one expression
# avoids assigning a column onto a filtered frame (the chained-assignment
# pattern pandas warns about).
has_page = article_dates_unique['pageid'].notna() & (article_dates_unique['pageid'] != -1)
article_dates_unique = article_dates_unique.loc[has_page].astype({'pageid': int})

In [8]:
# Preview the rows of a single language edition.
lang = 'en'
lang_article_dates_unique = article_dates_unique.loc[article_dates_unique['lang'] == lang]
lang_article_dates_unique

Unnamed: 0,lang,raw_title,date,redirected_title,pageid
0,en,Unit 731,2020-01-29,Unit 731,214659
1,en,Shanghaiing,2020-09-06,Shanghaiing,686244
2,en,List of public corporations by market capitali...,2020-04-25,List of public corporations by market capitali...,14094649
3,en,Space suit,2020-07-18,Space suit,39375
4,en,Captain Ron,2020-04-27,Captain Ron,4179081
...,...,...,...,...,...
13109304,en,Chengdu J-20,2024-01-01,Chengdu J-20,30236719
13109305,en,Antisemitism is the socialism of fools,2023-12-29,Antisemitism is the socialism of fools,60879210
13109306,en,Grand Theft Auto: London 1969,2023-12-07,Grand Theft Auto: London 1969,10947703
13109310,en,Bombing of Bangkok in World War II,2023-12-23,Bombing of Bangkok in World War II,33247384


In [13]:
def _merge_date_windows(article_dates, days_before=10, days_after=11):
    """Merge per-date windows of each article into non-overlapping ranges.

    Every row of ``article_dates`` (columns ``redirected_title`` and ``date``)
    defines a window ``[date - days_before, date + days_after]``. Windows of the
    same title are merged whenever a window starts on or before the end of the
    previous one (rows ordered by date).

    Because all windows have the same length, the end of the running range is
    always the end of the previous row's window, so comparing each row with its
    predecessor is equivalent to a sequential sweep over the rows.

    Returns a DataFrame with columns ``title``, ``start_date`` and ``end_date``,
    one row per merged range, ordered by title and start date.
    """
    ordered = article_dates.sort_values(['redirected_title', 'date'])
    title = ordered['redirected_title']
    start = ordered['date'] - pd.DateOffset(days=days_before)
    end = ordered['date'] + pd.DateOffset(days=days_after)

    # A new range starts at a new title, or when this window begins after the
    # previous window has ended. (The first row compares against NaN/NaT and is
    # flagged by the title test.)
    starts_new_range = (title != title.shift()) | (start > end.shift())
    range_id = starts_new_range.cumsum().to_numpy()

    windows = pd.DataFrame({'title': title, 'start_date': start, 'end_date': end})
    return (
        windows
        .groupby(range_id, sort=False)
        .agg(title=('title', 'first'),
             start_date=('start_date', 'first'),
             end_date=('end_date', 'last'))
        .reset_index(drop=True)
    )


langs = article_dates_unique['lang'].unique()

ranges_dfs = []
for lang in langs:
    print(lang)
    lang_article_dates_unique = article_dates_unique[article_dates_unique['lang'] == lang]
    l_range_df = _merge_date_windows(lang_article_dates_unique)
    l_range_df['lang'] = lang
    ranges_dfs.append(l_range_df[['lang', 'title', 'start_date', 'end_date']])

ranges_df = pd.concat(ranges_dfs, ignore_index=True)
ranges_df.to_hdf('data/ranges_df.h5', key='df', mode='w')

en
ru999516575275909234
pt0
it0
ro0
nl0
es0
fr0
no0
de0
simple4802713005595
eu0
ja0
zh0
tr0
hi0
sl0
ar0
cs0
csb
mk0
szl
hr0
uk0
sr0
sk0
bg0
cu0
dsb
bs0
hsb
sh0
pl0
rue
be0
ml0
el0
fa0
sw0
ko0
id0
hu0
fi0
vi0
eo0
th0
sv0
ta0
lt0
sco
he0
ga0
zh-classical
hy0
da0
ur0
ast
ca0
als
et0
tl0
arz
ka0
az0
sq0
is0
la0
ms0
got
fy0
stq
lb0
sa0
rmy
sd0
bar
af0
nds
vo0
ang
yi0
pcd
cy0
kw0
lv0
ltg
pap
so0
vec
qu0
nap
si0
my0
nds-nl
zh-yue
nostalgia
ie0
tk0
bn0
nah
lmo
nn0
ku0
mr0
ba0
kk0
zh-min-nan
gl0
avk
vep
ary
ia0
hif
ps0
skr
pnb
ks0
bpy
pa0
or0
te0
kn0
gu0
jam
ceb
mn0
shi
kab
azb
uz0
ky0
mni
ne0
wuu
nv0
pam
cbk-zam
bh0
tg0
am0
be-tarask
ty0
vls
co0
br0
frp
nrm
gcr
oc0
mwl
wa0
li0
pfl
gd0
rm0
fur
pdc
tpi
ksh
fo0
os0
an0
pi0
se0
bat-smg
gom
olo
gn0
mg0
mt0
arc
test
war
roa-rup
crh
ace
tyv
gv0
eml
scn
pnt
zea
lad
tt0
sc0
jbo
km0
gan
frr
ckb
lfn
pih
yo0
min
nov
io0
ht0
tn0
pms
kv0
ab0
aa0
bcl
thankyou
ug0
bo0
sat
nqo
kl0
xh0
jv0
om0
zu0
bi0
ff0
haw
lij
mai
ext
wo0
tay
trv
cdo
hak
dv0
ss0
sm0
rw0
cv0


  ranges_df.to_hdf('data/ranges_df.h5', 'df', mode='w')


In [215]:

missing = links_df[links_df['raw_title'].isna()][['lang', 'final_url']]
for l in missing['lang'].unique():
    urls = missing[missing['lang'] == l]['final_url'].unique()
    urldicts = {u: url_parse(u) for u in urls}
    print(urldicts)
    titlemaps = {}
    revmaps = {}
    pageidmaps = {}
    for u in urls:
        if urldicts[u] is not None:
            if 'title' in urldicts[u]:
                titlemaps[u] = urldicts[u]['title'].replace('+', ' ')
            elif 'curid' in urldicts[u]:
                pageidmaps[u] = unquote(urldicts[u]['curid']).strip(string.punctuation+string.whitespace)
            elif ('oldid' in urldicts[u])&(urldicts[u].get('oldid', '') != 'prev'):
                revmaps[u] = unquote(urldicts[u]['oldid']).strip(string.punctuation+string.whitespace)
            elif 'diff' in urldicts[u]:
                revmaps[u] = unquote(urldicts[u]['diff']).strip(string.punctuation+string.whitespace)
        print(urldicts[u])

    wtsession = wt.WTSession(f'{l}.wikipedia', user_agent=my_agent)
    pagemaps = wt.PageMaps()
    print(revmaps)
    if revmaps:
        print(revmaps)
        revinfo = await wt.basic_info(wtsession, revids=list(revmaps.values()), pagemaps=pagemaps, params={'prop': 'revisions', 'rvprop': 'ids'})
    else:
        revinfo = []
    if pageidmaps:
        pageidinfo = await wt.basic_info(wtsession, pageids=list(pageidmaps.values()), pagemaps=pagemaps, params={'prop': 'revisions', 'rvprop': 'ids'})
    else:
        pageidinfo = []
    await wtsession.close()
    
    print(pageidinfo)

    revtitledict = [{z['revid']: y['title'] for z in y['revisions']} for x in revinfo for y in x]
    # combine into single dict
    revtitledict = {k: v for d in revtitledict for k, v in d.items()}
    pagetitledict = {y['pageid']: y['title'] for x in pageidinfo for y in x}
    # combine into single dict
    print(pagetitledict)
    revmaps = {k: revtitledict.get(int(v), None) for k, v in revmaps.items()}
    titlemaps = {k: pagetitledict[int(v)] for k, v in pageidmaps.items()}

    titledict = {**titlemaps, **revmaps, **titlemaps}

    urltitledict = {u: titledict.get(u, None) for u in urls}



{'title': 'Soka_Gakkai', 'oldid': '548095387'}
{'title': 'Clannad_(visual_novel)', 'redirect': 'no'}
{'title': 'Google_Stadia'}
{'title': 'Zinc_carbonate', 'redirect': 'no'}
{'title': 'Special:Contributions/2001:8003:4000:0:0:0:0:0/35'}
{'title': 'Culture_war', 'oldid': '963768354'}
{'title': 'Tax_protester', 'action': 'edit', 'section': '5'}
{'title': 'Pay2Win', 'redirect': 'no'}
{'title': 'Quit_India_speech'}
{'lang': 'en', 'modules': 'ext.cite.styles%7Cext.math.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cmediawiki.toc.styles%7Cskins.vector.styles.legacy%7Cwikibase.client.init', 'only': 'styles', 'skin': 'vector'}
{'title': 'Media_in_Ottawa%E2%80%93Gatineau', 'action': 'edit', 'section': '3'}
{'title': 'Nau_(ship)', 'redirect': 'no'}
{'action': 'query', 'titles': "'"}
{'action': 'query', 'titles': "'File:Super%20Smash%20Bros%20Melee%20box%20art.png'", 'prop': 'imageinfo', 'iiprop': 'url'}
{'action': 'query', 'prop': 'images'

CancelledError: 