In [1]:
import os
import pickle
import sqlite3
import string
from urllib.parse import unquote

import pandas as pd
import wikitoolkit as wt

# User-agent string passed as `user_agent` to every wt.WTSession created in this notebook.
my_agent = 'wikireddit <p.gildersleve@exeter.ac.uk>'

In [2]:
# Build the full post-link and comment-link tables.

# `in_title` records which of the two source files a post link row came from.
bodylinks = pd.read_hdf('data/bodylinks.h5', 'df').assign(in_title=False)
titlelinks = pd.read_hdf('data/titlelinks.h5', 'df').assign(in_title=True)
commentlinks = pd.read_hdf('data/commentlinks.h5', 'df')

posts = pd.read_hdf('data/posts.h5', 'df')
# Comments are stored in four numbered files; stack them under a fresh 0..n-1 index.
comment_parts = [pd.read_hdf(f'data/comments_{x}.h5') for x in range(1, 5)]
comments = pd.concat(comment_parts).reset_index(drop=True)

# Attach post/comment columns to each link row; rows without a `final_url` are discarded.
postlinks = pd.concat([bodylinks, titlelinks], ignore_index=True)
postlinks = posts.merge(postlinks, on='id', how='left').dropna(subset=['final_url'])
commentlinks = comments.merge(commentlinks, on='id', how='left').dropna(subset=['final_url'])

In [None]:
# put all unique (date, links) in one table

def _valid_links_with_dates(links, updated_col):
    """Return the rows of ``links`` flagged ``final_valid`` with day-level date columns added.

    ``created_date`` is ``created_at`` floored to the day; ``updated_date`` is the
    column named by ``updated_col`` floored to the day (the post and comment
    tables use different column names for the last-modified timestamp).
    """
    dated = links.assign(
        created_date=links['created_at'].dt.floor('D'),
        updated_date=links[updated_col].dt.floor('D'),
    )
    return dated[dated['final_valid']]


link_date_cols = ['final_url', 'created_date', 'updated_date']

postlinks = _valid_links_with_dates(postlinks, 'updated_at')
postlinks_unique = postlinks[link_date_cols].drop_duplicates().copy()
print(len(postlinks_unique))

commentlinks = _valid_links_with_dates(commentlinks, 'last_modified_at')
commentlinks_unique = commentlinks[link_date_cols].drop_duplicates().copy()
print(len(commentlinks_unique))

# drop=True: the old positional index carries no meaning after de-duplication,
# so do not leak it into the table as a stray `index` column.
all_links = (
    pd.concat([postlinks_unique, commentlinks_unique], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# functions to get article titles from urls

def url_parse(url):
    """Parse the query string of ``url`` into a dict of raw parameter values.

    Everything after the first ``?`` is treated as the query string and split
    on ``&``. Each ``key=value`` pair is split on its first ``=`` only, so
    values that themselves contain ``=`` or ``?`` are kept intact. Pairs
    without an ``=`` are ignored. Values are returned as-is (no percent- or
    plus-decoding).

    Returns ``None`` when ``url`` contains no ``?``.
    """
    if '?' not in url:
        return None
    query = url.split('?', 1)[1]
    return dict(pair.split('=', 1) for pair in query.split('&') if '=' in pair)

def _clean_numeric_id(raw_value):
    """Return ``raw_value`` decoded and stripped if it is a plain ASCII number, else ``None``."""
    cleaned = unquote(raw_value).strip(string.punctuation + string.whitespace)
    return cleaned if cleaned.isascii() and cleaned.isdigit() else None


async def resolve_ids(links_df):
    """Fill missing ``raw_title`` values from the query string of ``final_url``.

    For every row whose ``raw_title`` is NaN, the URL's query parameters are
    inspected in this order of precedence:

    * ``title``  - used directly (``+`` replaced by a space);
    * ``curid``  - a page ID, looked up through ``wt.basic_info``;
    * ``oldid`` / ``diff`` - a revision ID (first numeric one wins), looked up
      through ``wt.basic_info``.

    Non-numeric IDs (e.g. ``prev``/``next``) and IDs the API returns no title
    for leave ``raw_title`` as ``None`` instead of raising.

    ``links_df`` is modified in place and also returned. It must have
    ``lang``, ``final_url`` and ``raw_title`` columns.
    """
    missing = links_df[links_df['raw_title'].isna()][['lang', 'final_url']]
    # Rows with no language cannot be queried against any wiki, so skip NaN.
    for lang in missing['lang'].dropna().unique():
        urls = missing[missing['lang'] == lang]['final_url'].unique()

        titlemaps = {}
        revmaps = {}
        pageidmaps = {}
        for u in urls:
            query = url_parse(u)
            if query is None:
                continue
            if 'title' in query:
                titlemaps[u] = query['title'].replace('+', ' ')
            elif 'curid' in query:
                pageid = _clean_numeric_id(query['curid'])
                if pageid is not None:
                    pageidmaps[u] = pageid
            else:
                # Prefer oldid; fall back to diff when oldid is absent or is a
                # relative keyword such as 'prev' rather than a revision number.
                for key in ('oldid', 'diff'):
                    revid = _clean_numeric_id(query[key]) if key in query else None
                    if revid is not None:
                        revmaps[u] = revid
                        break

        revinfo = []
        pageidinfo = []
        if revmaps or pageidmaps:
            wtsession = wt.WTSession(f'{lang}.wikipedia', user_agent=my_agent)
            pagemaps = wt.PageMaps()
            try:
                if revmaps:
                    revinfo = await wt.basic_info(wtsession, revids=list(revmaps.values()), pagemaps=pagemaps,
                                                  params={'prop': 'revisions', 'rvprop': 'ids'})
                if pageidmaps:
                    pageidinfo = await wt.basic_info(wtsession, pageids=list(pageidmaps.values()), pagemaps=pagemaps,
                                                     params={'prop': 'revisions', 'rvprop': 'ids'})
            finally:
                # Always release the session, even if an API call raised.
                await wtsession.close()

        # NOTE(review): assumes basic_info returns an iterable of batches, each an
        # iterable of page dicts (as the original indexing implied) - confirm.
        # Pages lacking a title/revisions entry (presumably missing pages) are skipped.
        revtitledict = {
            rev['revid']: page['title']
            for batch in revinfo
            for page in batch
            if 'title' in page
            for rev in page.get('revisions', [])
        }
        pageidtitledict = {
            page['pageid']: page['title']
            for batch in pageidinfo
            for page in batch
            if 'pageid' in page and 'title' in page
        }

        # Unknown IDs map to None so a single dead link cannot abort the whole run.
        revmaps = {u: revtitledict.get(int(v)) for u, v in revmaps.items()}
        pageidmaps = {u: pageidtitledict.get(int(v)) for u, v in pageidmaps.items()}

        titledict = {**titlemaps, **revmaps, **pageidmaps}
        urltitledict = {u: titledict.get(u) for u in urls}

        needs_title = (links_df['lang'] == lang) & (links_df['raw_title'].isna())
        links_df.loc[needs_title, 'raw_title'] = links_df.loc[needs_title, 'final_url'].map(urltitledict)

    return links_df

In [19]:
# get article titles from urls

links_df = all_links.copy()
langvars = ['zh-hans', 'zh-tw', 'zh-hk', 'zh-cn', 'zh-hant', 'zh', 'sr-ec', 'sr-el', 'zh-sg', 'zh-my', 'zh-mo', 'sr'] # lang variants
links_df.loc[:, 'lang_subdomain'] = links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org')[0]
links_df.loc[:, 'lang'] = links_df['lang_subdomain'].str.split('.').str[0]
links_df.loc[:, 'mobile'] = links_df['lang_subdomain'].str.split('.').str[1] == 'm'
links_df['final_url'] = links_df['final_url'].apply(unquote)
links_df.loc[:, 'raw_title'] = links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/+wiki/+(.+)'
                                )[1].str.split('?').str[0]
links_df.loc[:, 'raw_title'] = links_df['raw_title'].fillna(
    links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/api/rest_v1/page/mobile-html/(.+)')[1].str.split('?').str[0])
links_df.loc[:, 'raw_title'] = links_df['raw_title'].fillna(
    links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/+w/index\.php\?title=([^&]+)')[1].str.split('?').str[0])

# raise
for lv in langvars:
    links_df.loc[:, 'raw_title'] = links_df['raw_title'].fillna(
        links_df['final_url'].str.extract(r'https://([\w\.-]+)\.wikipedia\.org/+%s/+([^/]+)' %lv)[1].str.split('?').str[0])

links_df = await resolve_ids(links_df)
links_df['raw_title'] = links_df['raw_title'].str.replace('_', ' ')

In [None]:
# sanity checks: for links that still have no title, tally the first path
# segment of the URL (element 3 of the '/'-split) to see which layouts are unhandled
lna = links_df[links_df['raw_title'].isna()]
unresolved_segment_counts = lna['final_url'].str.split('/').str[3].value_counts()
vc = unresolved_segment_counts.index
unresolved_segment_counts

In [None]:
# List the distinct unresolved URLs whose first path segment is '?wiki'.
is_query_wiki = lna['final_url'].str.split('/').str[3] == '?wiki'
for unresolved_url in lna.loc[is_query_wiki, 'final_url'].value_counts().index:
    print(unresolved_url)

In [None]:
# create long table of articles and dates: one row per (lang, raw_title, date),
# with `is_updated_date` telling whether the date came from `updated_date`
articles_long = (
    links_df
    .melt(id_vars=['lang', 'raw_title'], value_vars=['created_date', 'updated_date'],
          var_name='is_updated_date', value_name='date')
    .dropna(subset=['date'])
    .reset_index(drop=True)
)
articles_long['is_updated_date'] = articles_long['is_updated_date'] == 'updated_date'

article_dates_unique = articles_long[['lang', 'raw_title', 'date']].drop_duplicates().reset_index(drop=True)
# `key` is passed by keyword: positional use is deprecated in recent pandas.
article_dates_unique.to_hdf('data/article_dates_unique.h5', key='df', mode='w')
article_dates_unique

Collect ID and redirect data

In [32]:
# Reload the (lang, raw_title, date) table from disk so this section can start from a fresh kernel.
# NOTE(review): a later clean-up cell overwrites this same file with extra columns and fewer rows,
# so what is loaded here depends on how far the notebook was run previously.
article_dates_unique = pd.read_hdf('data/article_dates_unique.h5', 'df')

In [None]:
# Maps language code -> wt.PageMaps; filled by the collection loop below.
# Kept in its own cell so the loop can be re-run without discarding maps already collected.
pagemapsdict = {}

In [None]:
for lang in article_dates_unique['lang'].unique():
    print()
    print(lang)
    if lang not in pagemapsdict:
        pagemapsdict[lang] = wt.PageMaps()
    lang_articles = article_dates_unique[article_dates_unique['lang'] == lang]
    wtsession = wt.WTSession(f'{lang}.wikipedia', user_agent=my_agent)
    groupsize = 1000
    ua = list(lang_articles['raw_title'].unique())
    groups = [ua[i:i+groupsize] for i in range(0, len(ua), groupsize)]

    for n, g in enumerate(groups):
        print(f'Fixing redirects {n+1}/{len(groups)}', end='\r')
        ingroupsize = groupsize
        done = 0
        while done < len(g):
            try:
                await pagemapsdict[lang].fix_redirects(wtsession, titles=g[done:done+ingroupsize])
                done += ingroupsize
            except ValueError:
                ingroupsize = ingroupsize//2
                
                print(f'\nError, reducing group size to {ingroupsize}')

    for n, g in enumerate(groups):
        print(f'Getting wikidata ids {n+1}/{len(groups)}', end='\r')
        ingroupsize = groupsize
        done = 0
        while done < len(g):
            try:
                await pagemapsdict[lang].get_wikidata_ids(wtsession, titles=g[done:done+ingroupsize])
                done += ingroupsize
            except ValueError:
                ingroupsize = ingroupsize//2
                print(f'\nError, reducing group size to {ingroupsize}')

    for n, g in enumerate(groups):
        print(f'Getting redirects {n+1}/{len(groups)}', end='\r')
        ingroupsize = groupsize
        done = 0
        while done < len(g):
            try:
                await pagemapsdict[lang].get_redirects(wtsession, titles=g[done:done+ingroupsize])
                done += ingroupsize
            except ValueError:
                ingroupsize = ingroupsize//2
                print(f'\nError, reducing group size to {ingroupsize}')

    await wtsession.close()

# save pagemapsdict
with open('data/langpagemaps.pkl', 'wb') as f:
    pickle.dump(pagemapsdict, f)


Convert redirects table to save in db

In [13]:
# Reload the per-language page maps written by the collection cell.
# NOTE: pickle.load can execute arbitrary code - only load a file this notebook produced.
with open('data/langpagemaps.pkl', 'rb') as f:
    pagemapsdict = pickle.load(f)

In [None]:
# One row per (lang, title): page id from `id_map`, wikidata id from `wikidata_id_map`.
pageidsdf = []
for lang, pm in pagemapsdict.items():
    page_ids = pd.Series(pm.id_map).rename_axis('title').reset_index(name='pageid')
    wikidata_ids = pd.Series(pm.wikidata_id_map).rename_axis('title').reset_index(name='wikidata_id')
    # Left merge: every title in id_map is kept even when it has no wikidata id.
    df = page_ids.assign(lang=lang).merge(wikidata_ids, on='title', how='left')
    pageidsdf.append(df[['lang', 'title', 'pageid', 'wikidata_id']])

pageidsdf = pd.concat(pageidsdf, ignore_index=True)
pageidsdf

In [None]:
# One row per title per language: raw title -> normalised title -> redirect target.
resolved_redirects = []
for lang, pm in pagemapsdict.items():
    norm_df = pd.Series(pm.norm_map).rename_axis('raw_title').reset_index(name='norm_title')
    titles_redirect_df = (
        pd.Series(pm.titles_redirect_map)
        .rename_axis('norm_title')
        .reset_index(name='redirected_title')
    )

    # Outer merge keeps titles that appear in only one of the two maps.
    redirects_df = norm_df.merge(titles_redirect_df, on='norm_title', how='outer')
    # Titles absent from norm_map have no raw form recorded; use the normalised title for them.
    redirects_df['raw_title'] = redirects_df['raw_title'].fillna(redirects_df['norm_title'])
    redirects_df = redirects_df.assign(lang=lang)

    resolved_redirects.append(redirects_df[['lang', 'raw_title', 'norm_title', 'redirected_title']])

resolved_redirects = pd.concat(resolved_redirects, ignore_index=True)
resolved_redirects

In [None]:
# Flatten `collected_title_redirects` (key -> iterable of titles) into
# one (lang, canonical_title, other_title) row per pair.
collected_redirects = []
for lang, pm in pagemapsdict.items():
    redirect_pairs = [
        (canonical, other)
        for canonical, others in pm.collected_title_redirects.items()
        for other in others
    ]
    df = pd.DataFrame(redirect_pairs, columns=['canonical_title', 'other_title'])
    df['lang'] = lang
    collected_redirects.append(df[['lang', 'canonical_title', 'other_title']])

collected_redirects = pd.concat(collected_redirects, ignore_index=True)
collected_redirects

In [None]:
# save pagemaps data to sql tables (an existing table of the same name is replaced)
# NOTE(review): `conn` is left open, as before - close it once no later cell needs it.

conn = sqlite3.connect('wikireddit.db')
tables_to_save = [
    ('wiki_ids', pageidsdf),
    ('resolved_redirects', resolved_redirects),
    ('collected_redirects', collected_redirects),
]
rows_written = None
for table_name, table_df in tables_to_save:
    rows_written = table_df.to_sql(table_name, conn, if_exists='replace', index=False)
# Show the return value of the last write, which the cell displayed previously.
rows_written


In [None]:
# get storage size of database (bytes -> MiB); `os` comes from the import cell at the top
mb_size = os.path.getsize('wikireddit.db') / 1024**2
print(f"Database size: {mb_size:.2f} MB")

Process the lang link date tables to get canonical titles

In [None]:
# Resolve each raw title to its canonical (normalised, then redirect-followed)
# title and attach the page id and wikidata id of that canonical title.
for lang, lang_maps in pagemapsdict.items():
    print(lang)
    # Map only this language's rows instead of the whole column on every pass.
    in_lang = article_dates_unique['lang'] == lang
    raw_titles = article_dates_unique.loc[in_lang, 'raw_title']

    # Titles absent from a map are left unchanged at each step.
    normalised = raw_titles.map(lang_maps.norm_map).fillna(raw_titles)
    redirected = normalised.map(lang_maps.titles_redirect_map).fillna(normalised)

    article_dates_unique.loc[in_lang, 'redirected_title'] = redirected
    article_dates_unique.loc[in_lang, 'pageid'] = redirected.map(lang_maps.id_map)
    article_dates_unique.loc[in_lang, 'wikidata_id'] = redirected.map(lang_maps.wikidata_id_map)

# Several raw titles can resolve to the same article; keep one row per (lang, date, article).
article_dates_unique = article_dates_unique.drop_duplicates(subset=['lang', 'date', 'redirected_title']).reset_index(drop=True)


In [88]:
#  bug fix - unicode parsing peculiarity of api?
# Hard-code the canonical title and ids for this one article.
# NOTE(review): the match is on raw_title only, so rows in every language are patched - confirm intended.

is_augustow = article_dates_unique['raw_title'] == 'Augustów roundup'
article_dates_unique.loc[is_augustow, 'redirected_title'] = 'Augustów roundup'
article_dates_unique.loc[is_augustow, 'pageid'] = 6002747
article_dates_unique.loc[is_augustow, 'wikidata_id'] = 'Q2612443'

In [None]:
# sanity checks: which titles ended up with no page id at all...

no_pageid = article_dates_unique['pageid'].isna()
display(article_dates_unique.loc[no_pageid, 'redirected_title'].value_counts())

# ...and which got the -1 placeholder for missing pageids - these are all invalid / special pages
placeholder_pageid = article_dates_unique['pageid'] == -1
article_dates_unique.loc[placeholder_pageid, 'redirected_title'].value_counts().head(50)

In [None]:
# clean up: drop rows with no page id or the -1 placeholder, then store ids as integers.
# Done as one chain so the dtype change is never applied to a filtered slice
# (which triggers SettingWithCopyWarning).
article_dates_unique = (
    article_dates_unique
    .dropna(subset=['pageid'])
    .loc[lambda d: d['pageid'] != -1]
    .astype({'pageid': int})
)

# `key` is passed by keyword: positional use is deprecated in recent pandas.
article_dates_unique.to_hdf('data/article_dates_unique.h5', key='df', mode='w')

In [None]:
# get (overlapping) date ranges for each article
#
# Every (article, date) row opens a window from 10 days before to 11 days after
# the date. Consecutive windows of the same article are merged whenever the
# next window starts on or before the end of the previous one.

langs = article_dates_unique['lang'].unique()

ranges_dfs = []
for lang in langs:
    print(lang)

    lang_article_dates_unique = (
        article_dates_unique[article_dates_unique['lang'] == lang]
        .sort_values(['redirected_title', 'date'])
        .assign(start_date=lambda d: d['date'] - pd.DateOffset(days=10),
                end_date=lambda d: d['date'] + pd.DateOffset(days=11))
    )
    if lang_article_dates_unique.empty:
        # Nothing to merge; avoids emitting a range built from leftover values.
        continue

    # End of the previous window of the same article (NaT on an article's first row).
    prev_end = lang_article_dates_unique.groupby('redirected_title')['end_date'].shift()
    # A new range starts unless this window begins on or before the previous end;
    # comparisons with NaT are False, so every first row starts a new range.
    starts_new_range = ~(lang_article_dates_unique['start_date'] <= prev_end)
    range_id = starts_new_range.cumsum().to_numpy()

    l_range_df = (
        lang_article_dates_unique
        .groupby(range_id)
        .agg(title=('redirected_title', 'first'),
             start_date=('start_date', 'first'),
             end_date=('end_date', 'last'))
        .reset_index(drop=True)
    )
    l_range_df['lang'] = lang
    ranges_dfs.append(l_range_df[['lang', 'title', 'start_date', 'end_date']])

ranges_df = pd.concat(ranges_dfs, ignore_index=True)
# `key` is passed by keyword: positional use is deprecated in recent pandas.
ranges_df.to_hdf('data/ranges_df.h5', key='df', mode='w')

Also get lang article dates for raw, originally posted titles

In [36]:
# Reload the per-language page maps so this section can be run on its own.
# NOTE: pickle.load can execute arbitrary code - only load a file this notebook produced.
with open('data/langpagemaps.pkl', 'rb') as f:
    pagemapsdict = pickle.load(f)

In [None]:
# Normalised form of the originally posted title (no redirect following);
# titles absent from norm_map are kept as they are.
for lang, lang_maps in pagemapsdict.items():
    print(lang)
    # Map only this language's rows instead of the whole column on every pass.
    in_lang = article_dates_unique['lang'] == lang
    raw_titles = article_dates_unique.loc[in_lang, 'raw_title']
    article_dates_unique.loc[in_lang, 'norm_raw_title'] = raw_titles.map(lang_maps.norm_map).fillna(raw_titles)
article_dates_unique

In [None]:
# get (overlapping) date ranges for each article - with normalised RAW TITLES - don't bother collecting for those we've already got
#
# Every (title, date) row opens a window from 10 days before to 11 days after
# the date. Consecutive windows of the same title are merged whenever the next
# window starts on or before the end of the previous one.

# Only titles whose normalised raw form differs from the canonical title need their own ranges.
diffarts = article_dates_unique[article_dates_unique['norm_raw_title'] != article_dates_unique['redirected_title']]

langs = article_dates_unique['lang'].unique()
range_cols = ['lang', 'title', 'start_date', 'end_date']

raw_ranges_dfs = []
for lang in langs:
    print(lang)

    lang_article_dates_unique = (
        diffarts[diffarts['lang'] == lang]
        .sort_values(['norm_raw_title', 'date'])
        .assign(start_date=lambda d: d['date'] - pd.DateOffset(days=10),
                end_date=lambda d: d['date'] + pd.DateOffset(days=11))
    )
    if lang_article_dates_unique.empty:
        # A language with no differing titles contributes no ranges (previously
        # it produced a bogus row with title None and a leftover start date).
        continue

    # End of the previous window of the same title (NaT on a title's first row).
    prev_end = lang_article_dates_unique.groupby('norm_raw_title')['end_date'].shift()
    # A new range starts unless this window begins on or before the previous end;
    # comparisons with NaT are False, so every first row starts a new range.
    starts_new_range = ~(lang_article_dates_unique['start_date'] <= prev_end)
    range_id = starts_new_range.cumsum().to_numpy()

    l_range_df = (
        lang_article_dates_unique
        .groupby(range_id)
        .agg(title=('norm_raw_title', 'first'),
             start_date=('start_date', 'first'),
             end_date=('end_date', 'last'))
        .reset_index(drop=True)
    )
    l_range_df['lang'] = lang
    raw_ranges_dfs.append(l_range_df[range_cols])

if raw_ranges_dfs:
    raw_ranges_df = pd.concat(raw_ranges_dfs, ignore_index=True)
else:
    # pd.concat refuses an empty list; fall back to an empty table with the same columns.
    raw_ranges_df = pd.DataFrame(columns=range_cols)
# `key` is passed by keyword: positional use is deprecated in recent pandas.
raw_ranges_df.to_hdf('data/raw_ranges_df.h5', key='df', mode='w')