In [2]:
from igraph import *
import pandas as pd
import numpy as np
import time
import os
import re
import progressbar

### Load uk_wiki

In [254]:
#path to data
PATH_TO_DATA = '../data/'
PATH_TO_DATA_UK = PATH_TO_DATA+"ukwiki/"
PATH_TO_DATA_EN = PATH_TO_DATA+"enwiki/"

In [255]:
# get list of archives to unpack for uk wiki
UKWIKI_ART_FNMS = []
for file in os.listdir(PATH_TO_DATA_UK):
    if re.match(r"ukwiki-20180620-pages-meta-current\d{2}-p\d+p\d+.xml_art.csv.gz", file):
        UKWIKI_ART_FNMS.append(file) 
        
UK_ID_NAME = "ukwiki-20180620-id_name.csv.gz"

In [256]:
def unpack(file_name):
    file_name_new = file_name.replace(".gz","")
    with gzip.open(file_name, 'rb') as f_in, open(file_name_new, 'wb') as f_out:
        f_out.writelines(f_in)
    return file_name_new

In [257]:
start_time = time.time()
# loading all articles to dataframe
df_uk = pd.DataFrame()
n_files = len(UKWIKI_ART_FNMS)
for index in range(n_files):
    print(str(index+1) + '/' + str(n_files))
    fn = UKWIKI_ART_FNMS[index]
    fn = PATH_TO_DATA_UK+fn
    print(fn)
    fn_new = unpack(fn)
    df_articles = pd.read_csv(fn_new, encoding='UTF-8', quotechar="\"")  
    df_articles = df_articles[df_articles['is_red_link']==False][['id','link_id']]
    df_uk = pd.concat((df_uk, df_articles))
    print("df_articles size: {}. df_articles columns: {}\n".format(df_articles.shape, list(df_articles.columns)))
    os.remove(fn_new)
    
print('Total time: %.1f minutes' % ((time.time() - start_time)/60))

1/8
../data/ukwiki/ukwiki-20180620-pages-meta-current02-p5503943p11007884.xml_art.csv.gz
df_articles size: (4820555, 2). df_articles columns: ['id', 'link_id']

2/8
../data/ukwiki/ukwiki-20180620-pages-meta-current02-p5501931p11003859.xml_art.csv.gz
df_articles size: (4819664, 2). df_articles columns: ['id', 'link_id']

3/8
../data/ukwiki/ukwiki-20180620-pages-meta-current03-p11003860p16505788.xml_art.csv.gz
df_articles size: (4513547, 2). df_articles columns: ['id', 'link_id']

4/8
../data/ukwiki/ukwiki-20180620-pages-meta-current04-p16505789p22007717.xml_art.csv.gz
df_articles size: (4878606, 2). df_articles columns: ['id', 'link_id']

5/8
../data/ukwiki/ukwiki-20180620-pages-meta-current01-p1p5503942.xml_art.csv.gz
df_articles size: (5005870, 2). df_articles columns: ['id', 'link_id']

6/8
../data/ukwiki/ukwiki-20180620-pages-meta-current01-p1p5501930.xml_art.csv.gz
df_articles size: (5005711, 2). df_articles columns: ['id', 'link_id']

7/8
../data/ukwiki/ukwiki-20180620-pages-meta-

In [258]:
print(df_uk.shape)
print(df_uk.columns)

(38426179, 2)
Index(['id', 'link_id'], dtype='object')


In [259]:
# article names
df_uk_name = pd.read_csv(PATH_TO_DATA_UK+UK_ID_NAME,  encoding='UTF-8', quotechar="\"")
df_uk_name = df_uk_name[['id','title']]
print(df_uk_name.shape)
print(df_uk_name.columns)

(796714, 2)
Index(['id', 'title'], dtype='object')


### Load en_wiki

In [20]:
# get list of archives to unpack for en wiki
ENWIKI_ART_FNMS = []
for file in os.listdir(PATH_TO_DATA_EN):
    if re.match(r"enwiki-20180620-pages-meta-current\d{2}-p\d+p\d+.xml_art.csv.gz", file):
        ENWIKI_ART_FNMS.append(file)

EN_ID_NAME = "enwiki-20180620-id_name.csv.gz"

In [22]:
start_time = time.time()

# loading existing articles and red links to dataframes
df_en_blue = pd.DataFrame()
df_en_red = pd.DataFrame()
n_files = len(ENWIKI_ART_FNMS)
for index in range(n_files):
    print(str(index+1) + '/' + str(n_files))
    fn = ENWIKI_ART_FNMS[index]
    fn = PATH_TO_DATA_EN+fn
    print(fn)
    fn_new = unpack(fn)
    df_articles = pd.read_csv(fn_new, encoding='ISO-8859-1',quotechar="'", usecols=[0,1,4,6])
    
    df_articles_blue = df_articles[df_articles['is_red_link']==False]
    df_en_blue = pd.concat((df_en_blue, df_articles_blue[['id','link_id']]))
    df_articles_red = df_articles[df_articles['is_red_link']]
    df_en_red = pd.concat((df_en_red, df_articles_red[['id','link_val']]))
    
    print("df_articles size: {}. df_articles columns: {}\n".format(df_articles.shape, list(df_articles.columns)))
    os.remove(fn_new)

print('Total time: %.1f minutes' % ((time.time() - start_time)/60))

1/55
../data/enwiki/enwiki-20180620-pages-meta-current26-p39567203p41067203.xml_art.csv.gz
df_articles size: (3684353, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

2/55
../data/enwiki/enwiki-20180620-pages-meta-current19-p16120543p17620543.xml_art.csv.gz
df_articles size: (3284004, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

3/55
../data/enwiki/enwiki-20180620-pages-meta-current27-p57663462p57726175.xml_art.csv.gz
df_articles size: (89311, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

4/55
../data/enwiki/enwiki-20180620-pages-meta-current14-p7697598p7744799.xml_art.csv.gz
df_articles size: (143742, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

5/55
../data/enwiki/enwiki-20180620-pages-meta-current22-p25427984p26823660.xml_art.csv.gz
df_articles size: (3288047, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

6/55
../data/enwiki/enwiki-20180620-pages-meta-current2

df_articles size: (5168413, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

45/55
../data/enwiki/enwiki-20180620-pages-meta-current20-p20254736p21222156.xml_art.csv.gz
df_articles size: (2270783, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

46/55
../data/enwiki/enwiki-20180620-pages-meta-current13-p5040438p6197594.xml_art.csv.gz
df_articles size: (5139668, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

47/55
../data/enwiki/enwiki-20180620-pages-meta-current15-p7744803p9244803.xml_art.csv.gz
df_articles size: (4598605, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

48/55
../data/enwiki/enwiki-20180620-pages-meta-current26-p42567203p42663461.xml_art.csv.gz
df_articles size: (212304, 4). df_articles columns: ['id', 'link_id', 'link_val', 'is_red_link']

49/55
../data/enwiki/enwiki-20180620-pages-meta-current22-p23927984p25427984.xml_art.csv.gz
df_articles size: (3875227, 4). df_articles column

In [23]:
print(df_en_blue.shape)
print(df_en_blue.columns)

print(df_en_red.shape)
print(df_en_red.columns)

(168384829, 2)
Index(['id', 'link_id'], dtype='object')
(9692719, 2)
Index(['id', 'link_val'], dtype='object')


In [24]:
# article names
df_en_name = pd.read_csv(PATH_TO_DATA_EN+EN_ID_NAME, encoding='ISO-8859-1', 
                         quotechar="'", escapechar ="\\", usecols = [0,1])

print(df_en_name.shape)
print(df_en_name.columns)

(5669865, 2)
Index(['id', 'title'], dtype='object')


In [25]:
# add ids for read links

# find maximum id among blue links
max_id = np.max(df_en_name['id'])

# find unique titles among red links, create df with
red_links = df_en_red['link_val'].unique()
red_links_ids = pd.DataFrame({'title': red_links, 
                              'id': np.arange(max_id+1, len(red_links)+max_id+1)})
red_links_ids['is_red_link'] = True

# create dataframe with titles and red link tickets / add it to article titles df
df_en_name['is_red_link'] = False
df_en_name = pd.concat((df_en_name,red_links_ids))
df_en_name = df_en_name.reset_index(drop=True)

print(df_en_name.shape)
print(df_en_name.columns)


(11480803, 3)
Index(['id', 'title', 'is_red_link'], dtype='object')


In [26]:
# add link ids to df_en_red
df_en_red = df_en_red.merge(right = red_links_ids, left_on = 'link_val', right_on = 'title', how = 'left')
df_en_red = df_en_red[['id_x','id_y']]
df_en_red.columns = ['id', 'link_id']

# concatenate red and blue article to one df
df_en = pd.concat((df_en_red, df_en_blue))

In [27]:
print(df_en.shape)
print(df_en_blue.shape[0] + df_en_red.shape[0])

(178077548, 2)
178077548


In [28]:
del red_links, red_links_ids, df_en_red, df_en_blue

### Load uk-en correpondences

In [None]:
fn = PATH_TO_DATA+'link/20180620-langlinks_uk_en.csv.gz'
df_link = pd.read_csv(fn,  encoding='UTF-8', quotechar='\'')
print(df_link.shape)
print(df_link.columns)

In [None]:
# remove values links to negative values
df_link = df_link[(df_link['id_en']>0) & (df_link['id_en']!=49244)]
print(df_link.shape)

In [None]:
# add en correspondences to uk article names
df_uk_name = df_uk_name.merge(right = df_link, left_on = 'id', right_on = 'id_uk', how = 'left')
df_uk_name = df_uk_name[['id','title', 'id_en']]

In [164]:
# several en articles have multiple corresponding uk acticles, just remove oldest one uk article
df_uk_translated = (df_uk_name[~df_uk_name['id_en'].isnull()])[['id', 'id_en']]
df_uk_translated.columns = ['id_uk','id_en']
df_uk_translated = df_uk_translated.sort_values(by = 'id_uk')
df_uk_translated = df_uk_translated.drop_duplicates(keep = 'first', subset = ['id_en'])

# add uk correspondences to en article names
df_en_name = df_en_name.merge(right = df_uk_translated, left_on = 'id', right_on = 'id_en', how = 'left')
df_en_name = df_en_name[['id','title','is_red_link','id_uk']]
df_en_name = df_en_name.sort_values(by = 'id')

In [176]:
df_uk_name.head()

Unnamed: 0,id,title,id_en
0,3,Головна сторінка,
1,13,Географія,18963910.0
2,584,Атом,902.0
3,585,Мільярд,1136363.0
4,586,Ядро,


In [181]:
df_en_name.head()

Unnamed: 0,id,title,is_red_link,id_uk
0,12,Anarchism,False,12101.0
1,25,Autism,False,37656.0
2,39,Albedo,False,10899.0
3,290,A,False,235422.0
4,303,Alabama,False,6320.0


In [182]:
# save names to files
df_uk_name.to_csv('uk_names.csv.gz', compression='gzip', header=True, index=False)
df_en_name.to_csv('en_names.csv.gz', compression='gzip', header=True, index=False)

In [261]:
# load names 
df_uk_name = pd.read_csv('uk_names.csv.gz')
df_en_name = pd.read_csv('en_names.csv.gz')

### Encode uk nontranslated article by its incoming links

In [262]:
# select non-translated articles
uk_nontranslated = np.array(df_uk_name[df_uk_name['id_en'].isnull()]['id'])
print('Nontranslated uk acticles: %6d' % (len(uk_nontranslated)))

# select non-translated acticles that have at least 5 distinct incoming links
uk_nontranslated = df_uk[df_uk['link_id'].isin(uk_nontranslated)].groupby('link_id') \
            .agg({'id': lambda x: x.nunique()})
uk_nontranslated = uk_nontranslated.reset_index()
uk_nontranslated.columns = ['id','n_incoming']
uk_nontranslated = uk_nontranslated[uk_nontranslated['n_incoming']>=5]
uk_nontranslated = np.array(uk_nontranslated['id'])
uk_nontranslated_ids = np.sort(uk_nontranslated)
print('Nontranslated uk acticles with more at least 5 incoming links: %6d' % (len(uk_nontranslated)))

Nontranslated uk acticles: 355796
Nontranslated uk acticles with more at least 10 incoming links:  82122


In [263]:
# encode each nontranslated article by its incoming links
uk_nontranslated_encoding_df = df_uk[df_uk['link_id'].isin(uk_nontranslated)]
uk_nontranslated_encoding_df = uk_nontranslated_encoding_df.sort_values(by = ['link_id','id'])
uk_nontranslated_encoding_df = uk_nontranslated_encoding_df.reset_index(drop = True)
indices = np.array(uk_nontranslated_encoding_df.drop_duplicates(keep='first', subset=['link_id']).index)

uk_nontranslated_encoding = []
pbar = progressbar.ProgressBar()
for i in pbar(range(0, len(indices))):
    if i == len(indices) - 1:
        this_encoding = set(uk_nontranslated_encoding_df.iloc[indices[i]:]['id'])
    else: 
        this_encoding = set(uk_nontranslated_encoding_df.iloc[indices[i]:indices[i+1]]['id'])
    uk_nontranslated_encoding.append(this_encoding)
    
del uk_nontranslated_encoding_df, uk_nontranslated

100% (82122 of 82122) |##################| Elapsed Time: 0:00:15 Time:  0:00:15


In [264]:
# save encoding to file
np.save(file='uk_nontranslated_encoding.npy', arr=uk_nontranslated_encoding)
np.save(file='uk_nontranslated_ids.npy', arr=uk_nontranslated_ids)

In [4]:
# load encoding from file
uk_nontranslated_encoding = np.load('uk_nontranslated_encoding.npy')
uk_nontranslated_ids = np.load('uk_nontranslated_ids.npy')

### Encode red links incoming uk links

In [36]:
# select only red links from en_wiki
df_en_red = df_en[df_en['link_id']>max_id]
en_red_ids = df_en_red['link_id'].unique()
print('Number of red links in en wiki: %d' %(len(en_red_ids)))

# for every red link find its correponding uk incoming link (only distinct)
df_en_red_encoding = df_en_red.merge(right=df_en_name, left_on='id', 
                                         right_on='id', how='left')[['id_uk','link_id']]
df_en_red_encoding = df_en_red_encoding[~df_en_red_encoding['id_uk'].isnull()]
df_en_red_encoding = df_en_red_encoding.drop_duplicates()

# for every red link calculate number of incoming links
df_en_red_incoming = df_en_red_encoding['link_id'].value_counts().reset_index()
df_en_red_incoming.columns = ['id','n_incoming']

# select only those red links that can be encoded with at least 5 uk incoming links
en_red_ids = np.array(df_en_red_incoming[df_en_red_incoming['n_incoming']>=5]['id'])
print('Number of red links with at least 5 distinct incoming uk links: %d' % (len(en_red_ids)))

Number of red links in en wiki: 5810938
Number of red links with at least 5 distinct incoming uk links: 3593


In [37]:
# encode top red links using incoming uk links
en_red_ids = np.sort(en_red_ids)
df_en_red_encoding = df_en_red_encoding[df_en_red_encoding['link_id'].isin(en_red_ids)]
df_en_red_encoding = df_en_red_encoding.sort_values(by = 'link_id').reset_index(drop = True)
indices = np.array(df_en_red_encoding.drop_duplicates(keep='first', subset=['link_id']).index)

en_red_encoding = []
pbar = progressbar.ProgressBar()
for i in pbar(range(0, len(indices))):
    if i == len(indices)-1:
        this_encoding = set(df_en_red_encoding.iloc[indices[i]:]['id_uk'])
    else:
        this_encoding = set(df_en_red_encoding.iloc[indices[i]:indices[i+1]]['id_uk'])
    en_red_encoding.append(this_encoding)

100% (3593 of 3593) |####################| Elapsed Time: 0:00:06 Time:  0:00:06


In [245]:
del df_en_red_incoming, df_en_red_encoding

In [238]:
# save encoding to file
np.save(file='en_red_ids.npy', arr=en_red_ids)
np.save(file='en_red_encoding.npy', arr=en_red_encoding)

In [5]:
# load encoding from file
en_red_ids = np.load('en_red_ids.npy')
en_red_encoding = np.load('en_red_encoding.npy')

In [49]:
# create df with incoming en ids and names for every red link
incoming_en_link_names = []
n_incoming_en = []
incoming_uk_link_names = []
n_incoming_uk = []

pbar = progressbar.ProgressBar()
for i in pbar(range(len(en_red_ids))):
    red_id = en_red_ids[i]
    incoming_ids = list(df_en_red[df_en_red['link_id'] == red_id]['id'])
    incoming_names = set(df_en_name[df_en_name['id'].isin(incoming_ids)]['title'])
    incoming_uk_names = set(df_uk_name[df_uk_name['id'].isin(en_red_encoding[i])]['title'])
              
    incoming_en_link_names.append(incoming_names)
    n_incoming_en.append(len(incoming_names))
    incoming_uk_link_names.append(incoming_uk_names) 
    n_incoming_uk.append(len(incoming_uk_names))
    
df_red_links = pd.DataFrame({'red_link_id': en_red_ids, 
                          'red_link_name': list(df_en_name[df_en_name['id'].isin(en_red_ids)]['title']),
                          'incoming_en_names': incoming_en_link_names,
                          'n_incoming_en': n_incoming_en,
                          'incoming_uk_names': incoming_uk_link_names,
                          'n_incoming_uk': n_incoming_uk
                          })
              
df_red_links = df_red_links[['red_link_id','red_link_name','incoming_en_names',
                             'n_incoming_en', 'incoming_uk_names', 'n_incoming_uk']]

100% (3593 of 3593) |####################| Elapsed Time: 0:16:12 Time:  0:16:12


In [53]:
print(df_red_links.shape)
df_red_links.head()

(3593, 6)


Unnamed: 0,red_link_id,red_link_name,incoming_en_names,n_incoming_en,incoming_uk_names,n_incoming_uk
0,57726536,Cleonini,"{Pachycerus, Rhabdorrhynchus, Liocleonus clath...",7,"{Pachycerus, Rhabdorrhynchus, Cyphocleonus, Li...",5
1,57726733,West Coast Mafia Records,"{The Final Chapter (C-Bo album), Gas Chamber (...",24,"{Life as a Rider, Cashville Records, West Coas...",18
2,57726734,Killa Tay,"{JT the Bigga Figga, C-Bo, E.D.I. Mean, A Mill...",35,"{Life as a Rider, Дискографія Yukmouth, Blow (...",16
3,57726809,Denver Film Critics Society,"{The Croods, The Social Network (soundtrack), ...",24,"{Гіліян Флінн, Сімейка Крудсів, Б'ютифул, Сієн...",11
4,57727161,Wim Smet,"{2011â12 Oud-Heverlee Leuven season, 2012â...",13,"{Кубок Бельгії з футболу 2016—2017, Кубок Бель...",6


In [50]:
# save red links incoming links to files
df_red_links.to_csv('red_links_summary.csv.gz', compression='gzip', header=True, index=False)

In [6]:
# load 
df_red_links = pd.read_csv('red_links_summary.csv.gz')

### Find most similar uk articles for red links

In [265]:
# jaccard similarity
def jaccard(a, b):
    c = a.intersection(b)
    return len(c) / (len(a) + len(b) - len(c))

In [268]:
uk_article_list = []
uk_article_found = []

pbar = progressbar.ProgressBar()
for j in pbar(range(len(en_red_ids))):
    red_link = en_red_ids[j]
    red_link_encoding = en_red_encoding[j]
    similarities = []
    for i in range(len(uk_nontranslated_ids)):
        similarity = jaccard(red_link_encoding, uk_nontranslated_encoding[i])
        similarities.append(similarity)
    
    # select top 5 most similar uk article, zero similarities excluded
    similarities = np.array(similarities)
    indices = np.argsort(-similarities)[:5]
    score = np.round(similarities[indices],3)
    n_nonzero = np.sum(score>0)
    indices = np.sort(indices[:n_nonzero])

    # scores and article names sorted by article ids
    score = np.round(similarities[indices],3)
    uk_article_ids = uk_nontranslated_ids[indices]
    uk_article_names = list(df_uk_name[df_uk_name['id'].isin(uk_article_ids)]['title'])
    
    # sort by jaccard similarity
    d = dict(zip(uk_article_names, score))
    d_sorted = sorted(((value, key) for (key,value) in d.items()), reverse=True)
    
    # save to list
    uk_article_list.append(d_sorted)
    
    if len(uk_article_names)>0:
        uk_article_found.append(True)
    else:
        uk_article_found.append(False)

        
# create df to show results
df_similarities = pd.DataFrame({'red_link_id': en_red_ids,
                   'uk_similar_found': uk_article_found, 
                   'uk_similar': uk_article_list
                  })
    
    

100% (3593 of 3593) |####################| Elapsed Time: 0:04:32 Time:  0:04:32


In [269]:
df_red_results = df_red_links.merge(right=df_similarities, right_on = 'red_link_id',
                                     left_on = 'red_link_id', how = 'left')

In [270]:
df_red_results.head()

Unnamed: 0,red_link_id,red_link_name,incoming_en_names,n_incoming_en,incoming_uk_names,n_incoming_uk,uk_similar,uk_similar_found
0,57726536,Cleonini,"{Pachycerus, Rhabdorrhynchus, Liocleonus clath...",7,"{Pachycerus, Rhabdorrhynchus, Cyphocleonus, Li...",5,"[(0.067, Тер-Мінасян Маргарита Єрвандівна), (0...",True
1,57726733,West Coast Mafia Records,"{The Final Chapter (C-Bo album), Gas Chamber (...",24,"{Life as a Rider, Cashville Records, West Coas...",18,"[(0.842, West Coast Mafia Records), (0.091, Пе...",True
2,57726734,Killa Tay,"{JT the Bigga Figga, C-Bo, E.D.I. Mean, A Mill...",35,"{Life as a Rider, Дискографія Yukmouth, Blow (...",16,"[(0.375, West Coast Mafia Records), (0.007, РЕ...",True
3,57726809,Denver Film Critics Society,"{The Croods, The Social Network (soundtrack), ...",24,"{Гіліян Флінн, Сімейка Крудсів, Б'ютифул, Сієн...",11,"[(0.043, Задніпровський Назар Олександрович), ...",True
4,57727161,Wim Smet,"{2011â12 Oud-Heverlee Leuven season, 2012â...",13,"{Кубок Бельгії з футболу 2016—2017, Кубок Бель...",6,"[(0.091, Павел Рачковський)]",True


In [271]:
df_red_results['uk_similar_found'].value_counts()

True     2467
False    1126
Name: uk_similar_found, dtype: int64

In [281]:
# save results to file
df_red_results.to_csv('red_links_results.csv.gz', compression='gzip', 
                      header=True, index=False, encoding = 'UTF-16')

In [None]:
# load 
df_red_results = pd.read_csv('red_links_results.csv.gz', encoding = 'UTF-16')