In [3]:
import pandas as pd
import numpy as np
import sys
import os    
import scipy
import sklearn
from sklearn import metrics
import keras
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Ask Theano to use the CUDA device. The flag is set before `import theano`
# on purpose: presumably Theano reads THEANO_FLAGS when it is first imported.
# NOTE(review): `import keras` already ran in the imports cell above. If Keras
# is configured with the Theano backend it has presumably imported theano
# there already, in which case this flag arrives too late — confirm, and if
# so set THEANO_FLAGS before the Keras import.
os.environ['THEANO_FLAGS'] = "device=cuda"
import theano

In [23]:
# Root of the vendored GEM checkout and the data directory bundled with it.
path_gem, path_data = "./libs/GEM/", "./libs/GEM/gem/data/"
# Put the GEM checkout on the import path so the `gem.*` imports resolve.
sys.path += [path_gem]

In [6]:
from gem.utils import graph_util, plot_util
from gem.evaluation import visualize_embedding as viz
from gem.evaluation import evaluate_graph_reconstruction as gr
from time import time

#from gem.embedding.gf       import GraphFactorization
#from gem.embedding.hope     import HOPE
#from gem.embedding.lap      import LaplacianEigenmaps
from gem.embedding.lle      import LocallyLinearEmbedding
from gem.embedding.sdne     import SDNE

## 1. Prepare data for embedding

In [9]:
# All links between articles inside Ukrainian Wikipedia, read from CSV.
df_uklinks = pd.read_csv(
    './data/links_in_uk.csv',
    encoding='UTF-8',
)

In [10]:
# Id correspondences between ukwiki articles and their enwiki equivalents.
uk_en_pairs = pd.read_csv(
    './data/df_uk_translated.csv.gz',
    encoding='UTF-8',
)

In [11]:
# Every enwiki link that points at a red (not yet existing) article.
red_links_full = pd.read_csv(
    './data/redlinks_with_en_ids.csv',
    encoding='UTF-8',
)

In [12]:
# Re-express the 'parent' articles of red links through their ukwiki ids:
# join the uk/en correspondences to the red-link table (id_en == id_x) and
# keep only the ukwiki id together with `id_y`.
# NOTE(review): presumably `id_x` is the enwiki parent page and `id_y` the
# red article — confirm against the CSV schema.
uk_ids_for_rlinksparents = (
    uk_en_pairs
    .merge(red_links_full, how='inner', left_on='id_en', right_on='id_x')
    [['id_uk', 'id_y']]
)

In [13]:
# Red links that have at least 5 distinct incoming links from ukwiki.
# The file has no header row, so columns are left unnamed at this point.
rlinks5more = pd.read_csv(
    './data/red_links_with_at_least_5_distinct_incoming_uk_links.csv',
    header=None,
    encoding='UTF-8',
)

In [15]:
# Restrict the parent -> red-article pairs to the red articles listed in
# `rlinks5more`, rename the columns to the (id, link_id) edge layout, and
# persist the result as CSV.
rlinks5more.columns = ['en_id_5more']  # name the single header-less column
uk_ids_for_rlinks5more = uk_ids_for_rlinksparents.merge(
    rlinks5more,
    how='inner',
    left_on='id_y',
    right_on='en_id_5more',
)
red_links_final = (
    uk_ids_for_rlinks5more[['id_uk', 'en_id_5more']]
    .rename(columns={'id_uk': 'id', 'en_id_5more': 'link_id'})
)
red_links_final.to_csv('./data/redlinks_with_parent_page_uk_id.csv', index=False)

### 1.1. Find the parts of the graph that contain red links (up to the second neighbor)

In [17]:
# Reload the parent (ukwiki id) -> red-article pairs saved in the previous step.
red_art_with_parent_uk_id = pd.read_csv(
    './data/redlinks_with_parent_page_uk_id.csv',
    encoding='UTF-8',
)

In [None]:
# Build the edge list for the graph: every outgoing link of each 'parent'
# article of a red article, plus the parent -> red-article links themselves,
# with duplicate edges removed.

# Distinct parent ids, under a dedicated column name for the join below.
unique_parents_for_redart = (
    red_art_with_parent_uk_id[['id']]
    .drop_duplicates()
    .rename(columns={'id': 'rlink_uk_par'})
)

# ukwiki links whose source article is one of those parents.
found_in_first = df_uklinks.merge(
    unique_parents_for_redart,
    how='inner',
    left_on='id',
    right_on='rlink_uk_par',
)

# Stack both edge sets, keep only the two edge columns, drop repeated edges.
df_redart_with_parent_outcoming_links = (
    pd.concat([found_in_first[['id', 'link_id']], red_art_with_parent_uk_id])
    [['id', 'link_id']]
    .drop_duplicates()
)

In [20]:
# Report how many distinct edges the graph will contain (one row per edge).
print('number of links for the grapf is', df_redart_with_parent_outcoming_links.shape[0])

number of links for the grapf is 348948


In [22]:
# Write the edge list for graph embedding: space-separated rows, with no
# header line and no index column.
df_redart_with_parent_outcoming_links.to_csv(
    './data/redart_with_parent_outcoming_links.txt',
    sep=' ',
    index=False,
    header=None,
)

## 2. Embed graph with GEM library

In [24]:
# File that contains the edges. Format: source target
# Point at the edge list written by the data-preparation step above
# (./data/redart_with_parent_outcoming_links.txt). Looking it up inside GEM's
# bundled data directory, without the ".txt" suffix, only works after a manual
# copy/rename and breaks a clean Restart & Run All.
edge_f = './data/redart_with_parent_outcoming_links.txt'
# Specify whether the edges are directed
isDirected = True

In [25]:
# Read the edge list into a graph with GEM's loader, then take its directed form.
G = graph_util.loadGraphFromEdgeListTxt(edge_f, directed=isDirected).to_directed()

In [29]:
# Embedding methods to run. Comment out the entries you don't want.
models = [
    #LocallyLinearEmbedding(d=2),
    SDNE(
        d=3, beta=5, alpha=1e-5, nu1=1e-6, nu2=1e-6, K=3,
        n_units=[50, 15], rho=0.3, n_iter=1, xeta=0.02, n_batch=500,
        modelfile=[
            './results/intermediate/enc_model.json',
            './results/intermediate/dec_model.json',
        ],
        weightfile=[
            './results/intermediate/enc_weights.hdf5',
            './results/intermediate/dec_weights.hdf5',
        ],
    ),
]

# Train each configured method on G and report wall-clock training time.
# `Y` keeps the embedding returned by the last method that ran.
for embedding in models:
    print('Num nodes: %d, num edges: %d' % (G.number_of_nodes(), G.number_of_edges()))
    t1 = time()
    # learn_embedding accepts a networkx graph or a file with an edge list;
    # here the in-memory graph is passed and edge_f is left unset.
    Y, t = embedding.learn_embedding(
        graph=G,
        edge_f=None,
        is_weighted=False,
        no_python=True,
    )
    print(embedding._method_name + ':\n\tTraining time: %f' % (time() - t1))
    # Visualisation is disabled:
    #viz.plot_embedding2D(embedding.get_embedding(), di_graph=G, node_colors=None)
    #plt.show()

Num nodes: 99629, num edges: 348948


  name=name)


Epoch 1/1
sdne:
	Training time: 2101.314101


## 3. Test embedding for finding red articles correspondences

### Warning: the data is too big to process in memory

In [30]:
# Persist the learnt embedding to disk so the notebook can be restarted
# afterwards to free memory.
np.save('./data/learnt_emb.npy', arr=Y)

In [None]:
# Load the embedding back after the kernel restart. The path must be the one
# the np.save() cell above writes to (./data/learnt_emb.npy); a bare
# 'learnt_emb.npy' would be looked up in the working directory instead.
Y = np.load('./data/learnt_emb.npy')

In [None]:
# Pairwise cosine similarity between all embedding rows, as the basis for a
# nearest-neighbour lookup.
# NOTE: the result is very large — saving it is reported below to produce a
# 39 GB file, and further processing runs out of memory.
Y_cos_sim = metrics.pairwise.cosine_similarity(X=Y)

In [None]:
# Saving the cosine-similarity matrix is disabled: it results in a 39 GB file.
#np.save('Y_cos_sim.npy', Y_cos_sim)
# Show the dimensions of the similarity matrix.
np.shape(Y_cos_sim)

In [None]:
# process the matrix (subtract 2 from every diagonal entry). !raises MemoryError
# a = Y_cos_sim - 2*np.eye(99629)

## Conclusion: these experiments show that a new way of processing the data has to be found. Further research into the nature of the data, and into how well graph embeddings suit it, is needed.