#### Notes

1.   If we are providing a solution for link prediction, we might consider using imbalanced data with more negative samples to mimic the real world scenario. 
2.   We can think of a better approach than randomly selecting edges and checking if they are +ve/-ve. We can do this by factoring in a weight related to the path between two nodes.

In [1]:
!pip install StellarGraph

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting StellarGraph
  Downloading stellargraph-1.2.1-py3-none-any.whl (435 kB)
[K     |████████████████████████████████| 435 kB 31.0 MB/s 
Installing collected packages: StellarGraph
Successfully installed StellarGraph-1.2.1


#Imports + Installations

In [2]:
# Importing Libraries
# please do go through this python notebook: 
import warnings
warnings.filterwarnings("ignore")

import csv
import pandas as pd # Pandas to create small dataframes 
import datetime # Convert to unix time
import time # Convert to unix time
# If numpy is not installed already : pip3 install numpy
import numpy as np # Do aritmetic operations on arrays
# Matplotlib: used to plot graphs
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns # Plots
from matplotlib import rcParams # Size of plots  
from sklearn.cluster import MiniBatchKMeans, KMeans # Clustering
import math
import pickle
import os
# To install xgboost: pip3 install xgboost
import xgboost as xgb

import warnings
import networkx as nx
import pdb
import pickle
from tqdm.notebook import tqdm
import os
import random
from sklearn.model_selection import train_test_split

import collections
from stellargraph.mapper import GraphWaveGenerator
from stellargraph import StellarGraph
from scipy.sparse.linalg import eigs

#Loading the data

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
os.listdir()

['.config', 'gdrive', 'sample_data']

In [6]:
os.listdir('gdrive/My Drive/Twitch Link Prediction/DE')

['musae_DE.json', 'musae_DE_edges.csv', 'musae_DE_target.csv']

In [10]:
data_path = 'gdrive/My Drive/Twitch Link Prediction/DE'

In [11]:
train = pd.read_csv(data_path+'/musae_DE_edges.csv', names = ['Source', 'Destination'])[1:]
train['Source'] = train['Source'].astype('int64')
train['Destination'] = train['Destination'].astype('int64')
train.head()

Unnamed: 0,Source,Destination
1,0,9206
2,0,7787
3,0,2145
4,0,2684
5,0,7275


In [12]:
print(type(train['Source'][1]), type(train['Destination'][1]))

<class 'numpy.int64'> <class 'numpy.int64'>


In [13]:
nodes = sorted(list(set(train['Source'].values) | set (train['Destination'].values)))
print(len(nodes))
ledger = {}
for i, node in enumerate(nodes):
  ledger[node] = i
print(len(ledger))

9498
9498


In [14]:
friends = nx.DiGraph()
for src, dst in tqdm(train.values):
  friends.add_edge(ledger[src], ledger[dst])

  0%|          | 0/153138 [00:00<?, ?it/s]

In [15]:
df = pd.DataFrame({}, {}, columns = ['src', 'dest'])  
src_list = [ledger[src] for src,dest in train.values]
dest_list = [ledger[dest] for src, dest in train.values]
df['src'] = src_list
df['dest'] = dest_list
df['link'] = 1
print(df.head())

   src  dest  link
0    0  9206     1
1    0  7787     1
2    0  2145     1
3    0  2684     1
4    0  7275     1


In [16]:
df = df.sample(frac=1)

In [17]:
node1 = set(df['src'])
node2 = set(df['dest'])
df_nodes = node1.union(node2)
print(len(df_nodes))

9498


In [18]:
friends.number_of_nodes()

9498

In [19]:
friends.number_of_edges()

153138

In [20]:
for node in friends.nodes():
  if (len(friends.out_edges(node)) + len(friends.in_edges(node))) == 0:
    print(node)

in_out_zero = []
for node in friends.nodes():
  if len(friends.out_edges(node)) == 0 or len(friends.in_edges(node)) == 0:
    in_out_zero.append(node)

len(in_out_zero)

2001

In [21]:
nodelist = sorted(list(ledger.values()))
nodes = sorted(list(friends.nodes()))
print(len(nodelist))
if nodes == nodelist:
  print("true")
else:
  print("false")

9498
true


In [22]:
friends.adj[3]

AtlasView({6522: {}, 8608: {}})

# Section 1: Data Preparation

1.1 Extracting positive samples from the graph

The procedure followed is:
* Obtain the adjacency matrix
* Traverse to find nodes that are not connected
* Store these pairs of nodes to use as negative samples during training

In [23]:
list(friends.edges())[:5]

[(0, 9206), (0, 7787), (0, 2145), (0, 2684), (0, 7275)]

In [24]:
def get_neg_samples(nodes, graph):
  """
  Input: 
    nodes: list of all nodes of the graph
    graph: the nx graph
  Output:
    neg_samples: a pd dataframe containing all pairs of disconnected nodes
  
  Computes the adjacency matrix, and returns all disconnected nodes
  """

  edges = set(graph.edges())

  discon_pairs = set([])

  while (len(discon_pairs)<150000):
    try:
      a=ledger[random.randint(0, 9498)]
      b=ledger[random.randint(0, 9498)]
    except:
      continue
    if a!=b and (a,b) not in edges and (b,a) not in edges:
        try:
            if nx.shortest_path_length(graph,source=a,target=b) > 2: 
                discon_pairs.add((a,b))
            else:
                continue  
        except:  
                discon_pairs.add((a,b))              
    else:
        continue

  # for i in tqdm(range(adj_g.shape[0]-1)):
  #   for j in range(i+1, adj_g.shape[1]):
  #     try:
  #       if nx.shortest_path_length(graph, i, j) <= 2:
  #         if adj_g[i, j] == 0:
  #           discon_pairs.append([nodes[i], nodes[j]])
  #     except nx.NetworkXNoPath:
  #       continue

  print(f"Number of disconnected nodes found = {len(discon_pairs)}")

  #converting the list into a dataframe

  neg_samples = pd.DataFrame({'src': [i[0] for i in discon_pairs], 
                              'dest': [i[1] for i in discon_pairs]})  
  neg_samples['link'] = 0

  return neg_samples

In [25]:
%%time
neg_data = get_neg_samples(nodelist, friends)

# estimated running time for this cell = 4 days
# check for possible places for optimization 

Number of disconnected nodes found = 150000
CPU times: user 2min 31s, sys: 340 ms, total: 2min 31s
Wall time: 2min 33s


In [26]:
neg_data.head()

Unnamed: 0,src,dest,link
0,4170,5671,0
1,1790,1929,0
2,6543,8693,0
3,4563,3012,0
4,754,3662,0


1.2 Obtain positive samples from the original graph

These positive samples, along with the negative samples obtained above would be used for training the classifiers. The positive samples are simply **pairs of nodes between which a link actually exists in the original graph.**

The procedure followed is: 
* Check if dropping a node pair results in splitting of the graph (increases the number of conn components)
* Check if dropping the pair results in a reduction in the number of nodes
* If both the above constraints are fulfilled, then drop the node pair (edge) - store this node pair in a list of positive samples

In [27]:
# creating a dictionary of number of times a node occurs in the dict
src = list(df['src'])
dest = list(df['dest'])

src_dict = collections.Counter(src)
dest_dict = collections.Counter(dest)

print(len(src_dict), len(dest_dict))

8650 8345


In [30]:
def get_pos_samples(df, g):
  """
  Input:
    df: the original edgelist as a pd dataframe 
    g: the original networkx graph
    assumes: 
      nodes are specified as Source and Destination
      original Graph is a Digraph
      node indexing is corrected
  Output:
    pos_samples: dataframe containing all removable edges (positive samples)
  """
  df_temp = df.copy()
  gr = g.copy(as_view=False)
  node_count = len(gr.nodes())
  print(f"original node count = {node_count}")
  
  removable_links_idx = []
  
  cc = nx.number_weakly_connected_components(gr)
  print(f"Original number of connected components = {cc}")

  for c, i in tqdm(enumerate(df.index.values)):
    src, dest, link = df.loc[i].values
    
    gr.remove_edge(src,dest)

    if(gr.degree(src) == 0):
      gr.remove_node(src)
    if(gr.degree(dest) == 0):
      gr.remove_node(dest)
  
    if len(gr.nodes()) == node_count:
      removable_links_idx.append(i)
      df_temp = df_temp.drop(index=i)
    else:
      gr.add_edge(src,dest)
    
    if c == 15500:
      print(f"removable links index = {len(removable_links_idx)}")
      break
  
  pos_samples = df.loc[removable_links_idx]
  pos_samples['link'] = 1

  nodes_new = set(df_temp['src']).union(set(df_temp['dest']))
  print(f"New number of nodes = {len(nodes_new)}")

  return pos_samples, df_temp

In [31]:
pos_samples, df_train = get_pos_samples(df, friends)
pos_samples_idx = pos_samples.index.values
print(pos_samples.shape)
pos_samples.head()

original node count = 9498
Original number of connected components = 1


0it [00:00, ?it/s]

removable links index = 15467
New number of nodes = 9498
(15467, 3)


Unnamed: 0,src,dest,link
14972,3173,7789,1
7432,257,2385,1
93848,116,8468,1
103148,4139,2025,1
67267,2546,8960,1


In [32]:
print(df_train.shape)
df_train.head()

(137671, 3)


Unnamed: 0,src,dest,link
72151,2716,4596,1
59689,2233,9423,1
78618,2916,3937,1
1040,53,1289,1
135261,6504,7787,1


In [33]:
# sanity check
train_nodes = set(df_train['src']).union(set(df_train['dest']))

In [34]:
len(df_nodes.difference(train_nodes))

0

In [35]:
train_graph = nx.from_pandas_edgelist(df_train, source='src', target='dest', create_using=nx.MultiDiGraph())

In [36]:
original_graph = nx.from_pandas_edgelist(df, source="src", target="dest", create_using=nx.MultiDiGraph)

In [37]:
cc_before = nx.number_weakly_connected_components(original_graph)
print(cc_before)

1


In [38]:
cc_after = nx.number_weakly_connected_components(train_graph)

In [39]:
print(cc_after)

3


In [40]:
neg_data.shape

(150000, 3)

In [41]:
neg_test = neg_data.sample(frac = 0.1)
print(neg_test.shape)
neg_test.head()

(15000, 3)


Unnamed: 0,src,dest,link
17885,2891,7178,0
121364,6523,3935,0
45030,940,1754,0
129430,5772,6840,0
144245,4006,6931,0


In [42]:
neg_train = neg_data.drop(neg_test.index.values, axis=0)
print(neg_train.shape)
neg_train.head()

(135000, 3)


Unnamed: 0,src,dest,link
0,4170,5671,0
1,1790,1929,0
2,6543,8693,0
3,4563,3012,0
4,754,3662,0


In [43]:
combined_tr = pd.concat([df_train, neg_train], axis = 0).sample(frac=1)
print(combined_tr.shape)
combined_tr.head()

(272671, 3)


Unnamed: 0,src,dest,link
115788,4957,9057,1
140262,7055,8106,1
146109,7744,7802,1
115014,7737,1331,0
115249,2152,7581,0


In [44]:
combined_te = pd.concat([pos_samples, neg_test], axis = 0).sample(frac=1)
print(combined_te.shape)
combined_te.head()

(30467, 3)


Unnamed: 0,src,dest,link
34503,1285,8630,1
118484,8578,6482,0
44808,538,2754,0
131958,5061,7162,0
58422,2172,3741,1


In [45]:
combined_tr.to_csv(data_path+'/de_train_91.csv')
combined_te.to_csv(data_path+'/de_test_91.csv')

In [46]:
df_train.to_csv(data_path+'/de_pos_train_91.csv')

In [47]:
combined_tr = pd.read_csv(data_path+'/de_train_91.csv', index_col = [0])
print(combined_tr.shape)
combined_tr.head()

(272671, 3)


Unnamed: 0,src,dest,link
115788,4957,9057,1
140262,7055,8106,1
146109,7744,7802,1
115014,7737,1331,0
115249,2152,7581,0


In [48]:
combined_te = pd.read_csv(data_path+'/de_test_91.csv', index_col = [0])
print(combined_te.shape)
combined_te.head()

(30467, 3)


Unnamed: 0,src,dest,link
34503,1285,8630,1
118484,8578,6482,0
44808,538,2754,0
131958,5061,7162,0
58422,2172,3741,1


In [49]:
df_train = pd.read_csv(data_path+'/de_pos_train_91.csv', index_col = [0])
print(df_train.shape)
df_train.head()

(137671, 3)


Unnamed: 0,src,dest,link
72151,2716,4596,1
59689,2233,9423,1
78618,2916,3937,1
1040,53,1289,1
135261,6504,7787,1


In [None]:
train_graph = nx.from_pandas_edgelist(df_train, source='src', target='dest', create_using=nx.MultiDiGraph)

In [None]:
scales = [5, 10, 20, 50]
sample_points = np.linspace(0, 100, 50).astype(np.float32)
degree = 20
G = StellarGraph.from_networkx(train_graph)
generator = GraphWaveGenerator(G, scales=scales, degree=degree)

In [None]:
emb = generator.flow(
    node_ids = G.nodes(), sample_points = sample_points, batch_size = 1, repeat = False
)

In [None]:
embeddings = [x.numpy() for x in tqdm(emb)]

  0%|          | 0/75879 [00:00<?, ?it/s]

In [None]:
filename = data_path + "_embeddings"

In [None]:
outfile = open(filename, "wb")
pickle.dump(embeddings, outfile)

In [None]:
print(filename)

gdrive/My Drive/Major Project_embeddings


In [None]:
file = open(filename, "rb")
gw_emb = pickle.load(file)
file.close()

EOFError: ignored

In [None]:
print(len(gw_emb))

75879


In [None]:
emb_dict = {}
for node, emb in zip(G.nodes(), gw_emb):
  emb_dict[node] = emb
print(len(emb_dict))

75879


In [None]:
# get training embeddings
t_e = []
for i, row in combined.iterrows():
  comb_emb = emb_dict[row['src']] + emb_dict[row['dest']]
  t_e.append(comb_emb)
print(len(t_e))

999165


In [None]:
t_y = combined['link']
len(t_y)

999165

In [None]:

import xgboost as xgb
