#### Notes

1.   If we are providing a solution for link prediction, we might consider using imbalanced data with more negative samples to mimic the real world scenario. 
2.   We can think of a better approach than randomly selecting edges and checking if they are +ve/-ve. We can do this by factoring in a weight related to the path between two nodes.

In [1]:
!pip install StellarGraph

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Imports + Installations

In [2]:
# Importing Libraries
# please do go through this python notebook: 
import warnings
warnings.filterwarnings("ignore")

import csv
import pandas as pd # Pandas to create small dataframes 
import datetime # Convert to unix time
import time # Convert to unix time
# If numpy is not installed already : pip3 install numpy
import numpy as np # Do aritmetic operations on arrays
# Matplotlib: used to plot graphs
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns # Plots
from matplotlib import rcParams # Size of plots  
from sklearn.cluster import MiniBatchKMeans, KMeans # Clustering
import math
import pickle
import os
# To install xgboost: pip3 install xgboost
import xgboost as xgb

import warnings
import networkx as nx
import pdb
import pickle
from tqdm.notebook import tqdm
import os
import random
from sklearn.model_selection import train_test_split

import collections
from stellargraph.mapper import GraphWaveGenerator
from stellargraph import StellarGraph
from scipy.sparse.linalg import eigs

#Loading the data

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
os.listdir()

['.config', 'gdrive', 'sample_data']

In [5]:
os.listdir('gdrive/My Drive/GitHub Link Prediction')

['musae_git_edges.csv',
 'musae_git_features.json',
 'musae_git_target.csv',
 'citing.txt',
 'README.txt']

In [6]:
data_path = 'gdrive/My Drive/GitHub Link Prediction'

In [7]:
train = pd.read_csv(data_path+'/musae_git_edges.csv', names = ['Source', 'Destination'])[1:]
train['Source'] = train['Source'].astype('int64')
train['Destination'] = train['Destination'].astype('int64')
train.head()

Unnamed: 0,Source,Destination
1,0,23977
2,1,34526
3,1,2370
4,1,14683
5,1,29982


In [8]:
print(type(train['Source'][1]), type(train['Destination'][1]))

<class 'numpy.int64'> <class 'numpy.int64'>


In [9]:
nodes = sorted(list(set(train['Source'].values) | set (train['Destination'].values)))
print(len(nodes))
ledger = {}
for i, node in enumerate(nodes):
  ledger[node] = i
print(len(ledger))

37700
37700


In [10]:
friends = nx.DiGraph()
for src, dst in tqdm(train.values):
  friends.add_edge(ledger[src], ledger[dst])

  0%|          | 0/289003 [00:00<?, ?it/s]

In [11]:
df = pd.DataFrame({}, {}, columns = ['src', 'dest'])  
src_list = [ledger[src] for src,dest in train.values]
dest_list = [ledger[dest] for src, dest in train.values]
df['src'] = src_list
df['dest'] = dest_list
df['link'] = 1
print(df.head())

   src   dest  link
0    0  23977     1
1    1  34526     1
2    1   2370     1
3    1  14683     1
4    1  29982     1


In [12]:
df = df.sample(frac=1)

In [13]:
node1 = set(df['src'])
node2 = set(df['dest'])
df_nodes = node1.union(node2)
print(len(df_nodes))

37700


In [14]:
friends.number_of_nodes()

37700

In [15]:
friends.number_of_edges()

289003

In [16]:
for node in friends.nodes():
  if (len(friends.out_edges(node)) + len(friends.in_edges(node))) == 0:
    print(node)

in_out_zero = []
for node in friends.nodes():
  if len(friends.out_edges(node)) == 0 or len(friends.in_edges(node)) == 0:
    in_out_zero.append(node)

len(in_out_zero)

14350

In [17]:
nodelist = sorted(list(ledger.values()))
nodes = sorted(list(friends.nodes()))
print(len(nodelist))
if nodes == nodelist:
  print("true")
else:
  print("false")

37700
true


In [18]:
friends.adj[3]

AtlasView({4950: {}, 18029: {}, 3358: {}, 34935: {}, 5916: {}})

# Section 1: Data Preparation

1.1 Extracting positive samples from the graph

The procedure followed is:
* Obtain the adjacency matrix
* Traverse to find nodes that are not connected
* Store these pairs of nodes to use as negative samples during training

In [25]:
list(friends.edges())[:5]

[(0, 23977), (23977, 30863), (23977, 29826), (23977, 31890), (23977, 17127)]

In [19]:
def get_neg_samples(nodes, graph):
  """
  Input: 
    nodes: list of all nodes of the graph
    graph: the nx graph
  Output:
    neg_samples: a pd dataframe containing all pairs of disconnected nodes
  
  Computes the adjacency matrix, and returns all disconnected nodes
  """

  edges = set(graph.edges())

  discon_pairs = set([])

  while (len(discon_pairs)<290000):
    try:
      a=ledger[random.randint(0, 37700)]
      b=ledger[random.randint(0, 37700)]
    except:
      continue
    if a!=b and (a,b) not in edges and (b,a) not in edges:
        try:
            if nx.shortest_path_length(graph,source=a,target=b) > 2: 
                discon_pairs.add((a,b))
            else:
                continue  
        except:  
                discon_pairs.add((a,b))              
    else:
        continue

  # for i in tqdm(range(adj_g.shape[0]-1)):
  #   for j in range(i+1, adj_g.shape[1]):
  #     try:
  #       if nx.shortest_path_length(graph, i, j) <= 2:
  #         if adj_g[i, j] == 0:
  #           discon_pairs.append([nodes[i], nodes[j]])
  #     except nx.NetworkXNoPath:
  #       continue

  print(f"Number of disconnected nodes found = {len(discon_pairs)}")

  #converting the list into a dataframe

  neg_samples = pd.DataFrame({'src': [i[0] for i in discon_pairs], 
                              'dest': [i[1] for i in discon_pairs]})  
  neg_samples['link'] = 0

  return neg_samples

In [20]:
%%time
neg_data = get_neg_samples(nodelist, friends)

# estimated running time for this cell = 4 days
# check for possible places for optimization 

Number of disconnected nodes found = 290000
CPU times: user 8min 27s, sys: 1.34 s, total: 8min 28s
Wall time: 8min 29s


In [21]:
neg_data.head()

Unnamed: 0,src,dest,link
0,27422,13452,0
1,7664,6853,0
2,30733,17240,0
3,5203,12912,0
4,2275,8202,0


1.2 Obtain positive samples from the original graph

These positive samples, along with the negative samples obtained above would be used for training the classifiers. The positive samples are simply **pairs of nodes between which a link actually exists in the original graph.**

The procedure followed is: 
* Check if dropping a node pair results in splitting of the graph (increases the number of conn components)
* Check if dropping the pair results in a reduction in the number of nodes
* If both the above constraints are fulfilled, then drop the node pair (edge) - store this node pair in a list of positive samples

In [22]:
# creating a dictionary of number of times a node occurs in the dict
src = list(df['src'])
dest = list(df['dest'])

src_dict = collections.Counter(src)
dest_dict = collections.Counter(dest)

print(len(src_dict), len(dest_dict))

30855 30195


In [23]:
def get_pos_samples(df, g):
  """
  Input:
    df: the original edgelist as a pd dataframe 
    g: the original networkx graph
    assumes: 
      nodes are specified as Source and Destination
      original Graph is a Digraph
      node indexing is corrected
  Output:
    pos_samples: dataframe containing all removable edges (positive samples)
  """
  df_temp = df.copy()
  gr = g.copy(as_view=False)
  node_count = len(gr.nodes())
  print(f"original node count = {node_count}")
  
  removable_links_idx = []
  
  cc = nx.number_weakly_connected_components(gr)
  print(f"Original number of connected components = {cc}")

  for c, i in tqdm(enumerate(df.index.values)):
    src, dest, link = df.loc[i].values
    
    gr.remove_edge(src,dest)

    if(gr.degree(src) == 0):
      gr.remove_node(src)
    if(gr.degree(dest) == 0):
      gr.remove_node(dest)
  
    if len(gr.nodes()) == node_count:
      removable_links_idx.append(i)
      df_temp = df_temp.drop(index=i)
    else:
      gr.add_edge(src,dest)
    
    if c == 34000:
      print(f"removable links index = {len(removable_links_idx)}")
      break
  
  pos_samples = df.loc[removable_links_idx]
  pos_samples['link'] = 1

  nodes_new = set(df_temp['src']).union(set(df_temp['dest']))
  print(f"New number of nodes = {len(nodes_new)}")

  return pos_samples, df_temp

In [24]:
pos_samples, df_train = get_pos_samples(df, friends)
pos_samples_idx = pos_samples.index.values
print(pos_samples.shape)
pos_samples.head()

original node count = 37700
Original number of connected components = 1


0it [00:00, ?it/s]

removable links index = 33325
New number of nodes = 37700
(33325, 3)


Unnamed: 0,src,dest,link
274292,31890,33585,1
225046,20535,2328,1
17415,1343,34611,1
31874,5310,37462,1
252468,25031,26599,1


In [25]:
print(df_train.shape)
df_train.head()

(255678, 3)


Unnamed: 0,src,dest,link
91155,6639,22064,1
32456,2040,26233,1
245270,23636,36468,1
127747,9723,17816,1
70892,5068,16240,1


In [26]:
# sanity check
train_nodes = set(df_train['src']).union(set(df_train['dest']))

In [27]:
len(df_nodes.difference(train_nodes))

0

In [28]:
train_graph = nx.from_pandas_edgelist(df_train, source='src', target='dest', create_using=nx.MultiDiGraph())

In [29]:
original_graph = nx.from_pandas_edgelist(df, source="src", target="dest", create_using=nx.MultiDiGraph)

In [30]:
cc_before = nx.number_weakly_connected_components(original_graph)
print(cc_before)

In [31]:
cc_after = nx.number_weakly_connected_components(train_graph)

In [32]:
print(cc_after)

35


In [33]:
neg_data.shape

(290000, 3)

In [34]:
neg_test = neg_data.sample(frac = 0.1)
print(neg_test.shape)
neg_test.head()

(29000, 3)


Unnamed: 0,src,dest,link
271944,8482,12944,0
11139,15724,18140,0
212090,36224,31363,0
28937,30824,10402,0
16956,1755,5230,0


In [35]:
neg_train = neg_data.drop(neg_test.index.values, axis=0)
print(neg_train.shape)
neg_train.head()

(261000, 3)


Unnamed: 0,src,dest,link
1,7664,6853,0
2,30733,17240,0
3,5203,12912,0
5,23125,20828,0
6,17411,20532,0


In [36]:
combined_tr = pd.concat([df_train, neg_train], axis = 0).sample(frac=1)
print(combined_tr.shape)
combined_tr.head()

(516678, 3)


Unnamed: 0,src,dest,link
132913,10080,28269,1
97601,7104,31695,1
127442,21330,20345,0
189136,13326,30293,0
61075,13218,37521,0


In [37]:
combined_te = pd.concat([pos_samples, neg_test], axis = 0).sample(frac=1)
print(combined_te.shape)
combined_te.head()

(62325, 3)


Unnamed: 0,src,dest,link
222440,28906,32893,0
48306,35757,23700,0
265591,26731,28777,1
132547,5482,11624,0
124965,24379,459,0


In [38]:
combined_tr.to_csv(data_path+'/gh_train_91.csv')
combined_te.to_csv(data_path+'/gh_test_91.csv')

In [39]:
df_train.to_csv(data_path+'/gh_pos_train_91.csv')

In [40]:
combined_tr = pd.read_csv(data_path+'/gh_train_91.csv', index_col = [0])
print(combined_tr.shape)
combined_tr.head()

(516678, 3)


Unnamed: 0,src,dest,link
132913,10080,28269,1
97601,7104,31695,1
127442,21330,20345,0
189136,13326,30293,0
61075,13218,37521,0


In [41]:
combined_te = pd.read_csv(data_path+'/gh_test_91.csv', index_col = [0])
print(combined_te.shape)
combined_te.head()

(62325, 3)


Unnamed: 0,src,dest,link
222440,28906,32893,0
48306,35757,23700,0
265591,26731,28777,1
132547,5482,11624,0
124965,24379,459,0


In [42]:
df_train = pd.read_csv(data_path+'/gh_pos_train_91.csv', index_col = [0])
print(df_train.shape)
df_train.head()

(255678, 3)


Unnamed: 0,src,dest,link
91155,6639,22064,1
32456,2040,26233,1
245270,23636,36468,1
127747,9723,17816,1
70892,5068,16240,1


In [None]:
train_graph = nx.from_pandas_edgelist(df_train, source='src', target='dest', create_using=nx.MultiDiGraph)

In [None]:
scales = [5, 10, 20, 50]
sample_points = np.linspace(0, 100, 50).astype(np.float32)
degree = 20
G = StellarGraph.from_networkx(train_graph)
generator = GraphWaveGenerator(G, scales=scales, degree=degree)

In [None]:
emb = generator.flow(
    node_ids = G.nodes(), sample_points = sample_points, batch_size = 1, repeat = False
)

In [None]:
embeddings = [x.numpy() for x in tqdm(emb)]

  0%|          | 0/75879 [00:00<?, ?it/s]

In [None]:
filename = data_path + "_embeddings"

In [None]:
outfile = open(filename, "wb")
pickle.dump(embeddings, outfile)

In [None]:
print(filename)

gdrive/My Drive/Major Project_embeddings


In [None]:
file = open(filename, "rb")
gw_emb = pickle.load(file)
file.close()

EOFError: ignored

In [None]:
print(len(gw_emb))

75879


In [None]:
emb_dict = {}
for node, emb in zip(G.nodes(), gw_emb):
  emb_dict[node] = emb
print(len(emb_dict))

75879


In [None]:
# get training embeddings
t_e = []
for i, row in combined.iterrows():
  comb_emb = emb_dict[row['src']] + emb_dict[row['dest']]
  t_e.append(comb_emb)
print(len(t_e))

999165


In [None]:
t_y = combined['link']
len(t_y)

999165

In [None]:

import xgboost as xgb
