In [None]:
!pip install networkx



In [None]:
%pip install -q stellargraph[demos]

In [16]:
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter

In [18]:
import pandas as pd
import numpy as np
import random
# import networkx as nx
import re

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [19]:
# convert graph to a dictionary, with node as key, and its neighbours (list) as values
graphDict = {}

# helper variables for constructing a crs matrix and stellar graph
nodes = []
neighbours = []
edges = []

# read file and process data
with open("train.txt", 'r') as f:
    for data in f:
        converted_data = list(map(int, data.split()))
        source = converted_data[0]
        sink = converted_data[1:]
        
        graphDict[source] = sink
        for s in sink:
            nodes.append(source)
            neighbours.append(s)
            edges.append(1)

In [20]:
print(max(nodes), max(neighbours)) # find number of nodes

4867036 4839471570810


In [None]:
# from scipy.sparse import csr_matrix

# _max = max(neighbours) + 1
# adj_matrix = csr_matrix((edges,(nodes,neighbours)),shape=(_max, _max))
# sink_source_matrix=csr_matrix((edges,(neighbours,nodes)),shape=(_max, _max))

In [None]:
sources = graphDict.keys()
neighbours = set(neighbours)
print(len(sources), len(neighbours))

sinks = [node for node in neighbours if node not in sources]
print(len(sources), len(sinks))

20002 4867162
20002 4847160


In [None]:
test_pair = []
test_nodes = set()
with open("test-public.txt", "r") as f:
    next(f)
    for data in f:
        line = [int(i) for i in data.split()]
        test_pair.append((line[1], line[2]))
        test_nodes.add(line[1]) # has to add source as well, because some sources have no out edges in the train.txt
        test_nodes.add(line[2])
f.close()
print(len(test_pair), len(test_nodes))
print(test_pair[:5]) # make sure the test pair is reading correctly in correct order

2000 3948
[(3563811, 3600160), (2052043, 1401960), (4517994, 1690636), (1660006, 4349447), (581111, 1882617)]


In [None]:
_len = 0
for key, values in graphDict.items():
    _len += len(values)
print(len(graphDict.keys()), _len/len(graphDict.keys())) # find number of sources and average degree

20002 1094.8781121887812


In [None]:
import random
random.seed(100)

# nodes that must be included in the graph
critical_nodes = test_nodes.union(sources)
print(len(critical_nodes))

# sample nodes to generate a graph network
sample_nodes = set().union(critical_nodes)
while len(sample_nodes) < 60000:
    n = random.choice(sinks)
    sample_nodes.add(n)
print(len(sample_nodes))

21624
60000


In [None]:
nodes = []
neighbours = []
for s in sample_nodes:
    try:
        _neighbours = sample_nodes.intersection(set(graphDict[s]))
        for d in _neighbours:
            nodes.append(str(s))
            neighbours.append(str(d))
    except:
        continue

In [13]:
edges = pd.DataFrame({"source": nodes, "target": neighbours})
G = StellarGraph(edges=edges, is_directed=True)

print(G.info())

splitter = EdgeSplitter(G)
subG, examples, labels = splitter.train_test_split(p=0.1)

print(subG.info())

StellarGraph: Directed multigraph
 Nodes: 54669, Edges: 2053007

 Node types:
  default: [54669]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [2053007]
        Weights: all 1 (default)
        Features: none
** Sampled 205300 positive and 205300 negative edges. **
StellarDiGraph: Directed multigraph
 Nodes: 54669, Edges: 1847707

 Node types:
  default: [54669]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [1847707]
        Weights: all 1 (default)
        Features: none


In [14]:
_added = set(list(G.nodes()))

print("checking for source...")
for s in sources:
    if str(s) not in _added:
        print(s)
print("checking for test...")
for s, t in test_pair:
    if str(s) not in _added:
        print("source: " + str(s))
    if str(t) not in _added:
        print("sink: " + str(t))
# no output means we cover all the required nodes

checking for source...
checking for test...
sink: 2978657
sink: 3757922
sink: 902990
sink: 3390005
sink: 725481
sink: 1020996
sink: 2072437
sink: 2931888
sink: 39384
sink: 3565042
sink: 1573721
sink: 1813598
sink: 3267961
sink: 4702544
sink: 485713
sink: 159148
sink: 2617049
sink: 924523
sink: 1434096
sink: 4702587
sink: 2297830
sink: 1646634
sink: 685354
sink: 2811559
sink: 4099761
sink: 3451393
sink: 3320763
sink: 230324
sink: 195020
sink: 4496788
sink: 3199334
sink: 3256611
sink: 3816260
sink: 1149714
sink: 3239981
sink: 4066160
sink: 563440
sink: 1239270
sink: 1674256


In [15]:
# https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/node2vec-link-prediction.html#refs
import multiprocessing

p = 1.0
q = 1.0
dimensions = 128
num_walks = 10
walk_length = 80
window_size = 10
num_iter = 1
workers = multiprocessing.cpu_count()


from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec

%time
rw = BiasedRandomWalk(subG)
walks = rw.run(subG.nodes(), n=num_walks, length=walk_length, p=p, q=q)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


KeyboardInterrupt: ignored

In [None]:
model = Word2Vec(
    walks,
    size=dimensions,
    window=window_size,
    min_count=0,
    sg=1,
    workers=workers,
    iter=num_iter,
)

In [None]:
x_train, x_dev, y_train, y_dev = train_test_split(examples, labels, train_size=0.75, test_size=0.25)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

def hadamard(u, v):
    return u * v

def l1(u, v):
    return np.abs(u - v)

def l2(u, v):
    return (u - v) ** 2

train_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_train]
dev_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_dev]
                     
lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=2000)
clf = Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])
clf.fit(train_embs, y_train)

predicted = clf.predict_proba(dev_embs)
cols = list(clf.classes_).index(1)
score = roc_auc_score(link_labels, predicted[:, positive_column])
print(score) 

In [None]:
# test_embeddings = [n2v_model[str(s)] + n2v_model[str(t)] for s, t in test_pair]
# predicted = lr_clf.predict_proba(test_embeddings)
predicted = []
import csv
with open("predict_yjc.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Predicted"])
    _id = 1
    for p in predicted:
        writer.writerow([_id, p[1]])
        _id += 1
f.close()