In [1]:
!pip install networkx



In [2]:
%pip install -q stellargraph[demos]

Note: you may need to restart the kernel to use updated packages.


In [3]:
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter

In [14]:
import pandas as pd
import numpy as np
import random
import networkx as nx
import re

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [5]:
# convert graph to a dictionary, with node as key, and its neighbours (list) as values
graphDict = {}

# helper variables for constructing a crs matrix and stellar graph
nodes = []
neighbours = []
edges = []

# read file and process data
with open("train.txt", 'r') as f:
    for data in f:
        converted_data = data.split()
        source = converted_data[0]
        sink = converted_data[1:]
        
        graphDict[source] = sink
        for s in sink:
            nodes.append(source)
            neighbours.append(s)
            edges.append(1)

assert(type(nodes[0]) == type(nodes[-1]))
assert(type(nodes[0]) == str)

# edges = pd.DataFrame({"source": nodes, "target": neighbours})
# G_master = StellarGraph(edges=edges, is_directed=True)

In [6]:
temp = [int(i) for i in neighbours]
print(max(temp))
print(max(nodes), max(neighbours)) # find number of nodes

4867135
998989 999999


In [7]:
from scipy.sparse import csr_matrix

_max = max(temp) + 1
adj_matrix = csr_matrix((edges,(nodes,neighbours)),shape=(_max, _max))

In [8]:
sources = graphDict.keys()
neighbours = set(neighbours)
print(len(sources), len(neighbours))

sinks = [node for node in neighbours if node not in sources]
print(len(sources), len(sinks))

20000 4867136
20000 4847136


In [9]:
test_pair = []
test_nodes = set()
with open("test-public.txt", "r") as f:
    next(f)
    for line in f:
        data = line.split()
        test_pair.append((data[1], data[2]))
        test_nodes.add(data[1]) # has to add source as well, because some sources have no out edges in the train.txt
        test_nodes.add(data[2])
f.close()
print(len(test_pair), len(test_nodes))
print(test_pair[:5]) # make sure the test pair is reading correctly in correct order

2000 3948
[('3563811', '3600160'), ('2052043', '1401960'), ('4517994', '1690636'), ('1660006', '4349447'), ('581111', '1882617')]


In [10]:
_len = 0
for key, values in graphDict.items():
    _len += len(values)
print(len(graphDict.keys()), _len/len(graphDict.keys())) # find number of sources and average degree

20000 1200.21805


In [11]:
import random
random.seed(100)

# nodes that must be included in the graph
critical_nodes = test_nodes.union(sources)
print(len(critical_nodes))

# sample nodes to generate a graph network
sample_nodes = set().union(critical_nodes)
while len(sample_nodes) < 1000000:
    n = random.choice(sinks)
    sample_nodes.add(n)
print(len(sample_nodes))

21622
1000000


In [12]:
edges = []
nodes = set()
for s in sample_nodes:
    nodes.add(s)
    try:
        _neighbours = sample_nodes.intersection(set(graphDict[s]))
        for d in _neighbours:
            edges.append((s, d))
            nodes.add(d)
    except:
        continue

print(len(edges))

6382181


In [13]:
# sample positive edges
pos_examples = set()
neg_examples = set()

# 0.1 proportion
while len(pos_examples) < 600000:
    i, j = random.choice(edges)
    pos_examples.add((i, j))

nodes = list(nodes)
while len(neg_examples) < 600000:
    i = random.choice(nodes)
    j = random.choice(nodes)
    if i != j and adj_matrix[int(i), int(j)] == 0:
        neg_examples.add((i, j))
        
examples = list(pos_examples) + list(neg_examples)
labels = [1 for i in pos_examples] + [0 for i in neg_examples]

print(list(pos_examples)[0])

('4151915', '3583761')


In [18]:
# edges = pd.DataFrame({"source": nodes, "target": neighbours})
# G = StellarGraph(edges=edges, is_directed=True)

# print(G.info())

# splitter = EdgeSplitter(G, g_master=G_master)
# subG, examples, labels = splitter.train_test_split(p=0.1)

# print(subG.info())

nxG = nx.DiGraph(edges)
print(len(list(nxG.nodes)))
print(len(list(nxG.edges)))

nxG.remove_edges_from(list(pos_examples))
print(len(list(nxG.nodes)))
print(len(list(nxG.edges)))

subG = StellarGraph.from_networkx(nxG)
print(subG.info())

1000000
6382181
1000000
5782181
StellarDiGraph: Directed multigraph
 Nodes: 1000000, Edges: 5782181

 Node types:
  default: [1000000]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [5782181]
        Weights: all 1 (default)
        Features: none


In [19]:
_added = set(list(subG.nodes()))

print("checking for source...")
for s in sources:
    if str(s) not in _added:
        print(s)
print("checking for test...")
for s, t in test_pair:
    if str(s) not in _added:
        print("source: " + str(s))
    if str(t) not in _added:
        print("sink: " + str(t))
# no output means we cover all the required nodes

checking for source...
checking for test...


In [20]:
# https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/node2vec-link-prediction.html#refs
import multiprocessing

p = 0.25
q = 0.25
dimensions = 128
num_walks = 30
walk_length = 50
window_size = 10
num_iter = 1
workers = multiprocessing.cpu_count()


from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec


rw = BiasedRandomWalk(subG)
walks = rw.run(subG.nodes(), n=num_walks, length=walk_length, p=p, q=q)

KeyboardInterrupt: 

In [None]:
model = Word2Vec(
    walks,
    size=dimensions,
    window=window_size,
    min_count=0,
    sg=1,
    workers=workers,
    iter=num_iter,
)

In [None]:
x_train, x_dev, y_train, y_dev = train_test_split(examples, labels, train_size=0.75, test_size=0.25)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

def hadamard(u, v):
    return u * v

# def l1(u, v):
#     return np.abs(u - v)

# def l2(u, v):
#     return (u - v) ** 2

train_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_train]
dev_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_dev]

lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=2000)
clf = Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])
clf.fit(train_embs, y_train)

predicted = clf.predict_proba(dev_embs)
cols = list(clf.classes_).index(1)
score = roc_auc_score(y_dev, predicted[:, cols])
print(score) 

In [None]:
test_embeddings = [hadamard(model.wv[s], model.wv[t]) for s, t in test_pair]
predicted = lr_clf.predict_proba(test_embeddings)
import csv
with open("predict_yjc.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Predicted"])
    _id = 1
    for p in predicted:
        writer.writerow([_id, p[1]])
        _id += 1
f.close()