In [None]:
!pip install networkx

In [None]:
!pip install -q stellargraph[demos]

In [1]:
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter

In [2]:
import pandas as pd
import numpy as np
import random
# import networkx as nx
import re

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
# Adjacency map: source node id -> list of the node ids it points to.
graphDict = {}

# Parallel lists with one (source, target) entry per edge; they become the two
# columns of the edge DataFrame handed to StellarGraph.
nodes = []
neighbours = []

# Every line of train.txt is whitespace-separated: the first token is the
# source node and the remaining tokens are its out-neighbours.
with open("train.txt", 'r') as f:
    for line in f:
        data = line.split()
        source, sink = data[0], data[1:]

        graphDict[source] = sink
        nodes.extend([source] * len(sink))
        neighbours.extend(sink)

# Sanity checks: the lists are aligned and every node id is a plain string.
assert len(nodes) == len(neighbours)
assert {
    type(nodes[0]),
    type(nodes[-1]),
    type(neighbours[0]),
    type(neighbours[-1]),
} == {str}

# Full directed graph built from every edge in the training file.
edges = pd.DataFrame({"source": nodes, "target": neighbours})
G_master = StellarGraph(edges=edges, is_directed=True)

In [6]:
# Nodes that have an adjacency entry (i.e. appear as the first token of a line).
sources = graphDict.keys()
# Distinct nodes that appear as the target of at least one edge.
neighbours = set(neighbours)
print(len(sources), len(neighbours))

# Targets that are never a source. The list is sorted because iteration order
# of a set of strings changes between interpreter runs (string hashing is
# randomised per process); without a stable order, seeding `random` before
# drawing from `sinks` would not make the draw repeatable.
sinks = sorted(node for node in neighbours if node not in sources)
print(len(sources), len(sinks))

20000 4867136
20000 4847136


In [7]:
# Candidate edges to score, kept in file order, plus the set of every node
# that appears in them.
test_pair = []
test_nodes = set()
with open("test-public.txt", "r") as f:
    next(f)  # skip the first line (presumably a header row)
    for line in f:
        data = line.split()
        test_pair.append((data[1], data[2]))
        # The source is recorded as well as the sink, because some sources
        # have no out edges in train.txt.
        test_nodes.update(data[1:3])
# No explicit close: the `with` block has already closed the file.

# Sanity checks: both endpoints of every pair are plain strings.
assert(type(test_pair[0][0]) == type(test_pair[0][1]))
assert(type(test_pair[-1][0]) == type(test_pair[-1][1]))
assert(type(test_pair[0][0]) == type(test_pair[-1][0]))
assert(type(test_pair[0][0]) == str)

print(len(test_pair), len(test_nodes))
print(test_pair[:5]) # make sure the test pair is reading correctly in correct order

2000 3948
[('3563811', '3600160'), ('2052043', '1401960'), ('4517994', '1690636'), ('1660006', '4349447'), ('581111', '1882617')]


In [8]:
# Report the number of source nodes and their mean out-degree.
num_sources = len(graphDict.keys())
total_out_edges = sum(len(targets) for targets in graphDict.values())
print(num_sources, total_out_edges/num_sources)

20000 1200.21805


In [9]:
import random
random.seed(100)

# Every test node and every source node has to be present in the sampled graph.
critical_nodes = test_nodes.union(sources)
print(len(critical_nodes))

# Start from the mandatory nodes and keep drawing random sinks until the
# sample holds 100000 distinct nodes (duplicates are absorbed by the set).
sample_nodes = set(critical_nodes)
while True:
    if len(sample_nodes) >= 100000:
        break
    sample_nodes.add(random.choice(sinks))
print(len(sample_nodes))

21622
100000


In [10]:
# Edge list of the sub-graph induced by `sample_nodes`: an edge is kept only
# when both of its endpoints were sampled.
nodes = []
neighbours = []
for s in sample_nodes:
    # Nodes that never appear as a source in train.txt have no entry in
    # graphDict, so they contribute no outgoing edges. Checking membership
    # explicitly (rather than catching every exception) lets real errors
    # and KeyboardInterrupt propagate.
    if s not in graphDict:
        continue
    _neighbours = sample_nodes.intersection(set(graphDict[s]))
    for d in _neighbours:
        nodes.append(s)
        neighbours.append(d)

# Sanity checks: the lists are aligned and every node id is a plain string.
assert(len(nodes) == len(neighbours))
assert(type(nodes[0]) == type(nodes[-1]))
assert(type(neighbours[0]) == type(neighbours[-1]))
assert(type(nodes[0]) == type(neighbours[0]))
assert(type(nodes[0]) == str)

In [11]:
# Number of edges in the sampled sub-graph (both lists have the same length).
print(len(nodes), len(neighbours))

# Directed graph over the sampled nodes only.
edges = pd.DataFrame({"source": nodes, "target": neighbours})
G = StellarGraph(edges=edges, is_directed=True)

print(G.info())

# Split off link-prediction examples with p=0.1. Per the recorded output this
# samples equal numbers of positive and negative edges and returns a graph
# (subG) with the positive edges removed.
# NOTE(review): g_master is given the full training graph, presumably so that
# sampled negatives are not real edges of the full graph; confirm against the
# EdgeSplitter docs.
splitter = EdgeSplitter(G, g_master=G_master)
subG, examples, labels = splitter.train_test_split(p=0.1)

print(subG.info())

2297087 2297087
StellarGraph: Directed multigraph
 Nodes: 100000, Edges: 2297087

 Node types:
  default: [100000]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [2297087]
        Weights: all 1 (default)
        Features: none
** Sampled 229708 positive and 229708 negative edges. **
StellarDiGraph: Directed multigraph
 Nodes: 100000, Edges: 2067379

 Node types:
  default: [100000]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [2067379]
        Weights: all 1 (default)
        Features: none


In [12]:
# Verify that every source node and every test endpoint is a node of G.
# Nothing printed below the two headings means all required nodes are covered.
_added = set(G.nodes())

print("checking for source...")
for missing in (s for s in sources if s not in _added):
    print(missing)

print("checking for test...")
for s, t in test_pair:
    for prefix, node in (("source: ", s), ("sink: ", t)):
        if node not in _added:
            print(prefix + node)

checking for source...
checking for test...


In [13]:
# https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/node2vec-link-prediction.html#refs
import multiprocessing

# --- random-walk settings (passed to BiasedRandomWalk.run) ---
p = q = 0.25          # walk bias parameters
num_walks = 30        # passed as n=
walk_length = 80      # passed as length=

# --- Word2Vec settings ---
dimensions = 128      # embedding size
window_size = 10      # context window
num_iter = 1          # training passes over the walks
workers = multiprocessing.cpu_count()  # one worker per available CPU

from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec

# Generate the walks that serve as the Word2Vec training corpus. Walks are run
# on subG (the graph returned by the edge splitter) starting from all of its
# nodes, with n=num_walks, length=walk_length and bias parameters p and q.
rw = BiasedRandomWalk(subG)
walks = rw.run(subG.nodes(), n=num_walks, length=walk_length, p=p, q=q)

In [14]:
import inspect

# gensim 4 renamed two Word2Vec keyword arguments (size -> vector_size and
# iter -> epochs). Pick whichever spelling the installed version accepts so
# this cell runs on both gensim 3.x and gensim 4.x.
_w2v_params = inspect.signature(Word2Vec.__init__).parameters
if "vector_size" in _w2v_params:
    _w2v_kwargs = {"vector_size": dimensions, "epochs": num_iter}
else:
    _w2v_kwargs = {"size": dimensions, "iter": num_iter}

# Learn one embedding per node from the random walks.
model = Word2Vec(
    walks,
    window=window_size,
    min_count=1,  # keep every node, even ones that appear in only one walk
    sg=1,  # skip-gram
    workers=workers,
    **_w2v_kwargs,
)

In [30]:
# 75/25 train/dev split of the labelled edge examples. random_state is fixed
# (same value as the random.seed call used for node sampling) so the partition
# of `examples` is repeatable from run to run.
x_train, x_dev, y_train, y_dev = train_test_split(
    examples, labels, train_size=0.75, test_size=0.25, random_state=100
)

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

def hadamard(u, v):
    """Return the element-wise product of the two embeddings ``u`` and ``v``."""
    product = u * v
    return product

def l1(u, v):
    """Return the element-wise absolute difference ``|u - v|``."""
    difference = u - v
    return np.abs(difference)

def l2(u, v):
    """Return the element-wise squared difference ``(u - v) ** 2``."""
    difference = u - v
    return difference ** 2

# Edge features: element-wise product of the two endpoint embeddings.
train_embs = [hadamard(model.wv[src], model.wv[dst]) for src, dst in x_train]
dev_embs = [hadamard(model.wv[src], model.wv[dst]) for src, dst in x_dev]

# Standardise the features, then fit a cross-validated logistic regression
# that selects its regularisation strength by ROC AUC.
lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=2000)
clf = Pipeline([
    ("sc", StandardScaler()),
    ("clf", lr_clf),
])
clf.fit(train_embs, y_train)

# Score the dev set using the probability column of the positive class (label 1).
predicted = clf.predict_proba(dev_embs)
cols = list(clf.classes_).index(1)
positive_proba = predicted[:, cols]
score = roc_auc_score(y_dev, positive_proba)
print(score)

0.9281838047965549


In [32]:
# test_embs = [hadamard(model.wv[str(s)], model.wv[str(t)]) for s, t in test_pair]

# predicted = clf.predict_proba(test_embs)
# import csv
# with open("predict_yjc.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Id", "Predicted"])
#     _id = 1
#     for p in predicted:
#         writer.writerow([_id, p[1]])
#         _id += 1
# f.close()

In [33]:
from sklearn.neural_network import MLPClassifier

# Same standardise-then-classify pipeline as before, with a one-hidden-layer
# MLP (256 units) in place of the logistic regression.
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(256,),
    random_state=1,
    max_iter=2000,
)
clf = Pipeline([
    ("sc", StandardScaler()),
    ("clf", mlp_clf),
])
clf.fit(train_embs, y_train)

# Score the dev set using the probability column of the positive class (label 1).
predicted = clf.predict_proba(dev_embs)
cols = list(clf.classes_).index(1)
positive_proba = predicted[:, cols]
score = roc_auc_score(y_dev, positive_proba)
print(score)

0.9338383147428893


In [34]:
# test_embs = [hadamard(model.wv[str(s)], model.wv[str(t)]) for s, t in test_pair]

# predicted = clf.predict_proba(test_embs)
# import csv
# with open("predict_yjc.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Id", "Predicted"])
#     _id = 1
#     for p in predicted:
#         writer.writerow([_id, p[1]])
#         _id += 1
# f.close()

In [44]:
!pip install lightgbm
import lightgbm as lgb

# Three-way split of the labelled edge examples: 75% train; the remaining 25%
# is split again into dev (75%, monitored during boosting) and val (25%, used
# only for the final score). random_state is fixed so the partition is
# repeatable from run to run.
x_train, x_dev, y_train, y_dev = train_test_split(
    examples, labels, train_size=0.75, test_size=0.25, random_state=100
)
x_dev, x_val, y_dev, y_val = train_test_split(
    x_dev, y_dev, train_size=0.75, test_size=0.25, random_state=100
)

# Edge features: element-wise product of the two endpoint embeddings.
train_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_train]
dev_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_dev]
val_embs = [hadamard(model.wv[s], model.wv[t]) for s, t in x_val]

# Fit the scaler on the training features only, then apply it to dev and val.
sc = StandardScaler()
train_trans = sc.fit_transform(train_embs)
dev_trains = sc.transform(dev_embs)
val_trains = sc.transform(val_embs)

# prepare
d_train = lgb.Dataset(train_trans, label=y_train)
d_dev = lgb.Dataset(dev_trains, label=y_dev)
# d_val = lgb.Dataset(val_trains, label=y_val)

params = {
            'max_depth':10, # critical parameter
            'num_leaves': 800, # critical parameter, must be < 2^max_depth
            'min_data_in_leaf': 3000, # critical parameter, avoid over-fitting

            'max_bin': 1000,
            'learning_rate': 0.1, # small rate with large iteration
            'num_iterations': 1000,

            'objective': 'binary', # don't change
            'feature_fraction': 0.9, # don't change, avoid over-fitting
            'verbose': -1, # don't change
            'metric': 'auc', # don't change
}

lgb_clf = lgb.train(params, d_train, valid_sets=d_dev)

# ROC AUC measures how well the scores rank positives above negatives, so it
# has to be computed on the raw predicted probabilities. Thresholding them to
# 0/1 first collapses the ranking and under-reports the AUC, which would make
# this number incomparable with the LR/MLP scores and the training-time 'auc'.
predicted = lgb_clf.predict(val_trains)
score = roc_auc_score(y_val, predicted)
print(score)





[1]	valid_0's auc: 0.84902
[2]	valid_0's auc: 0.869348
[3]	valid_0's auc: 0.88237
[4]	valid_0's auc: 0.887851
[5]	valid_0's auc: 0.893975
[6]	valid_0's auc: 0.898889
[7]	valid_0's auc: 0.902503
[8]	valid_0's auc: 0.906108
[9]	valid_0's auc: 0.90932
[10]	valid_0's auc: 0.911834
[11]	valid_0's auc: 0.914027
[12]	valid_0's auc: 0.91702
[13]	valid_0's auc: 0.918743
[14]	valid_0's auc: 0.920748
[15]	valid_0's auc: 0.922428
[16]	valid_0's auc: 0.924314
[17]	valid_0's auc: 0.925567
[18]	valid_0's auc: 0.927183
[19]	valid_0's auc: 0.928638
[20]	valid_0's auc: 0.929793
[21]	valid_0's auc: 0.93095
[22]	valid_0's auc: 0.932033
[23]	valid_0's auc: 0.933182
[24]	valid_0's auc: 0.934142
[25]	valid_0's auc: 0.935076
[26]	valid_0's auc: 0.935975
[27]	valid_0's auc: 0.936802
[28]	valid_0's auc: 0.937655
[29]	valid_0's auc: 0.9386
[30]	valid_0's auc: 0.939385
[31]	valid_0's auc: 0.940164
[32]	valid_0's auc: 0.940936
[33]	valid_0's auc: 0.941539
[34]	valid_0's auc: 0.942181
[35]	valid_0's auc: 0.942761
[

In [46]:
import csv

# Build the features for the test pairs exactly as for training: element-wise
# product of the endpoint embeddings, scaled with the already-fitted scaler.
test_embs = [hadamard(model.wv[str(s)], model.wv[str(t)]) for s, t in test_pair]
test_trans = sc.transform(test_embs)

predicted = lgb_clf.predict(test_trans)

# newline="" is what the csv module requires for files passed to csv.writer;
# without it an extra blank row follows every record on Windows. The `with`
# block closes the file, so no explicit close() is needed.
with open("predict_yjc.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Predicted"])
    # Ids are 1-based and follow the order of the pairs in the test file.
    for _id, p in enumerate(predicted, start=1):
        writer.writerow([_id, p])
f.close()