In [1]:
!pip install stellargraph

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stellargraph
  Downloading stellargraph-1.2.1-py3-none-any.whl (435 kB)
[K     |████████████████████████████████| 435 kB 25.6 MB/s 
Installing collected packages: stellargraph
Successfully installed stellargraph-1.2.1


In [18]:
import warnings
warnings.filterwarnings("ignore")
import csv
import pandas as pd 
import numpy as np
import datetime 
import time 
import numpy as np 
import matplotlib
import matplotlib.pylab as plt
import seaborn as sns 
from matplotlib import rcParams  
import math
import pickle
import os
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import networkx as nx
import pdb
import pickle
from tqdm.notebook import tqdm
import os
import random
from sklearn.model_selection import train_test_split

#Algo imports 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from stellargraph.core import StellarGraph
from stellargraph.core import StellarDiGraph
from stellargraph.losses import graph_log_likelihood
from tensorflow.keras import optimizers, Model, layers, regularizers
from stellargraph.data import EdgeSplitter, UniformRandomWalk, UnsupervisedSampler
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Colab-only: mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Base folder for the data files read and written below (note the trailing "/").
# NOTE(review): this is a relative path, so it only points into the Drive mounted at
# /content/gdrive when the working directory is /content — confirm.
data_path = "gdrive/My Drive/Major Project/"

In [7]:
# Edge list (src, dest, link) used to build the training graph; the first CSV
# column is the row index.
pos_train_csv = f"{data_path}pos_train_91.csv"
df_train = pd.read_csv(pos_train_csv, index_col=0)
print(df_train.shape)
print(df_train.head())

(452474, 3)
          src   dest  link
496205  54500  16485     1
141147   3318  30547     1
483066  33514   4007     1
478351  28118   2641     1
505144  69985  61687     1


In [8]:
# Directed graph with one edge per row of df_train (src -> dest).
train_graph = nx.from_pandas_edgelist(
    df_train,
    source="src",
    target="dest",
    create_using=nx.DiGraph(),
)

In [9]:
# Three structural scores per node, computed on the training graph.
degrees = dict(train_graph.degree())
pr = dict(nx.pagerank(train_graph, alpha=0.85))
katz = dict(nx.katz_centrality(train_graph, alpha=0.005, beta=1))

# Feature vector per node, in the order [degree, pagerank, katz centrality].
features = {node: [degrees[node], pr[node], katz[node]] for node in degrees}

# Stored under the "features" node attribute so StellarGraph can pick it up.
nx.set_node_attributes(train_graph, features, "features")

In [10]:
# Wrap the networkx graph for StellarGraph, using each node's "features" attribute
# as its feature vector, then print a summary of the resulting graph.
G = StellarGraph.from_networkx(train_graph, node_features="features")
print(G.info())

StellarDiGraph: Directed multigraph
 Nodes: 75879, Edges: 452474

 Node types:
  default: [75879]
    Features: float32 vector, length 3
    Edge types: default-default->default

 Edge types:
    default-default->default: [452474]
        Weights: all 1 (default)
        Features: none


In [11]:
# Settings handed to UnsupervisedSampler in the next cell.
nodes = list(G.nodes())  # passed as `nodes`
num_walks = 1  # passed as `number_of_walks`
length = 5  # passed as `length`

In [12]:
# Sampler over G, configured with the node list and walk settings defined above.
unsupervised_samples = UnsupervisedSampler(
    G,
    nodes=nodes,
    length=length,
    number_of_walks=num_walks,
)

In [13]:
# Training hyper-parameters shared by the generators and model.fit below.
batch_size = 128
epochs = 50  # upper bound; training also uses an EarlyStopping callback
num_samples = [10, 5]  # passed to the GraphSAGE generators

In [14]:
# Link generator over G; train_gen wraps the sampler's output and is what model.fit consumes.
generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)

In [15]:
# Two GraphSAGE layers of width 64; the last width is the embedding size used later.
layer_sizes = [64, 64]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=generator,
    bias=True,
    normalize="l2",
)

In [16]:
# Input/output tensors of the GraphSAGE model; used below to build the link model
# and, after training, the embedding model.
x_inp, x_out = graphsage.in_out_tensors()

In [17]:
# Link-prediction head: combines the two node embeddings with the "ip" method and
# applies a sigmoid to produce a single output per pair.
link_head = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="ip",
)
prediction = link_head(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [19]:
# End-to-end link model, trained with binary cross-entropy and Adam (learning rate 1e-2).
adam = keras.optimizers.Adam(1e-2)
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=adam,
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [20]:
# Stop when the training loss has not improved for 2 epochs and restore the best weights.
es = EarlyStopping(monitor="loss", patience=2, restore_best_weights=True, verbose=1)

In [21]:
# Train the link model; the `es` callback ends training early when the loss stalls.
history = model.fit(
    train_gen,
    epochs=epochs,
    callbacks=[es],
    verbose=1,
    use_multiprocessing=True,
    shuffle=True,
)

# Persist the trained model. The context manager closes the file even if pickling fails.
# NOTE(review): despite the ".h5" suffix this file is a pickle, not HDF5 — it has to be
# read back with pickle.load, which must only be done on files from a trusted source.
with open(data_path+"de_graphsage_model.h5", "wb") as filep:
  pickle.dump(model, filep)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 9: early stopping
INFO:tensorflow:Assets written to: ram://8de8cbc6-0f5d-4350-b7a5-92ae760ec73d/assets


In [22]:
# Build a single-node encoder from the trained link model: keep every second input
# tensor (starting at index 0) and the first output tensor.
# NOTE(review): presumably these are the tensors of the "source" node of each pair,
# as the *_src names suggest — confirm against the StellarGraph documentation.
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [23]:
from stellargraph.mapper.sampled_node_generators import GraphSAGENodeGenerator

# Node ids in sorted order; later cells index the predicted embeddings by position
# in this list.
node_ids = sorted(G.nodes())
node_generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
node_gen = node_generator.flow(node_ids)

In [24]:
# Run the encoder over every node; entry i is later paired with node_ids[i].
node_emb = embedding_model.predict(node_gen, verbose=1)



In [25]:
# Map each node id to its embedding, stored as a (1, embedding_dim) array.
# The width is inferred with -1 instead of being hard-coded to 64, so this cell keeps
# working if the last entry of `layer_sizes` is changed.
assert len(node_ids) == len(node_emb), "expected one embedding per node id"
emb_dict = {
  node: np.array(emb.reshape(1, -1))
  for node, emb in zip(node_ids, node_emb)
}

In [26]:
# Labelled training pairs (src, dest, link). data_path already ends with "/", so the
# file name is appended directly (as for pos_train_91.csv) instead of producing "//".
combined = pd.read_csv(data_path+"train_91.csv", index_col = 0)
combined.head()

Unnamed: 0,src,dest,link
73132,35080,10701,0
404654,54809,75095,0
310475,67628,60404,0
301271,57253,462,0
332230,15167,47459,1


In [27]:
# get training embeddings
train_emb = []
for i, row in tqdm(combined.iterrows()):
  comb_emb = (emb_dict[row['src']]) * (emb_dict[row['dest']])
  train_emb.append(comb_emb[0])
train_emb = np.array(train_emb)

0it [00:00, ?it/s]

In [28]:
# Labels for the classifiers, printed next to the embedding matrix shape to check
# that there is one embedding row per label.
train_y = combined['link']
print(train_y.shape)
print(train_emb.shape)

(902474,)
(902474, 64)


In [30]:
# Labelled test pairs (src, dest, link). data_path already ends with "/", so the
# file name is appended directly instead of producing "//" in the path.
combined_test = pd.read_csv(data_path+"test_91.csv", index_col = 0)
print(combined_test.head())

          src   dest  link
487942  32198  62377     0
103900   6056  11650     1
78661    2040  22467     1
71074     143      0     1
382438   1609   5035     1


In [31]:
# get test embeddings
test_emb = []
for i, row in tqdm(combined_test.iterrows()):
  comb_emb = (emb_dict[row['src']]) * (emb_dict[row['dest']])
  test_emb.append(comb_emb[0])
test_emb = np.array(test_emb)
print(test_emb.shape)

test_y = combined_test['link'] 
print(test_y.shape)

0it [00:00, ?it/s]

(106363, 64)
(106363,)


In [32]:
#train_emb_sample, _, train_y_sample, _ = train_test_split(train_emb, train_y, stratify = train_y, test_size = 0.5, random_state = 42)

In [33]:
#reading
from pandas import read_hdf
df_final_train = read_hdf(data_path + 'fea_sample/storage_sample_stage4.h5', 'train_df',mode='r')
df_final_test = read_hdf(data_path + 'fea_sample/storage_sample_stage4.h5', 'test_df',mode='r')

In [34]:
# Labels belonging to the stored feature frames (taken before the column is dropped).
y_train = df_final_train['link']
y_test = df_final_test['link']

In [35]:
# Remove the node ids and the label so only feature columns remain.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the already-removed columns.
non_feature_cols = ['src', 'dest', 'link']
df_final_train = df_final_train.drop(columns=non_feature_cols, errors='ignore')
df_final_test = df_final_test.drop(columns=non_feature_cols, errors='ignore')

In [36]:
# GraphSAGE only
clf = XGBClassifier(max_depth=7, n_estimators=100, n_jobs=-1,
              random_state=25)
clf.fit(train_emb, train_y)
train_sc = f1_score(train_y,clf.predict(train_emb))
test_sc = f1_score(test_y,clf.predict(test_emb))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.8965881248425729 test Score 0.8886064204861756


In [37]:
# GraphSAGE only
clf = RandomForestClassifier(max_depth=12, n_estimators=100, n_jobs=-1,
              random_state=25)
clf.fit(train_emb, train_y)
train_sc = f1_score(train_y,clf.predict(train_emb))
test_sc = f1_score(test_y,clf.predict(test_emb))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.8887165450828917 test Score 0.8789566788362377


In [38]:
# GraphSAGE only
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, 
                     objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, 
                     subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=- 1, importance_type='split')
clf.fit(train_emb, train_y)
train_sc = f1_score(train_y,clf.predict(train_emb))
test_sc = f1_score(test_y,clf.predict(test_emb))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.888966610974333 test Score 0.8846182426156419


In [39]:
# Heuristic only
clf = XGBClassifier(max_depth=7, n_estimators=100, n_jobs=-1,
              random_state=25)
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.983029655226594 test Score 0.9458179808400884


In [40]:
# Heuristic only
clf = RandomForestClassifier(max_depth=12, n_estimators=100, n_jobs=-1,
              random_state=25)
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.9771341261464953 test Score 0.9465713006860139


In [41]:
# Heuristic only
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, 
                     objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, 
                     subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=- 1, importance_type='split')
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.9832426574663836 test Score 0.9469947114279144


In [42]:
# Append the GraphSAGE edge embeddings as extra columns next to the stored features.
# np.hstack returns an ndarray and the result is bound to the same name, so the
# DataFrame check makes this cell idempotent: re-running it no longer appends the
# embeddings a second time.
# NOTE(review): stacking is positional — it assumes both sources list the node pairs
# in the same row order; the next cell compares the train labels as a partial check.
if isinstance(df_final_train, pd.DataFrame):
  df_final_train = np.hstack((df_final_train, train_emb))
print(df_final_train.shape)
if isinstance(df_final_test, pd.DataFrame):
  df_final_test = np.hstack((df_final_test, test_emb))
print(df_final_test.shape)

(902474, 120)
(106363, 120)


In [43]:
# Sanity check: the labels loaded with the embeddings (train_y) and the labels loaded
# with the stored features (y_train) must agree row by row, since the two feature
# sets are stacked positionally. Prints the position of every disagreement.
print(type(train_y), type(y_train))
emb_labels = train_y.to_numpy()
heur_labels = y_train.to_numpy()
if len(emb_labels) != len(heur_labels):
  raise ValueError(
    f"label vectors differ in length: {len(emb_labels)} vs {len(heur_labels)}"
  )
# Vectorised comparison instead of a Python loop over every row.
for i in np.flatnonzero(emb_labels != heur_labels):
  print(i)

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [44]:
# Heuristic + GraphSAGE
clf = XGBClassifier(max_depth=7, n_estimators=100, n_jobs=-1,
              random_state=25)
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.983513426519436 test Score 0.9459270309593532


In [45]:
# Heuristic + GraphSAGE
clf = RandomForestClassifier(max_depth=12, n_estimators=100, n_jobs=-1,
              random_state=25)
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.9781439395193189 test Score 0.9429055756631693


In [46]:
# Heuristic + GraphSAGE
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, 
                     objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, 
                     subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=- 1, importance_type='split')
clf.fit(df_final_train,y_train)
train_sc = f1_score(y_train,clf.predict(df_final_train))
test_sc = f1_score(y_test,clf.predict(df_final_test))
print('Train Score',train_sc,'test Score',test_sc)

Train Score 0.9835220601405003 test Score 0.947387781752637
