# New part

## Download data

In [None]:
from google.colab import drive
drive.mount('/content/GDrive')

Mounted at /content/GDrive


In [None]:
%%bash
# Copy both yearly WikiLinkGraph snapshots from Drive and decompress them in /content.
# -f makes the cell re-runnable: without it gzip declines to overwrite a .csv that
# a previous run already produced.
cp /content/GDrive/MyDrive/WikiLinksGraph/enwiki.wikilink_graph.2002-03-01.csv.gz /content/
gzip -df /content/enwiki.wikilink_graph.2002-03-01.csv.gz
cp /content/GDrive/MyDrive/WikiLinksGraph/enwiki.wikilink_graph.2003-03-01.csv.gz /content/
gzip -df /content/enwiki.wikilink_graph.2003-03-01.csv.gz

## Code for model

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def create_negative(g, possitives, from_pos = 10000):
  """Sample negative (unlinked) node pairs anchored at the sources of positive edges.

  `from_pos` edges are drawn without replacement from `possitives`. For each
  drawn edge its first endpoint is paired with a uniformly chosen node that is
  neither that endpoint itself nor one of its neighbours in `g`.

  Args:
    g: graph exposing `nodes()` and `neighbors(n)` (e.g. a networkx graph).
    possitives: 1-D object array of (u, v) tuples to draw source nodes from.
    from_pos: number of positive edges to draw; np.random.choice raises
      ValueError if it exceeds len(possitives).

  Returns:
    1-D object array of unique (source, target) tuples. It can be shorter than
    `from_pos`: duplicate pairs are collapsed and a source that is already
    linked to every other node is skipped.
  """
  print(f'total negative: {from_pos}')
  pos2neg = np.random.choice(possitives, from_pos, replace = False)
  node_list = list(g.nodes())
  negatives = []
  for e1, _ in pos2neg:
    # Nodes that must not be picked: existing neighbours and e1 itself
    # (pairing a node with itself is not a meaningful negative example).
    blocked = set(g.neighbors(e1))
    blocked.add(e1)
    if len(blocked) >= len(node_list):
      continue  # e1 is linked to every other node: nothing to sample
    # Rejection sampling is uniform over the allowed nodes and avoids
    # materialising the whole candidate list for every single sample.
    while True:
      e2 = node_list[np.random.randint(len(node_list))]
      if e2 not in blocked:
        break
    negatives.append((e1, e2))
  # Drop duplicate pairs while keeping first-seen order.
  negatives = list(dict.fromkeys(negatives))
  ret = np.empty(len(negatives), dtype = object)
  ret[:] = negatives
  return ret

In [None]:
def create_train_dataset(g, neg_rate = 1.0):
  """Build the positive and negative edge sets used for training.

  Args:
    g: graph whose existing edges are the positive examples.
    neg_rate: number of negatives to request, as a fraction of the number of
      edges. Negatives are anchored on positives drawn without replacement
      (see create_negative), so values above 1.0 cannot be satisfied.

  Returns:
    (possitives, negatives): two 1-D object arrays of (u, v) tuples.
  """
  edges = list(g.edges())
  # Object array of tuples (rather than an (n, 2) array) so each edge stays a
  # single hashable element that can be sampled and used as a merge key.
  possitives = np.empty(len(edges), dtype = object)
  possitives[:] = edges

  from_pos = int(len(edges)*neg_rate)

  negatives = create_negative(g, possitives, from_pos)
  return possitives, negatives

In [None]:
def create_test_dataset(g_curr, g_aft):
  """Collect edges that are new in `g_aft` and join nodes already in `g_curr`.

  Args:
    g_curr: graph at the current time step.
    g_aft: graph at the later time step.

  Returns:
    1-D object array of (u, v) tuples, in the order `g_aft.edges()` yields them.
  """
  nodes_curr = set(g_curr.nodes())
  # has_edge() is used instead of a set difference on (u, v) tuples: an
  # undirected graph may report the same edge as (u, v) in one snapshot and
  # (v, u) in the other, which would make an existing edge look new.
  test = [
    (u, v)
    for u, v in g_aft.edges()
    if u in nodes_curr and v in nodes_curr and not g_curr.has_edge(u, v)
  ]
  # Guard against repeated edges (e.g. from a multigraph), keeping order.
  test = list(dict.fromkeys(test))
  ret = np.empty(len(test), dtype = object)
  ret[:] = test
  return ret

In [None]:
# Load the two yearly snapshots (tab-separated edge lists) and build one graph
# per snapshot from the page-id columns; g1 is the earlier one, g2 the later one.
# NOTE(review): from_pandas_edgelist is called without create_using, which
# presumably yields an undirected graph, so link direction is dropped - confirm
# that this is intended.
df1 = pd.read_csv('enwiki.wikilink_graph.2002-03-01.csv', delimiter = '\t')
df2 = pd.read_csv('enwiki.wikilink_graph.2003-03-01.csv', delimiter = '\t')
g1 = nx.from_pandas_edgelist(df1, 'page_id_from', 'page_id_to')
g2 = nx.from_pandas_edgelist(df2, 'page_id_from', 'page_id_to')

In [None]:
pos_train, neg_train = create_train_dataset(g1)

In [None]:
# Test positives: edges present in g2 but not in g1, between nodes g1 already has.
# Test negatives: node pairs that are not linked in g1, one per test positive.
# NOTE(review): negatives are only checked against g1, so a sampled pair could
# be linked in g2 (i.e. be a true positive at test time) - consider filtering
# the negatives against g2 as well.
pos_test = create_test_dataset(g1, g2)
neg_test = create_negative(g1, pos_test, len(pos_test))

In [None]:
def _labelled_edge_frame(pos, neg):
  """Return a shuffled frame of edges labelled 1 (from `pos`) or 0 (from `neg`)."""
  edges = np.concatenate((pos, neg))
  labels = np.concatenate((np.ones(len(pos), dtype = int),
                           np.zeros(len(neg), dtype = int)))
  # One permutation applied to both arrays keeps every edge with its label.
  order = np.random.permutation(len(labels))
  df = pd.DataFrame()
  df['index'] = edges[order]
  df['connected'] = labels[order]
  return df


def create_feature_connection(pos_train, pos_test, neg_train, neg_test):
  """Build shuffled label tables for the train and the test edges.

  Args:
    pos_train, neg_train: 1-D object arrays of (u, v) tuples for training.
    pos_test, neg_test: 1-D object arrays of (u, v) tuples for testing.

  Returns:
    (df_train, df_test): DataFrames with an 'index' column holding the edge
    tuple and a 'connected' column holding 1 for positives and 0 for
    negatives. Both frames have a plain positional row index (the test frame
    used to be created with the edge tuples as its row index as well, unlike
    the train frame; the edge is available from the 'index' column).
  """
  # Train is shuffled before test, so a seeded run draws its permutations in
  # the same order as before.
  df_train = _labelled_edge_frame(pos_train, neg_train)
  df_test = _labelled_edge_frame(pos_test, neg_test)
  return df_train, df_test

In [None]:
df_train_lbl, df_test_lbl = create_feature_connection(pos_train, pos_test, neg_train, neg_test)

In [None]:
def build_feature(G, edges):
  """Compute three neighbourhood-based link scores for every edge.

  Args:
    G: networkx graph the scores are computed on.
    edges: iterable of (u, v) node pairs.

  Returns:
    DataFrame with one row per edge and the columns 'index' (the edge itself),
    'preferential_attachment', 'common_neighbors' and 'jaccard_coef'.
  """
  keys, shared_counts, jaccards, pref_scores = [], [], [], []
  for pair in edges:
    src, dst = pair[0], pair[1]
    n_shared = len(list(nx.common_neighbors(G, src, dst)))
    # The networkx predictors yield (u, v, score) triples; with a single-edge
    # bunch the only triple is taken and its score (position 2) kept.
    jaccard = list(nx.jaccard_coefficient(G, [pair]))[0][2]
    pref = list(nx.preferential_attachment(G, [pair]))[0][2]
    keys.append(pair)
    shared_counts.append(n_shared)
    jaccards.append(jaccard)
    pref_scores.append(pref)

  df = pd.DataFrame()
  for column, values in (('index', keys),
                         ('preferential_attachment', pref_scores),
                         ('common_neighbors', shared_counts),
                         ('jaccard_coef', jaccards)):
    df[column] = values
  return df

In [None]:
edges_train = np.concatenate((pos_train, neg_train))
edges_test = np.concatenate((pos_test, neg_test))

In [None]:
features_train = build_feature(g1, edges_train)
features_test = build_feature(g1, edges_test)

In [None]:
features_train.fillna(0, inplace = True)
features_test.fillna(0, inplace = True)

In [None]:
train_df_data = df_train_lbl.merge(features_train, on='index', how='left')

In [None]:
test_df_data = df_test_lbl.merge(features_test, on='index', how = 'left')

In [None]:
x_train = train_df_data[['preferential_attachment', 'common_neighbors', 'jaccard_coef']].values
y_train = train_df_data['connected'].values

In [None]:
model = LogisticRegression()

In [None]:
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
x_test = test_df_data[['preferential_attachment', 'common_neighbors', 'jaccard_coef']].values
y_test = test_df_data['connected'].values

In [None]:
y_pred = model.predict(x_test)

In [None]:
accuracy_score(y_pred, y_test)

0.7201375543643168

## Node embedding

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parameter import Parameter
import numpy as np
import time

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Positive training edges for the embedding model: every edge of g1, stored as
# a 1-D object array of (u, v) tuples. The test positives are recomputed from
# the two snapshots.
edges = list(g1.edges())
pos_train = np.empty(len(edges), dtype = object)
pos_train[:] = edges
pos_test = create_test_dataset(g1, g2)

In [None]:
class NodeEmbedding(nn.Module):
  """Node embeddings trained by pulling the two endpoints of each edge together.

  Args:
    num_nodes: number of rows in the node embedding table.
    size: embedding dimension.
    num_rels: number of rows in the relation embedding table.

  NOTE(review): `relation_embedding` is created and normalised but never used
  by `loss`, and `loss` contains no negative examples - confirm whether a
  relation / negative-sampling term was meant to be part of the objective.
  """

  def __init__(self, num_nodes, size, num_rels = 2):
    super(NodeEmbedding, self).__init__()
    self.node_embedding = nn.Embedding(num_nodes, size)
    self.relation_embedding = nn.Embedding(num_rels, size)
    self.init(num_nodes)

  def init(self, num_nodes):
    """Initialise node embeddings uniformly in [-r, r] with r = 6/sqrt(num_nodes)."""
    # NOTE(review): the range is scaled by the number of nodes rather than by
    # the embedding size - confirm this is intended.
    init_range = 6/np.sqrt(num_nodes)
    self.node_embedding.weight.data.uniform_(-init_range, init_range)

  @staticmethod
  def _renormalize(embedding):
    """Rescale every row of `embedding` to unit L2 norm, in place."""
    with torch.no_grad():
      weight = embedding.weight
      # clamp_min keeps an all-zero row from producing NaNs.
      weight.div_(weight.norm(p = 2, dim = 1, keepdim = True).clamp_min(1e-12))

  def forward(self, h, t):
    """Renormalise the embedding tables, then return the loss for edges (h, t)."""
    # Every row is normalised. The previous `[:-1]` slice skipped the last row,
    # which left one embedding on a different scale from all the others.
    self._renormalize(self.node_embedding)
    self._renormalize(self.relation_embedding)
    return self.loss(h, t)

  def loss(self, h, t):
    """Mean Euclidean distance between the embeddings of indices `h` and `t`."""
    h_v = self.node_embedding(h)
    t_v = self.node_embedding(t)
    l = torch.norm(h_v - t_v, p = 2, dim = 1)
    return torch.mean(l)

  def save_embedding(self):
    """Return the node embedding table as a NumPy array on the CPU."""
    return self.node_embedding.weight.data.detach().cpu().numpy()

In [None]:
def train_one_step(epo, model, optimizer, h, t):
  """Run one full-batch optimisation step and print its loss and duration.

  Args:
    epo: epoch number, used only in the printed message.
    model: module whose call `model(h, t)` returns a scalar loss.
    optimizer: optimiser bound to the model's parameters.
    h, t: inputs forwarded to the model unchanged.
  """
  tic = time.time()
  optimizer.zero_grad()
  batch_loss = model(h, t)
  batch_loss.backward()
  optimizer.step()
  elapsed = time.time() - tic
  print('Epoch: %d: -> loss: %f, time: %fs'%(epo, batch_loss, elapsed))

In [None]:
def map_to_id(filename):
  """Assign consecutive integer ids to the page ids found in an edge-list file.

  The file is tab-separated with one header line; the page ids are read from
  columns 0 and 2 of every following line. Ids are handed out from 0 in order
  of first appearance (column 0 before column 2 within a line).

  Args:
    filename: path of the tab-separated edge list.

  Returns:
    dict mapping page id (int) to consecutive index (int).
  """
  node_mapping = dict()
  # The context manager closes the file even if a line fails to parse, and
  # iterating the handle avoids loading the whole file into memory.
  with open(filename, encoding = 'utf-8') as f:
    next(f, None)  # skip the header line
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines, e.g. at the end of the file
      tmp = line.strip().split('\t')
      for raw_id in (tmp[0], tmp[2]):
        # len(node_mapping) is evaluated before the insert, so it is exactly
        # the next unused index.
        node_mapping.setdefault(int(raw_id), len(node_mapping))
  return node_mapping

In [None]:
def rebuild_relation(relation, node_mapping):
  """Translate (head, tail) pairs into two parallel lists of mapped ids.

  Args:
    relation: iterable of 2-element pairs whose items are keys of `node_mapping`.
    node_mapping: dict from original node id to consecutive index.

  Returns:
    (h, t): the mapped heads and the mapped tails, in input order.

  Raises:
    KeyError: if an endpoint is missing from `node_mapping`.
  """
  mapped = [(node_mapping[src], node_mapping[dst]) for src, dst in relation]
  heads = [pair[0] for pair in mapped]
  tails = [pair[1] for pair in mapped]
  return heads, tails

In [None]:
node_mapping = map_to_id('enwiki.wikilink_graph.2002-03-01.csv')

In [None]:
relation = pos_train

In [None]:
h, t = rebuild_relation(relation, node_mapping)

In [None]:
h = torch.LongTensor(h)
t = torch.LongTensor(t)

In [None]:
num_nodes = len(node_mapping)

In [None]:
model = NodeEmbedding(num_nodes = num_nodes, size = 50)

In [None]:
optimizer = optim.Adam(model.parameters(), weight_decay=5e-3)

In [None]:
model = model.to(device)
h = h.to(device)
t = t.to(device)

In [None]:
for epo in range(100):
  train_one_step(epo, model, optimizer, h, t)

Epoch: 0: -> loss: 1.407942, time: 0.192191s
Epoch: 1: -> loss: 1.407938, time: 0.006254s
Epoch: 2: -> loss: 1.407935, time: 0.006479s
Epoch: 3: -> loss: 1.407931, time: 0.006423s
Epoch: 4: -> loss: 1.407927, time: 0.006493s
Epoch: 5: -> loss: 1.407924, time: 0.006500s
Epoch: 6: -> loss: 1.407920, time: 0.006441s
Epoch: 7: -> loss: 1.407917, time: 0.006374s
Epoch: 8: -> loss: 1.407913, time: 0.006338s
Epoch: 9: -> loss: 1.407909, time: 0.006494s
Epoch: 10: -> loss: 1.407905, time: 0.006402s
Epoch: 11: -> loss: 1.407902, time: 0.006251s
Epoch: 12: -> loss: 1.407898, time: 0.006347s
Epoch: 13: -> loss: 1.407894, time: 0.006335s
Epoch: 14: -> loss: 1.407891, time: 0.006355s
Epoch: 15: -> loss: 1.407887, time: 0.006431s
Epoch: 16: -> loss: 1.407884, time: 0.006771s
Epoch: 17: -> loss: 1.407880, time: 0.006416s
Epoch: 18: -> loss: 1.407877, time: 0.006409s
Epoch: 19: -> loss: 1.407873, time: 0.006392s
Epoch: 20: -> loss: 1.407870, time: 0.006383s
Epoch: 21: -> loss: 1.407866, time: 0.006378

In [None]:
emb = model.save_embedding()

In [None]:
# Persist the learned embeddings as text: one vector per line, values separated
# by single spaces. The context manager closes the file, which guarantees the
# last buffered lines actually reach disk.
with open('embedding.txt', 'w') as f:
  for vct in emb:
    vct_s = list(map(str, vct))
    f.write(' '.join(vct_s) + '\n')

In [None]:
# Set up the classifier that consumes the learned embeddings, its optimiser and loss.
# NOTE(review): `LinkPrediction` is not defined anywhere in the visible part of
# this notebook, so on a fresh kernel this cell would raise NameError - confirm
# where the class is supposed to come from.
model_prediction = LinkPrediction(emb, 50)
optimizer_prediction = optim.Adam(model_prediction.parameters(), lr = 0.01, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

In [None]:
edges_train = df_train_lbl.values
lbl_train = df_train_lbl['connected'].values

In [None]:
# NOTE(review): rebuild_relation, as defined earlier in this notebook, returns
# two lists (h, t) and iterates plain (u, v) pairs, whereas this cell unpacks
# three values from rows of df_train_lbl.values - this looks like it targets an
# older three-output version of the helper; confirm before relying on it.
h_train, t_train, r_train = rebuild_relation(edges_train, node_mapping)

In [None]:
h_train = torch.LongTensor(h_train)
t_train = torch.LongTensor(t_train)
lbl_train = torch.LongTensor(r_train)

In [None]:
model_prediction = model_prediction.to(device)
h_train = h_train.to(device)
t_train = t_train.to(device)
r_train = lbl_train.to(device)

In [None]:
for epo in range(100):
  train_one_step_prediction(epo, model_prediction, optimizer_prediction, criterion, h_train, t_train, r_train)

In [None]:
edges_test = df_test_lbl.values

In [None]:
h_test, t_test, r_test = rebuild_relation(edges_test, node_mapping)

In [None]:
h_test = torch.LongTensor(h_test)
t_test = torch.LongTensor(t_test)
lbl_test = torch.LongTensor(lbl_test)

In [None]:
# NOTE(review): the results of .to(device) are not assigned here (unlike the
# training cell, which rebinds h_train/t_train), so h_test, t_test and lbl_test
# presumably stay on their original device - confirm whether that is intended.
h_test.to(device)
t_test.to(device)
lbl_test.to(device)

In [None]:
model_prediction.eval()
y_pred = model_prediction(h_test, t_test, r_test)

In [None]:
y_pred

In [None]:
y_pred = y_pred.detach().numpy()

In [None]:
lbl = np.argmax(y_pred, axis = 1)

In [None]:
sum(lbl == lbl_test.numpy())/len(lbl)

## Try Using GraphSAGE

In [None]:
!pip install stellargraph

Installing collected packages: stellargraph
Successfully installed stellargraph-1.2.1


In [None]:
import networkx as nx
import pandas as pd
import os

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification

import tensorflow.keras as keras # DO NOT USE KERAS DIRECTLY
from sklearn import preprocessing, feature_extraction, model_selection

from stellargraph import globalvar

In [None]:
# Rebuild the training graph on the consecutive ids from node_mapping, so that a
# node's id can be used directly as a row index into the embedding matrix.
# Note: this rebinds `h` and `t` (previously the tensors fed to the embedding
# model) to plain Python lists.
g_nx = nx.Graph()
h, t = rebuild_relation(pos_train, node_mapping)
g_nx.add_edges_from(zip(h, t))

In [None]:
for n in list(g_nx.nodes()):
  g_nx.nodes[n]['feature'] = emb[n]

In [None]:
edge_splitter_test = EdgeSplitter(g_nx)

In [None]:
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True
    )

** Sampled 19586 positive and 19586 negative edges. **


In [None]:
edge_splitter_train = EdgeSplitter(G_test)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.1, method="global", keep_connected=True
)

** Sampled 17627 positive and 17627 negative edges. **


In [None]:
G_train = sg.StellarGraph(G_train, node_features="feature")
G_test = sg.StellarGraph(G_test, node_features="feature")

  """Entry point for launching an IPython kernel.
  


In [None]:
print(G_train.info())

StellarGraph: Undirected multigraph
 Nodes: 27654, Edges: 158650

 Node types:
  default: [27654]
    Features: float32 vector, length 50
    Edge types: default-default->default

 Edge types:
    default-default->default: [158650]
        Weights: all 1 (default)
        Features: none


In [None]:

print(G_test.info())

StellarGraph: Undirected multigraph
 Nodes: 27654, Edges: 176277

 Node types:
  default: [27654]
    Features: float32 vector, length 50
    Edge types: default-default->default

 Edge types:
    default-default->default: [176277]
        Weights: all 1 (default)
        Features: none


In [None]:
batch_size = 20
epochs = 20

In [None]:
num_samples = [20, 10]


train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
test_gen = GraphSAGELinkGenerator(G_test,  batch_size, num_samples)

In [None]:
layer_sizes = [30, 30]
assert len(layer_sizes) == len(num_samples)

In [None]:
graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_gen,
                      bias=True, dropout=0.3)

In [None]:
# Turn the link generators into batch sequences over the sampled edges and labels.
# NOTE(review): this rebinds train_gen/test_gen from the generator objects to
# the objects returned by .flow(), so the cell is presumably not safe to run
# twice, and the GraphSAGE(...) cell that takes `generator=train_gen` has to run
# before this one - confirm, and consider separate names for the flows.
train_gen = train_gen.flow(edge_ids_train,edge_labels_train)
test_gen = test_gen.flow(edge_ids_test, edge_labels_test)

In [None]:
x_inp, x_out = graphsage.build()

  """Entry point for launching an IPython kernel.


In [None]:
prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method='ip'
    )(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [None]:
# Wrap the GraphSAGE encoder and the link-classification head in one Keras model.
model = keras.Model(inputs=x_inp, outputs=prediction)

# `learning_rate` is the supported keyword for the optimiser; `lr` is only a
# deprecated alias for it.
model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

In [None]:
# Baseline: score the untrained model so the effect of training is visible.
# Model.evaluate accepts the flow objects directly; evaluate_generator is its
# deprecated predecessor. verbose=0 keeps the cell output limited to the
# summary printed below.
init_train_metrics = model.evaluate(train_gen, verbose=0)
init_test_metrics = model.evaluate(test_gen, verbose=0)

print("\nTrain Set Metrics of the initial (untrained) model:")
for name, val in zip(model.metrics_names, init_train_metrics):
    print("\t{}: {:0.4f}".format(name, val))

print("\nTest Set Metrics of the initial (untrained) model:")
for name, val in zip(model.metrics_names, init_test_metrics):
    print("\t{}: {:0.4f}".format(name, val))




Train Set Metrics of the initial (untrained) model:
	loss: 0.8778
	acc: 0.5577

Test Set Metrics of the initial (untrained) model:
	loss: 0.8872
	acc: 0.5573


In [None]:
# Train on the training flow and validate on the test flow after every epoch.
# Model.fit accepts the flow objects directly; fit_generator is its deprecated
# predecessor.
history = model.fit(
            train_gen,
            epochs=epochs,
            validation_data=test_gen,
            verbose=2
        )

Epoch 1/20




1763/1763 - 111s - loss: 0.6580 - acc: 0.6412 - val_loss: 0.5600 - val_acc: 0.7398
Epoch 2/20
1763/1763 - 107s - loss: 0.6060 - acc: 0.7052 - val_loss: 0.5486 - val_acc: 0.7382
Epoch 3/20
1763/1763 - 107s - loss: 0.5855 - acc: 0.7272 - val_loss: 0.5423 - val_acc: 0.7535
Epoch 4/20
1763/1763 - 107s - loss: 0.5672 - acc: 0.7356 - val_loss: 0.5725 - val_acc: 0.7018
Epoch 5/20
1763/1763 - 107s - loss: 0.5691 - acc: 0.7351 - val_loss: 0.5632 - val_acc: 0.7049
Epoch 6/20
1763/1763 - 107s - loss: 0.5570 - acc: 0.7271 - val_loss: 0.5474 - val_acc: 0.7165
Epoch 7/20
1763/1763 - 107s - loss: 0.5569 - acc: 0.7350 - val_loss: 0.5372 - val_acc: 0.7224
Epoch 8/20
1763/1763 - 108s - loss: 0.5531 - acc: 0.7360 - val_loss: 0.5467 - val_acc: 0.7177
Epoch 9/20
1763/1763 - 106s - loss: 0.5492 - acc: 0.7426 - val_loss: 0.5415 - val_acc: 0.7275
Epoch 10/20
1763/1763 - 107s - loss: 0.5561 - acc: 0.7434 - val_loss: 0.5874 - val_acc: 0.6888
Epoch 11/20
1763/1763 - 107s - loss: 0.5432 - acc: 0.7441 - val_loss: 

# Final Code

## Data and library

In [None]:
from google.colab import drive
drive.mount('/content/GDrive')

Mounted at /content/GDrive


In [None]:
%%bash
# Copy both yearly WikiLinkGraph snapshots from Drive and decompress them in /content.
# -f makes the cell re-runnable: without it gzip declines to overwrite a .csv that
# a previous run already produced.
cp /content/GDrive/MyDrive/WikiLinksGraph/enwiki.wikilink_graph.2002-03-01.csv.gz /content/
gzip -df /content/enwiki.wikilink_graph.2002-03-01.csv.gz
cp /content/GDrive/MyDrive/WikiLinksGraph/enwiki.wikilink_graph.2003-03-01.csv.gz /content/
gzip -df /content/enwiki.wikilink_graph.2003-03-01.csv.gz

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from time import time

## Helper functions

### Create Negative Sampling

In [None]:
def create_negative(g, possitives, from_pos = 10000):
  """Sample negative (unlinked) node pairs anchored at the sources of positive edges.

  `from_pos` edges are drawn without replacement from `possitives`. For each
  drawn edge its first endpoint is paired with a uniformly chosen node that is
  neither that endpoint itself nor one of its neighbours in `g`.

  Args:
    g: graph exposing `nodes()` and `neighbors(n)` (e.g. a networkx graph).
    possitives: 1-D object array of (u, v) tuples to draw source nodes from.
    from_pos: number of positive edges to draw; np.random.choice raises
      ValueError if it exceeds len(possitives).

  Returns:
    1-D object array of unique (source, target) tuples. It can be shorter than
    `from_pos`: duplicate pairs are collapsed and a source that is already
    linked to every other node is skipped.
  """
  print(f'total negative: {from_pos}')
  pos2neg = np.random.choice(possitives, from_pos, replace = False)
  node_list = list(g.nodes())
  negatives = []
  for e1, _ in pos2neg:
    # Nodes that must not be picked: existing neighbours and e1 itself
    # (pairing a node with itself is not a meaningful negative example).
    blocked = set(g.neighbors(e1))
    blocked.add(e1)
    if len(blocked) >= len(node_list):
      continue  # e1 is linked to every other node: nothing to sample
    # Rejection sampling is uniform over the allowed nodes and avoids
    # materialising the whole candidate list for every single sample.
    while True:
      e2 = node_list[np.random.randint(len(node_list))]
      if e2 not in blocked:
        break
    negatives.append((e1, e2))
  # Drop duplicate pairs while keeping first-seen order.
  negatives = list(dict.fromkeys(negatives))
  ret = np.empty(len(negatives), dtype = object)
  ret[:] = negatives
  return ret

### Create Train set

In [None]:
def create_train_dataset(g, neg_rate = 1.0):
  """Build the positive and negative edge sets used for training.

  Args:
    g: graph whose existing edges are the positive examples.
    neg_rate: number of negatives to request, as a fraction of the number of
      edges. Negatives are anchored on positives drawn without replacement
      (see create_negative), so values above 1.0 cannot be satisfied.

  Returns:
    (possitives, negatives): two 1-D object arrays of (u, v) tuples.
  """
  edges = list(g.edges())
  # Object array of tuples (rather than an (n, 2) array) so each edge stays a
  # single hashable element that can be sampled and used as a merge key.
  possitives = np.empty(len(edges), dtype = object)
  possitives[:] = edges

  from_pos = int(len(edges)*neg_rate)

  negatives = create_negative(g, possitives, from_pos)
  return possitives, negatives

### Create test set

In [None]:
def create_test_dataset(g_curr, g_aft):
  """Collect edges that are new in `g_aft` and join nodes already in `g_curr`.

  Args:
    g_curr: graph at the current time step.
    g_aft: graph at the later time step.

  Returns:
    1-D object array of (u, v) tuples, in the order `g_aft.edges()` yields them.
  """
  nodes_curr = set(g_curr.nodes())
  # has_edge() is used instead of a set difference on (u, v) tuples: an
  # undirected graph may report the same edge as (u, v) in one snapshot and
  # (v, u) in the other, which would make an existing edge look new.
  test = [
    (u, v)
    for u, v in g_aft.edges()
    if u in nodes_curr and v in nodes_curr and not g_curr.has_edge(u, v)
  ]
  # Guard against repeated edges (e.g. from a multigraph), keeping order.
  test = list(dict.fromkeys(test))
  ret = np.empty(len(test), dtype = object)
  ret[:] = test
  return ret

### Create train and test label for edges

In [None]:
def _labelled_edge_frame(pos, neg):
  """Return a shuffled frame of edges labelled 1 (from `pos`) or 0 (from `neg`)."""
  edges = np.concatenate((pos, neg))
  labels = np.concatenate((np.ones(len(pos), dtype = int),
                           np.zeros(len(neg), dtype = int)))
  # One permutation applied to both arrays keeps every edge with its label.
  order = np.random.permutation(len(labels))
  df = pd.DataFrame()
  df['index'] = edges[order]
  df['connected'] = labels[order]
  return df


def create_feature_connection(pos_train, pos_test, neg_train, neg_test):
  """Build shuffled label tables for the train and the test edges.

  Args:
    pos_train, neg_train: 1-D object arrays of (u, v) tuples for training.
    pos_test, neg_test: 1-D object arrays of (u, v) tuples for testing.

  Returns:
    (df_train, df_test): DataFrames with an 'index' column holding the edge
    tuple and a 'connected' column holding 1 for positives and 0 for
    negatives. Both frames have a plain positional row index (the test frame
    used to be created with the edge tuples as its row index as well, unlike
    the train frame; the edge is available from the 'index' column).
  """
  # Train is shuffled before test, so a seeded run draws its permutations in
  # the same order as before.
  df_train = _labelled_edge_frame(pos_train, neg_train)
  df_test = _labelled_edge_frame(pos_test, neg_test)
  return df_train, df_test

### Build features vector (function)

In [None]:
def build_feature(G, edges):
  """Compute three neighbourhood-based link scores for every edge.

  Args:
    G: networkx graph the scores are computed on.
    edges: iterable of (u, v) node pairs.

  Returns:
    DataFrame with one row per edge and the columns 'index' (the edge itself),
    'preferential_attachment', 'common_neighbors' and 'jaccard_coef'.
  """
  keys, shared_counts, jaccards, pref_scores = [], [], [], []
  for pair in edges:
    src, dst = pair[0], pair[1]
    n_shared = len(list(nx.common_neighbors(G, src, dst)))
    # The networkx predictors yield (u, v, score) triples; with a single-edge
    # bunch the only triple is taken and its score (position 2) kept.
    jaccard = list(nx.jaccard_coefficient(G, [pair]))[0][2]
    pref = list(nx.preferential_attachment(G, [pair]))[0][2]
    keys.append(pair)
    shared_counts.append(n_shared)
    jaccards.append(jaccard)
    pref_scores.append(pref)

  df = pd.DataFrame()
  for column, values in (('index', keys),
                         ('preferential_attachment', pref_scores),
                         ('common_neighbors', shared_counts),
                         ('jaccard_coef', jaccards)):
    df[column] = values
  return df

## Prepare data

### Load graph at time $t$ and $t + 1$ as $g_1$ and $g_2$. <br>
$g_1$ for training, $g_2$ for testing

In [None]:
# Load the two yearly snapshots (tab-separated edge lists) and build one graph
# per snapshot from the page-id columns; g1 is the earlier one, g2 the later one.
# NOTE(review): from_pandas_edgelist is called without create_using, which
# presumably yields an undirected graph, so link direction is dropped - confirm
# that this is intended.
df1 = pd.read_csv('enwiki.wikilink_graph.2002-03-01.csv', delimiter = '\t')
df2 = pd.read_csv('enwiki.wikilink_graph.2003-03-01.csv', delimiter = '\t')
g1 = nx.from_pandas_edgelist(df1, 'page_id_from', 'page_id_to')
g2 = nx.from_pandas_edgelist(df2, 'page_id_from', 'page_id_to')

In [None]:
start = time()
pos_train, neg_train = create_train_dataset(g1)
end = time()
print(f'total time: {end - start} s')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Create Negative 190864/195863|
Create Negative 190865/195863|
Create Negative 190866/195863|
Create Negative 190867/195863|
Create Negative 190868/195863|
Create Negative 190869/195863|
Create Negative 190870/195863|
Create Negative 190871/195863|
Create Negative 190872/195863|
Create Negative 190873/195863|
Create Negative 190874/195863|
Create Negative 190875/195863|
Create Negative 190876/195863|
Create Negative 190877/195863|
Create Negative 190878/195863|
Create Negative 190879/195863|
Create Negative 190880/195863|
Create Negative 190881/195863|
Create Negative 190882/195863|
Create Negative 190883/195863|
Create Negative 190884/195863|
Create Negative 190885/195863|
Create Negative 190886/195863|
Create Negative 190887/195863|
Create Negative 190888/195863|
Create Negative 190889/195863|
Create Negative 190890/195863|
Create Negative 190891/195863|
Create Negative 190892/195863|
Create Negative 190893/195863|
Creat

In [None]:
# Build the test set and time it.
# Test positives: edges present in g2 but not in g1, between nodes g1 already has.
# Test negatives: node pairs that are not linked in g1, one per test positive.
# NOTE(review): negatives are only checked against g1, so a sampled pair could
# be linked in g2 (i.e. be a true positive at test time) - consider filtering
# the negatives against g2 as well.
start = time()
pos_test = create_test_dataset(g1, g2)
neg_test = create_negative(g1, pos_test, len(pos_test))
end = time()
print(f'total time: {end - start} s')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Create Negative 147832/152830|
Create Negative 147833/152830|
Create Negative 147834/152830|
Create Negative 147835/152830|
Create Negative 147836/152830|
Create Negative 147837/152830|
Create Negative 147838/152830|
Create Negative 147839/152830|
Create Negative 147840/152830|
Create Negative 147841/152830|
Create Negative 147842/152830|
Create Negative 147843/152830|
Create Negative 147844/152830|
Create Negative 147845/152830|
Create Negative 147846/152830|
Create Negative 147847/152830|
Create Negative 147848/152830|
Create Negative 147849/152830|
Create Negative 147850/152830|
Create Negative 147851/152830|
Create Negative 147852/152830|
Create Negative 147853/152830|
Create Negative 147854/152830|
Create Negative 147855/152830|
Create Negative 147856/152830|
Create Negative 147857/152830|
Create Negative 147858/152830|
Create Negative 147859/152830|
Create Negative 147860/152830|
Create Negative 147861/152830|
Creat

### df_train_lbl and df_test_lbl contain edges with their label

In [None]:
df_train_lbl, df_test_lbl = create_feature_connection(pos_train, pos_test, neg_train, neg_test)

### Create data for training and testing

In [None]:
edges_train = np.concatenate((pos_train, neg_train))
edges_test = np.concatenate((pos_test, neg_test))

In [None]:
features_train = build_feature(g1, edges_train)
features_test = build_feature(g1, edges_test)

In [None]:
features_train.fillna(0, inplace = True)
features_test.fillna(0, inplace = True)

In [None]:
train_df_data = df_train_lbl.merge(features_train, on='index', how='left')
test_df_data = df_test_lbl.merge(features_test, on='index', how = 'left')

In [None]:
x_train = train_df_data[['preferential_attachment', 'common_neighbors', 'jaccard_coef']].values
y_train = train_df_data['connected'].values

In [None]:
x_test = test_df_data[['preferential_attachment', 'common_neighbors', 'jaccard_coef']].values
y_test = test_df_data['connected'].values

## Model (train and test)

In [None]:
model = LogisticRegression()

In [None]:
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = model.predict(x_test)

In [None]:
print(f'accuracy score on test: {accuracy_score(y_pred, y_test)}')

accuracy score on test: 0.7185218531639623


## Using community

In [None]:
import networkx.algorithms.community as nxcom

In [None]:
 communities = list(nxcom.label_propagation_communities(g1))

In [None]:
# Store a 1-based community id on every node of g1 under the 'community'
# attribute; presumably this is what the community-aware link scores computed
# later in the notebook read - verify against their documentation.
for c, v_c in enumerate(communities):
  for v in v_c:
    g1.nodes[v]['community'] = c + 1

## Rebuild feature

In [None]:
def _link_score(predictor, G, edge):
  """Return `predictor`'s score for one edge, or 0 if the predictor raises."""
  try:
    # The networkx predictors yield (u, v, score) triples.
    return list(predictor(G, [edge]))[0][2]
  except Exception:
    # Best effort: an edge a predictor cannot score gets 0. `Exception` is
    # caught rather than a bare `except:` so Ctrl-C / kernel interrupt can
    # still stop this long-running loop.
    return 0


def rebuild_feature(G, edges):
  """Compute eight link-prediction scores for every edge.

  Args:
    G: networkx graph the scores are computed on. The three community-aware
      scores need community information on the nodes; without it they fall
      back to 0 like any other failing score.
    edges: iterable of (u, v) node pairs.

  Returns:
    DataFrame with one row per edge: 'index' (the edge itself) followed by
    'preferential_attachment', 'common_neighbors', 'jaccard_coef',
    'resource_allocation_index', 'adamic_adar_index',
    'cn_soundarajan_hopcroft', 'within_inter_cluster' and
    'ra_index_soundarajan_hopcroft'.

  A score that cannot be computed for an edge is recorded as 0. This now also
  applies to the Jaccard coefficient, which used to be the only unguarded one.
  """
  predictors = [
    ('preferential_attachment', nx.preferential_attachment),
    ('jaccard_coef', nx.jaccard_coefficient),
    ('resource_allocation_index', nx.resource_allocation_index),
    ('adamic_adar_index', nx.adamic_adar_index),
    ('cn_soundarajan_hopcroft', nx.cn_soundarajan_hopcroft),
    ('within_inter_cluster', nx.within_inter_cluster),
    ('ra_index_soundarajan_hopcroft', nx.ra_index_soundarajan_hopcroft),
  ]
  index = []
  cn_data = []
  scores = {name: [] for name, _ in predictors}

  for edge in edges:
    try:
      cn = len(list(nx.common_neighbors(G, edge[0], edge[1])))
    except Exception:
      cn = 0
    index.append(edge)
    cn_data.append(cn)
    for name, predictor in predictors:
      scores[name].append(_link_score(predictor, G, edge))

  # Columns are assigned one by one to keep the original column order.
  df = pd.DataFrame()
  df['index'] = index
  df['preferential_attachment'] = scores['preferential_attachment']
  df['common_neighbors'] = cn_data
  df['jaccard_coef'] = scores['jaccard_coef']

  df['resource_allocation_index'] = scores['resource_allocation_index']
  df['adamic_adar_index'] = scores['adamic_adar_index']
  df['cn_soundarajan_hopcroft'] = scores['cn_soundarajan_hopcroft']

  df['within_inter_cluster'] = scores['within_inter_cluster']
  df['ra_index_soundarajan_hopcroft'] = scores['ra_index_soundarajan_hopcroft']
  return df

In [None]:
start = time()
features_train = rebuild_feature(g1, edges_train)
end = time()
print(f'time to build train: {end - start}')

time to build train: 570.4147069454193


In [None]:
start = time()
features_test = rebuild_feature(g1, edges_test)
end = time()
print(f'time to build test: {end - start}')

time to build test: 404.86494970321655


In [None]:
features_train.fillna(0, inplace = True)
features_test.fillna(0, inplace = True)

In [None]:
train_df_data = df_train_lbl.merge(features_train, on='index', how='left')
test_df_data = df_test_lbl.merge(features_test, on='index', how = 'left')

In [None]:
col = ['preferential_attachment', 'common_neighbors', 'jaccard_coef',\
       'resource_allocation_index', 'adamic_adar_index', 'cn_soundarajan_hopcroft',\
       'within_inter_cluster', 'ra_index_soundarajan_hopcroft']

In [None]:
# Fit one logistic regression per single heuristic to compare them in isolation.
data_train_y = train_df_data['connected'].values
data_test_y = test_df_data['connected'].values
model_list = []
score_list = []
for heu in col:
  model = LogisticRegression()
  # Double brackets keep a 2-D (n_samples, 1) feature matrix.
  data_train_x = train_df_data[[heu]].values
  data_test_x = test_df_data[[heu]].values
  model.fit(data_train_x, data_train_y)
  pred = model.predict(data_test_x)
  pred1 = model.predict(data_train_x)
  # score1 is the accuracy on the training data, score the accuracy on the test data.
  score1 = accuracy_score(pred1, data_train_y)
  score = accuracy_score(pred, data_test_y)
  model_list.append(model)
  # NOTE(review): score_list collects the TRAIN accuracies while the line
  # printed below reports the TEST accuracy - confirm this is intentional.
  score_list.append(score1)
  print(f'heuristic: {heu} | score: {score}')

heuristic: preferential_attachment | score: 0.5861958727238327
heuristic: common_neighbors | score: 0.7295936009603778
heuristic: jaccard_coef | score: 0.6900432561507898
heuristic: resource_allocation_index | score: 0.6692013596654336
heuristic: adamic_adar_index | score: 0.7025797107122586
heuristic: cn_soundarajan_hopcroft | score: 0.7202344835685346
heuristic: within_inter_cluster | score: 0.6942025014190366
heuristic: ra_index_soundarajan_hopcroft | score: 0.6566159728066914


In [None]:
score_list

[0.6180998643161765,
 0.8634729437257092,
 0.8316626251114728,
 0.8151454832107239,
 0.8457471681431759,
 0.856742412821227,
 0.7930093445525883,
 0.7929914577961983]

### With 3 cols

In [None]:
import itertools
cbn = list(itertools.combinations(col, 3))

In [None]:

# Fit one logistic regression per feature subset in `cbn` and keep the subset
# with the highest accuracy.
# NOTE(review): the accuracy is measured on the training data itself, so this
# selects for fit rather than generalisation - consider a validation split.
train_y = train_df_data['connected'].values
best_score = 0.0
best_model = None
best_col = None
for col_train in cbn:
  train_x = train_df_data[list(col_train)].values
  # The default max_iter=100 is too low for these unscaled features (lbfgs
  # reports "TOTAL NO. of ITERATIONS REACHED LIMIT" on the larger subsets), so
  # the solver is given more iterations to converge.
  model = LogisticRegression(max_iter=1000)
  model.fit(train_x, train_y)
  pred = model.predict(train_x)
  score = accuracy_score(pred, train_y)
  print(f'train on col: {col_train} | score: {score}')
  if score > best_score:
    best_score = score
    best_model = model
    best_col = col_train

train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef') | score: 0.8602047778081569
train on col: ('preferential_attachment', 'common_neighbors', 'resource_allocation_index') | score: 0.8602047778081569
train on col: ('preferential_attachment', 'common_neighbors', 'adamic_adar_index') | score: 0.8602686590809785
train on col: ('preferential_attachment', 'common_neighbors', 'cn_soundarajan_hopcroft') | score: 0.8537655455077411
train on col: ('preferential_attachment', 'common_neighbors', 'within_inter_cluster') | score: 0.8603785348702316
train on col: ('preferential_attachment', 'common_neighbors', 'ra_index_soundarajan_hopcroft') | score: 0.8602047778081569
train on col: ('preferential_attachment', 'jaccard_coef', 'resource_allocation_index') | score: 0.5004791095461619
train on col: ('preferential_attachment', 'jaccard_coef', 'adamic_adar_index') | score: 0.8489412317842551
train on col: ('preferential_attachment', 'jaccard_coef', 'cn_soundarajan_hopcroft') | s

In [None]:
# Evaluate the selected model on the test edges.
# .values passes a plain ndarray, matching how best_model was fitted, and the
# labels are read from the same frame as the features so rows are guaranteed
# to line up (y_test was built from an earlier version of test_df_data).
test_x = test_df_data[list(best_col)].values
test_y = test_df_data['connected'].values
y_pred = best_model.predict(test_x)
print(accuracy_score(test_y, y_pred))

0.7295936009603778


### With 5 cols

In [None]:
import itertools
cbn = list(itertools.combinations(col, 5))

In [None]:
# Fit one logistic regression per feature subset in `cbn` and keep the subset
# with the highest accuracy.
# NOTE(review): the accuracy is measured on the training data itself, so this
# selects for fit rather than generalisation - consider a validation split.
train_y = train_df_data['connected'].values
best_score = 0.0
best_model = None
best_col = None
for col_train in cbn:
  train_x = train_df_data[list(col_train)].values
  # The default max_iter=100 is too low for these unscaled features (lbfgs
  # reports "TOTAL NO. of ITERATIONS REACHED LIMIT"), so the solver is given
  # more iterations to converge.
  model = LogisticRegression(max_iter=1000)
  model.fit(train_x, train_y)
  pred = model.predict(train_x)
  score = accuracy_score(pred, train_y)
  print(f'train on col: {col_train} | score: {score}')
  if score > best_score:
    best_score = score
    best_model = model
    best_col = col_train

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'adamic_adar_index') | score: 0.8490664390789854
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'cn_soundarajan_hopcroft') | score: 0.853768100758654
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'within_inter_cluster') | score: 0.8604143083830117
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'ra_index_soundarajan_hopcroft') | score: 0.8601741147972025
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'adamic_adar_index', 'cn_soundarajan_hopcroft') | score: 0.853768100758654
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'adamic_adar_index', 'within_inter_cluster') | score: 0.8603274298519743
train on col: ('preferential_attachment', 'common_ne

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


train on col: ('common_neighbors', 'resource_allocation_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.8427243063132585
train on col: ('common_neighbors', 'adamic_adar_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.8468944758030514
train on col: ('jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster') | score: 0.8462454420711841
train on col: ('jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'cn_soundarajan_hopcroft', 'ra_index_soundarajan_hopcroft') | score: 0.8533618158635087
train on col: ('jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.8542791509412266
train on col: ('jaccard_coef', 'resource_allocation_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.844344

In [None]:
# Evaluate the selected model on the test edges.
# .values passes a plain ndarray, matching how best_model was fitted, and the
# labels are read from the same frame as the features so rows are guaranteed
# to line up (y_test was built from an earlier version of test_df_data).
test_x = test_df_data[list(best_col)].values
test_y = test_df_data['connected'].values
y_pred = best_model.predict(test_x)
print(accuracy_score(test_y, y_pred))

0.7191286136501537


### With 6 cols

In [None]:
import itertools

# Materialise every 6-element subset of the feature names in `col` as a list of tuples.
cbn = [*itertools.combinations(col, 6)]

In [None]:
# Exhaustive search over the feature subsets in `cbn`: fit one logistic regression
# per subset on train_df_data and remember the highest-scoring model and its columns.
train_y = train_df_data['connected'].values
best_score = 0.0
best_model = None
best_col = None
for col_train in cbn:
  train_x = train_df_data[list(col_train)].values
  # The default iteration budget was exhausted before convergence (see the
  # "ITERATIONS REACHED LIMIT" warning in the recorded output), so raise max_iter.
  # If the warning persists, scaling the features is the other remedy sklearn suggests.
  model = LogisticRegression(max_iter=1000)
  model.fit(train_x, train_y)
  pred = model.predict(train_x)
  # accuracy_score's documented argument order is (y_true, y_pred).
  score = accuracy_score(train_y, pred)
  print(f'train on col: {col_train} | score: {score}')
  # NOTE(review): the winner is picked by accuracy on the same rows the model was
  # fitted on; a validation split or cross-validation would be a fairer selector.
  if score > best_score:
    best_score = score
    best_model = model
    best_col = col_train

train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'cn_soundarajan_hopcroft') | score: 0.8537553245040896
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'within_inter_cluster') | score: 0.8603325403538
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'ra_index_soundarajan_hopcroft') | score: 0.8602558828264142
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster') | score: 0.8591443486793185
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_coef', 'resource_allocation_index', 'cn_soundarajan_hopcroft', 'ra_index_soundarajan_hopcroft') | score: 0.853768100758654
train on col: ('preferential_attachment', 'common_neighbors', 'jaccard_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


train on col: ('common_neighbors', 'jaccard_coef', 'adamic_adar_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.8465418511770764
train on col: ('common_neighbors', 'resource_allocation_index', 'adamic_adar_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.8460819060127609
train on col: ('jaccard_coef', 'resource_allocation_index', 'adamic_adar_index', 'cn_soundarajan_hopcroft', 'within_inter_cluster', 'ra_index_soundarajan_hopcroft') | score: 0.8569391671415174


In [None]:
# Score the selected model on test_df_data, restricted to the winning feature subset.
# The model was fitted on train_df_data[...].values (an ndarray), so predict() is
# given an ndarray as well; passing a DataFrame here mismatches the fit input and
# makes scikit-learn warn about feature names.
test_x = test_df_data[list(best_col)].values
y_pred = best_model.predict(test_x)
# accuracy_score's documented argument order is (y_true, y_pred).
print(accuracy_score(y_test, y_pred))

0.7191123028843959


### With 2 cols

In [None]:
import itertools

# Materialise every pair of feature names drawn from `col` as a list of tuples.
cbn = [*itertools.combinations(col, 2)]

In [None]:
# Exhaustive search over the feature subsets in `cbn`: fit one logistic regression
# per subset on train_df_data and remember the highest-scoring model and its columns.
train_y = train_df_data['connected'].values
best_score = 0.0
best_model = None
best_col = None
for col_train in cbn:
  train_x = train_df_data[list(col_train)].values
  # Default max_iter is 100; this same loop hit the iteration limit on other
  # subset sizes in this notebook, so give the solver a larger budget.
  # If a ConvergenceWarning still appears, scaling the features is the other remedy.
  model = LogisticRegression(max_iter=1000)
  model.fit(train_x, train_y)
  pred = model.predict(train_x)
  # accuracy_score's documented argument order is (y_true, y_pred).
  score = accuracy_score(train_y, pred)
  print(f'train on col: {col_train} | score: {score}')
  # NOTE(review): the winner is picked by accuracy on the same rows the model was
  # fitted on; a validation split or cross-validation would be a fairer selector.
  if score > best_score:
    best_score = score
    best_model = model
    best_col = col_train

train on col: ('preferential_attachment', 'common_neighbors') | score: 0.8602124435608954
train on col: ('preferential_attachment', 'resource_allocation_index') | score: 0.5004791095461619
train on col: ('preferential_attachment', 'adamic_adar_index') | score: 0.8486141596674085
train on col: ('preferential_attachment', 'cn_soundarajan_hopcroft') | score: 0.8539827418353345
train on col: ('preferential_attachment', 'within_inter_cluster') | score: 0.7931217755927543
train on col: ('preferential_attachment', 'ra_index_soundarajan_hopcroft') | score: 0.5004791095461619
train on col: ('common_neighbors', 'jaccard_coef') | score: 0.8516421319991516
train on col: ('common_neighbors', 'resource_allocation_index') | score: 0.8405906718010175
train on col: ('common_neighbors', 'adamic_adar_index') | score: 0.8460665745072837
train on col: ('common_neighbors', 'cn_soundarajan_hopcroft') | score: 0.8634729437257092
train on col: ('common_neighbors', 'within_inter_cluster') | score: 0.86347294372

In [None]:
# Score the selected model on test_df_data, restricted to the winning feature subset.
# .values hands predict() an ndarray, the same input type used when fitting.
test_x = test_df_data[list(best_col)].values
y_pred = best_model.predict(test_x)
# accuracy_score's documented argument order is (y_true, y_pred).
print(accuracy_score(y_test, y_pred))

0.7295936009603778
