<a href="https://colab.research.google.com/github/msafari18/Master_project/blob/main/master_project_DeepWalk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec
from random import shuffle
import warnings
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


%matplotlib inline

In [10]:
def read_data(paths):
  df_edges = pd.read_csv(paths[0])
  df_edges.columns = ["source", "target"]
  df_labels = pd.read_csv(paths[1])
  df_labels.columns = ['node','label']

  return df_edges, df_labels

def pre_process_data(netwrok_data):
  
  source = netwrok_data['source']
  target = netwrok_data['target']
  
  edges_list = []
  for i, j in zip(source, target):
    edges_list.append((i,j))
  
  return edges_list

def create_network(network_edges_list):
  
  G = nx.Graph()
  G.add_edges_from(network_edges_list)
  print("number of nodes : ",G.number_of_nodes())

  return G

def get_randomwalk_v1(network ,node, path_length):
    
  path = [node]
  for counter in range(1, path_length): 
      vertex_neighbors = [n for n in network.neighbors(node)]
      probability = []
      probability = probability + [1./len(vertex_neighbors)] * len(vertex_neighbors)
      node = np.random.choice(vertex_neighbors, p=probability)
      path.append(node)
  return path

def generate_randomwalk(network, w = 10, lambdaa = 80):
  
  all_nodes = list(network.nodes())

  random_walks = []
  for n in tqdm(all_nodes):
    for i in range(lambdaa):
      random_walks.append(get_randomwalk_v1(network,n,w))
        
  print("total random walks: ", len(random_walks))
  return random_walks

def pre_process_random_walks(random_walks):
  
  random_walks_str = []
  for i in random_walks:
    temp = []
    for j in i:
      x.append(str(j))
    random_walks_str.append(x)

  shuffle(rand_w_str)  
  return random_walks_str

def train_skip_gram(random_walks_str, epochs = 10, d = 128, negative_smapling = 12, window = 8):
  
  warnings.filterwarnings('ignore')

  model = Word2Vec(window = window, sg = 1, hs = 0,
                  negative = negative_smapling,
                  alpha=0.03, min_alpha=0.0007, size = d, seed = 14, callbacks=[callback()])

  model.build_vocab(random_walks_str, progress_per=2)

  x = model.train(random_walks_str, total_examples = model.corpus_count, epochs= epochs, report_delay=1)

  return model


class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss


def get_feature(label_data, trained_sg_model, test_size = 0.1):
  
  node = list(df_labels['node'])
  label = list(df_labels['label'])
  y = []
  X = []
  for i in all_nodes:
    # if label[node.index(i)] != 8:
    y.append(label[node.index(i)])
    X.append(trained_sg_model[str(i)])
    
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

  return X_train, X_test, y_train, y_test

def classifier(X_train, y_train):  

  clf = svm.SVC(kernel = 'rbf')
  clf.fit(X_train, y_train)
  print("train acc: ",clf.score(X_train, y_train))
  return clf

def validate_model(classifier, X_test, y_test):

  y_pred = classifier.predict(X_test)
  f1_micro = f1_score(y_test, y_pred, average='micro')
  f1_macro = f1_score(y_test, y_pred, average='macro')

  print("f1_micro: ", f1_micro, "f1_macro: ", f1_macro)


  

In [None]:
df_edges, df_labels = read_data(["/content/edges.csv", "/content/group-edges.csv"])
edges_list = pre_process_data(df_edges)
network = create_network(edges_list)
random_walks = generate_randomwalk(network)
random_walks_str = pre_process_random_walks(random_walks)
sg_model = train_skip_gram(random_walks_str)
X_train, X_test, y_train, y_test = get_feature(df_labels, sg_model)
classifier_model = classifier(X_train, y_train)
validate_model(classifier_model, X_test, y_test)

  0%|          | 0/10312 [00:00<?, ?it/s]

number of nodes :  10312


 85%|████████▌ | 8809/10312 [13:08<02:22, 10.56it/s]

In [None]:
G = nx.Graph()
G.add_edges_from(edges)

In [None]:
G.number_of_nodes()

10312

In [None]:
all_nodes = list(G.nodes())

random_walks = []
for n in tqdm(all_nodes):
    for i in range(80):
        random_walks.append(get_randomwalk(n,10))
        
len(random_walks)

100%|██████████| 10312/10312 [16:46<00:00, 10.25it/s]


824960

In [None]:
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec
from random import shuffle

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

rand_w_str = []
for i in random_walks:
  x = []
  for j in i:
    x.append(str(j))
  rand_w_str.append(x)

# print(rand_w_str[0])
shuffle(rand_w_str)
import warnings
warnings.filterwarnings('ignore')

model = Word2Vec(window = 30, sg = 1, hs = 0,
                 negative = 12,
                 alpha=0.03, min_alpha=0.0007, size = 128, seed = 14, callbacks=[callback()])

model.build_vocab(rand_w_str, progress_per=2)

x = model.train(rand_w_str, total_examples = model.corpus_count, epochs=3, report_delay=1)


Loss after epoch 0: 0.0
Loss after epoch 1: 0.0
Loss after epoch 2: 0.0


In [None]:
model.similar_by_word('1')

[('400', 0.5817173719406128),
 ('23', 0.5296919941902161),
 ('1271', 0.5135620832443237),
 ('780', 0.49335840344429016),
 ('450', 0.4902195334434509),
 ('6552', 0.4853951334953308),
 ('175', 0.4767530560493469),
 ('1741', 0.4727059006690979),
 ('742', 0.470461905002594),
 ('884', 0.4609057605266571)]

In [None]:
print(len(model['1']))

128


In [None]:
from sklearn import svm

from sklearn.model_selection import train_test_split


node = list(df_labels['node'])
label = list(df_labels['label'])
y = []
X = []
for i in all_nodes:
  # if label[node.index(i)] != 8:
  y.append(label[node.index(i)])
  X.append(model[str(i)])
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
  
clf = svm.SVC(kernel = 'rbf')
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))


from sklearn.metrics import f1_score
y_pred = clf.predict(X_test)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f1_micro, f1_macro)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
print(clf.score(X_train, y_train))

0.6044181034482758


In [None]:
from sklearn.metrics import f1_score
y_pred = clf.predict(X_test)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f1_micro, f1_macro)

0.3507751937984496 0.2006500453443873
