In [27]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import heapq
from typing import Optional, Union
import random

In [None]:
# Load the node list into a one-column frame whose column is named "node".
nodes = pd.read_csv("BlogCatalog-dataset/nodes.csv", names=["node"])
# Cell output: (rows, columns) of the node table.
nodes.shape

In [None]:
# Load the edge list; the two columns are named "node1" and "node2".
edges = pd.read_csv("BlogCatalog-dataset/edges.csv", names=["node1", "node2"])
# Cell output: (rows, columns) of the edge table.
edges.shape

In [None]:
# Load the group ids into a one-column frame whose column is named "group".
groups = pd.read_csv("BlogCatalog-dataset/groups.csv", names=["group"])
# NOTE: this is not the last expression of the cell, so a notebook does not
# display the preview it computes.
groups.head()
# Load (node, group) pairs -- presumably one row per group membership.
group_edges = pd.read_csv(
    "BlogCatalog-dataset/group-edges.csv", names=["node", "group"]
)
# Cell output: (rows, columns) of the node/group table.
group_edges.shape

In [None]:
# Number of distinct node ids that occur in group_edges.
group_edges["node"].unique().shape

In [32]:
# Random split of the node table with 80% of the rows going to nodes_test.
# No random_state is passed, so the split differs between runs.
nodes_train, nodes_test = train_test_split(nodes, test_size=0.8)

In [None]:
def get_adj_list(nodes: pd.DataFrame, edges: pd.DataFrame):
    """Build an undirected adjacency list from a node table and an edge table.

    Args:
        nodes (pd.DataFrame): Table with a ``node`` column listing every node id.
        edges (pd.DataFrame): Table with ``node1`` and ``node2`` columns, one
            row per edge.

    Returns:
        dict: Maps every node id to the list of its neighbours, in the order
        the edges appear. Each edge is recorded in both directions.

    Raises:
        KeyError: If an edge refers to a node id that is missing from ``nodes``.
    """
    adj_list = {node: [] for node in nodes["node"].tolist()}
    # Materialise both endpoint columns once and walk them in lockstep; a
    # per-row ``.iloc[i]`` lookup is far slower on a large edge list.
    endpoints = zip(edges["node1"].tolist(), edges["node2"].tolist())
    for node1, node2 in tqdm(
        endpoints, total=len(edges), desc="Formating adjacency list"
    ):
        adj_list[node1].append(node2)
        adj_list[node2].append(node1)
    return adj_list


# Build the adjacency list for the whole graph from the loaded tables.
adj_list = get_adj_list(nodes, edges)

In [39]:
def one_node_random_walk(
    adj_list: dict,
    begin_node: int,
    t: int,
):
    """Sample a single uniform random walk.

    Args:
        adj_list (dict): Adjacency list mapping a node id to the list of its
            neighbours.
        begin_node (int): Node the walk starts from.
        t (int): Maximum number of nodes in the walk.

    Returns:
        list: The visited nodes, starting with ``begin_node``. The walk holds
        ``t`` nodes unless it reaches a node without neighbours, in which
        case it stops there and is shorter. The start node is always
        included, even when ``t`` is less than 1.
    """
    sequence = [begin_node]
    current_node = begin_node
    while len(sequence) < t:
        adj_nodes = adj_list[current_node]
        if len(adj_nodes) == 0:
            # Dead end: there is nothing to sample from, so end the walk
            # early instead of letting np.random.choice raise on an empty set.
            break
        # choice(k) draws uniformly from range(k), i.e. a neighbour position.
        current_node = adj_nodes[np.random.choice(len(adj_nodes))]
        sequence.append(current_node)
    return sequence

In [None]:
# Sample and display one walk of up to 40 nodes starting from node id 1.
one_node_random_walk(
    adj_list=adj_list,
    begin_node=1,
    t=40,
)

In [None]:
# ceil(log2(number of nodes)): the number of bits needed to give every node
# a distinct binary code.
tree_height = int(np.ceil(np.log2(nodes.shape[0])))
tree_height

In [None]:
# Scratch check of zero-padded binary formatting: index 2 at width 2 -> "10".
# NOTE: this reassigns the global tree_height that was computed from the
# dataset in the previous cell.
tree_height = 2
target_idx = 2
format(target_idx, f"0{tree_height}b")

In [None]:
def get_tree_info(target_idx, tree_height):
    """Describe the root-to-leaf path of a leaf in a complete binary tree.

    The leaf index is written as a binary string zero-padded to
    ``tree_height`` digits; each digit is one branching decision, read from
    the root downwards. Internal nodes use heap numbering: the root is 0 and
    the children of node ``c`` are ``2c + 1`` (digit 0) and ``2c + 2``
    (digit 1).

    Args:
        target_idx: Index of the leaf.
        tree_height: Width the binary code is zero-padded to.

    Returns:
        tuple: ``(code, path)`` where ``code`` is the list of 0/1 digits and
        ``path`` lists, for every digit, the internal node at which that
        decision is taken. Both lists have the same length.
    """
    bits = [int(digit) for digit in format(target_idx, f"0{tree_height}b")]
    visited = []
    position = 0
    for bit in bits:
        # Record where the decision is made, then step to the chosen child.
        visited.append(position)
        position = 2 * position + 1 + bit
    return bits, visited


# Example: leaf index 2 in a tree of height 2 -> ([1, 0], [0, 2]).
get_tree_info(2, 2)

In [44]:
class Model:
    """Plain container for the two parameter tables used by the skip-gram code.

    Attributes:
        nodes_num: Number of rows in the embedding table.
        embedding_dim: Width of every parameter vector.
        embedding: ``[nodes_num, embedding_dim]`` tensor drawn from a standard
            normal distribution.
        theta_p: ``[2 * nodes_num, embedding_dim]`` tensor drawn from a
            standard normal distribution.
    """

    def __init__(self, nodes_num: int, embedding_dim: int = 128):
        self.nodes_num, self.embedding_dim = nodes_num, embedding_dim
        # Draw order matters for seeded runs: embedding first, then theta_p.
        self.embedding = torch.randn(nodes_num, embedding_dim)
        self.theta_p = torch.randn(2 * nodes_num, embedding_dim)

In [45]:
# One embedding row per row of the node table, 128 dimensions each.
model = Model(nodes_num=nodes.shape[0], embedding_dim=128)

In [132]:
def skip_gram(
    model: "Model",
    sequence: list,
    window_size: int,
    lr: float,
    tree_height,
    bias=1,
    verbose: bool = True,
):
    """Run one pass of skip-gram with hierarchical softmax over a walk.

    Every position of ``sequence`` is used as the input (centre) node; its
    context is the up-to-``window_size`` nodes on either side, truncated at
    the ends of the walk. For each (input, context) pair the context's tree
    path is looked up with ``get_tree_info`` and one manual SGD step is taken
    on the path's ``theta_p`` rows and on the input node's embedding row.

    Label convention: at each internal node, ``sigmoid(x . theta)`` is the
    probability of taking the branch coded ``0``, so the regression target
    is ``1 - code``. The reported loss and the update
    ``g = lr * (1 - code - q)`` both use this convention, so the loss is the
    quantity the updates actually minimise.

    Args:
        model: Object exposing ``embedding`` ([n, d]) and ``theta_p``
            ([m, d]) tensors; both are modified in place.
        sequence (list): Node ids of one random walk.
        window_size (int): Number of context nodes taken on each side.
        lr (float): Step size of the update.
        tree_height: Code width passed on to ``get_tree_info``.
        bias: Offset subtracted from a node id to obtain its row index
            (1 for 1-based node ids).
        verbose (bool): When true, print the mean loss of this walk.

    Returns:
        float: Sum of the per-pair losses divided by ``len(sequence)``;
        ``0.0`` when the walk is empty or yields no pairs.
    """
    loss_total = 0.0
    for i, input_node in enumerate(sequence):
        # Clamp the left edge: a negative slice start would wrap around.
        left = max(0, i - window_size)
        context_nodes = sequence[left:i] + sequence[i + 1 : i + window_size + 1]
        row = input_node - bias
        for context_node in context_nodes:
            bin_tree_code, path_nodes = get_tree_info(
                context_node - bias, tree_height=tree_height
            )
            target = 1.0 - torch.tensor(bin_tree_code, dtype=torch.float32)  # [h]
            # Snapshot both operands so the two updates below are computed
            # from the same (pre-update) parameters.
            x = model.embedding[row].clone()  # [d]
            theta = model.theta_p[path_nodes]  # [h, d] (indexing by list copies)
            q = torch.sigmoid(theta @ x)  # [h]
            loss_total += torch.nn.functional.binary_cross_entropy(q, target).item()
            g = lr * (target - q)  # [h]
            model.theta_p[path_nodes] = theta + g.unsqueeze(-1) * x.unsqueeze(0)
            # The embedding step is averaged over the path nodes.
            model.embedding[row] = x + (g.unsqueeze(-1) * theta).mean(dim=0)
    mean_loss = loss_total / len(sequence) if len(sequence) > 0 else 0.0
    if verbose:
        print(mean_loss)
    return mean_loss

In [142]:
def train(
    model,
    walks_num,
    window_size,
    t,
    nodes: list,
    adj_list,
    lr: float,
    bias: int = 1,
    min_lr: float = 0.0001,
):
    """Train ``model`` with DeepWalk: repeated random walks fed to skip-gram.

    For each of ``walks_num`` passes the node order is shuffled, one random
    walk is started from every node, and each walk is handed to
    ``skip_gram``.

    The learning rate is interpolated linearly from the initial ``lr``
    towards ``min_lr``: pass ``gamma`` (0-based) uses
    ``lr - (lr - min_lr) * gamma / walks_num``. The first pass therefore runs
    at ``lr`` and later passes approach, but do not reach, ``min_lr``.

    Args:
        model: Model updated in place by ``skip_gram``.
        walks_num: Number of passes over all nodes.
        window_size: Context window passed to ``skip_gram``.
        t: Maximum walk length passed to ``one_node_random_walk``.
        nodes (list): Node ids to start walks from; not modified.
        adj_list: Adjacency list passed to ``one_node_random_walk``.
        lr (float): Initial learning rate.
        bias (int): Node-id-to-row offset passed to ``skip_gram``.
        min_lr (float): Learning rate the schedule decays towards.

    Returns:
        None
    """
    node_list = list(nodes)  # private copy: shuffling must not touch the input
    if not node_list:
        # Nothing to walk from; log2(0) below would also overflow.
        return
    tree_height = int(np.ceil(np.log2(len(node_list))))
    for gamma in tqdm(range(walks_num), desc="Deep Walk"):
        # Derive the rate from the *initial* lr each pass so the decay is
        # linear rather than compounding on an already-decayed value.
        current_lr = lr - (lr - min_lr) * gamma / walks_num
        random.shuffle(node_list)
        for node in node_list:
            sequence = one_node_random_walk(
                adj_list=adj_list,
                begin_node=node,
                t=t,
            )
            skip_gram(
                model=model,
                sequence=sequence,
                window_size=window_size,
                lr=current_lr,
                tree_height=tree_height,
                bias=bias,
            )

In [None]:
# Start from freshly initialised parameters, then run 30 passes of walks of
# length 40 with a context window of 10 on each side.
model = Model(nodes_num=nodes.shape[0], embedding_dim=128)
train(
    model=model,
    walks_num=30,
    window_size=10,
    t=40,
    nodes=nodes["node"].tolist(),
    adj_list=adj_list,
    lr=0.025,
    bias=1,  # node ids are shifted by 1 to obtain row indices
    min_lr=0.025,  # equal to lr, so the learning rate never decays in this run
)

In [34]:
# Train and evaluate the classification model
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [None]:
import torch
from model import SkipGramHierarchicalSoftmaxModel

# Rebuild the model with the same sizes, then restore the saved parameters.
# NOTE: this rebinds `model`, which earlier cells use for the hand-written Model.
model = SkipGramHierarchicalSoftmaxModel(nodes_num=nodes.shape[0], embedding_dim=128)
# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
# machine; the embeddings are converted to NumPy afterwards, which needs CPU
# tensors anyway.
# SECURITY: torch.load unpickles the file -- only load checkpoints you trust
# (recent PyTorch versions also accept weights_only=True to restrict this).
model.load_state_dict(torch.load('model_save/model.pth', map_location="cpu"))

In [37]:
# Detach the embedding weights from autograd and expose them as a NumPy array.
nodes_features = model.embedding.weight.detach().numpy()  # [n,d]

In [None]:
# Largest single entry of the embedding matrix.
nodes_features.max()

In [None]:
# Multi-hot label matrix: labels[node - 1, group - 1] == 1 for every
# (node, group) row of group_edges; ids are shifted by 1 to get 0-based indices.
labels = np.zeros([nodes.shape[0], groups.shape[0]])
# A single vectorised assignment replaces a Python loop of per-row .iloc lookups.
labels[
    group_edges["node"].to_numpy() - 1,
    group_edges["group"].to_numpy() - 1,
] = 1
labels  # [n,g]

In [None]:
X = nodes_features
y = labels
# Hold out 70% of the nodes for evaluation. The split is seeded so the
# reported scores are reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=0
)

# One independent binary logistic regression per group (one-vs-rest).
base_classifier = LogisticRegression(solver="liblinear")
ovr_classifier = OneVsRestClassifier(base_classifier)
ovr_classifier.fit(X_train, y_train)

In [None]:
# Predict the label matrix for the held-out rows and score it.
y_pred = ovr_classifier.predict(X_test)
# Micro-averaged F1 (average="micro").
f1_micro = f1_score(y_test, y_pred, average="micro")
# Macro-averaged F1 (average="macro").
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f"F1-score (Micro): {f1_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")

In [None]:
# Display the predicted label matrix for the held-out rows.
y_pred

In [None]:
import torch

# Scratch tensor: the integers 0..11 arranged as 2 rows of 6.
a = torch.arange(0, 12).reshape(2, 6)
a

In [None]:
# Integer positions to look up: 2 batches x 3 groups x 2 positions each.
pos = [
    [[2, 1], [0, 1], [0, 1]],
    [[2, 1], [1, 0], [0, 1]],
]
pos = torch.tensor(pos)
pos.shape

In [10]:
# Batched lookup via advanced indexing: the row index has shape [2, 1, 1] and
# broadcasts against pos ([2, 3, 2]), so s[b, i, j] == a[b, pos[b, i, j]].
s = a[torch.arange(0, 2).unsqueeze(-1).unsqueeze(-1), pos]

In [None]:
# Shape of the lookup result.
s.shape

In [None]:
# Values looked up for the second batch entry.
s[1]

In [None]:
# Out-of-place scatter: write 1.0 at out[i, j, index[i, j]] for the first two
# rows of each batch of a zero [2, 3, 6] tensor. torch.scatter needs a source
# (a `src` tensor or a scalar `value`); without one the call raises TypeError.
torch.scatter(
    torch.zeros([2, 3, 6]),
    dim=2,
    index=torch.tensor([[1, 2], [3, 4]]).unsqueeze(2),
    value=1.0,
)

In [None]:
# In-place variant: index becomes [2, 2, 1] after unsqueeze, so 1.0 is written
# at t[i, j, index[i, j]] for j in {0, 1}; the third row of each batch stays 0.
index = torch.tensor([[1,2],[3,4]])
t = torch.zeros([2,3,6])
t.scatter_(2, index.unsqueeze(2), 1.0)

In [None]:
# Merge the two leading dimensions: [2, 3, 6] -> [6, 6].
t.view(-1,6)