In [5]:
import scipy.sparse as sp
import scipy.io as sio
import pandas as pd
import numpy as np

def load_mat(dataset, train_rate=0.3, val_rate=0.1):
    """Read a processed dataset (``.mat`` graph data plus ``.csv`` text) from disk.

    Both files are looked up under ``./data/processed/<dataset>/`` and must be
    named ``<dataset>.mat`` and ``<dataset>.csv``.

    Parameters
    ----------
    dataset : str
        Dataset name; used as both the directory and the file stem.
    train_rate, val_rate : float
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(adj, feat, ano_labels, text)`` where ``adj`` is the network as a CSR
        matrix, ``feat`` the attributes as a LIL matrix, ``ano_labels`` the
        squeezed label array and ``text`` the DataFrame read from the CSV.
    """
    mat = sio.loadmat("./data/processed/{}/{}.mat".format(dataset, dataset))
    text = pd.read_csv('./data/processed/{}/{}.csv'.format(dataset, dataset))

    def _entry(primary, fallback):
        # The .mat files come in two key conventions; prefer the primary key.
        if primary in mat:
            return mat[primary]
        return mat[fallback]

    raw_label = _entry('Label', 'gnd')
    raw_attr = _entry('Attributes', 'X')
    raw_network = _entry('Network', 'A')

    adj = sp.csr_matrix(raw_network)
    feat = sp.lil_matrix(raw_attr)
    ano_labels = np.squeeze(np.array(raw_label))

    return adj, feat, ano_labels, text

In [6]:
from torch.utils.data import DataLoader
from sklearn import preprocessing
import numpy as np
import pandas as pd
import argparse
import torch
from random import sample
import random
import time
from model import CMUCL, tokenize
from data import DataHelper
from sklearn import preprocessing
import dgl

import scipy.sparse as sp
import scipy.io as sio
import networkx as nx
from sklearn.metrics import roc_auc_score

In [7]:
def adj_to_dgl_graph(adj):
    """Build a DGL graph from a SciPy sparse adjacency matrix, going through NetworkX."""
    return dgl.DGLGraph(nx.from_scipy_sparse_array(adj))

def preprocess_features_ndarray(features):
    """Row-normalize a dense feature matrix so that every row sums to 1.

    Parameters
    ----------
    features : array-like, shape (n_rows, n_cols)
        Dense 2-D feature matrix. Integer and boolean inputs are promoted to
        float64 first; floating inputs keep their dtype.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape in which each row has been divided by
        its sum. Rows whose sum is zero are returned as all zeros. The input
        is not modified.
    """
    features = np.asarray(features)
    if not np.issubdtype(features.dtype, np.inexact):
        # np.power(int_array, -1) raises ValueError ("Integers to negative
        # integer powers are not allowed"), so work in floating point.
        features = features.astype(np.float64)

    rowsum = features.sum(axis=1)  # sum of each row
    with np.errstate(divide='ignore'):
        # Zero-sum rows produce inf here; they are zeroed on the next line.
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.

    # Scale each row by broadcasting instead of materialising a dense
    # (n_rows, n_rows) diagonal matrix.
    return features * r_inv[:, np.newaxis]

def position_encoding(max_len, emb_size):
    """Build a sinusoidal position-encoding table.

    Even columns hold ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)``
    with ``w_k = exp(-2k * ln(10000) / emb_size)``.

    Parameters
    ----------
    max_len : int
        Number of positions (rows).
    emb_size : int
        Embedding width (columns). Both even and odd sizes are supported.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(max_len, emb_size)``.
    """
    pe = np.zeros((max_len, emb_size), dtype=np.float32)
    position = np.arange(0, max_len, dtype=np.float32)[:, np.newaxis]

    div_term = np.exp(np.arange(0, emb_size, 2) * -(np.log(10000.0) / emb_size))
    angles = position * div_term

    pe[:, 0::2] = np.sin(angles)
    # For an odd emb_size there is one fewer odd column than even columns, so
    # trim the frequencies to match; for an even emb_size the slice is a no-op.
    pe[:, 1::2] = np.cos(angles[:, :emb_size // 2])

    return pe

In [8]:
# Run configuration and data loading for the rest of the notebook.
device = 'cpu'
dataset = 'Citeseer'
adj, features, ano_label, text = load_mat(dataset) 

# Convert the sparse adjacency matrix into a DGL graph and record its node count.
dgl_graph = adj_to_dgl_graph(adj)
num_nodes = dgl_graph.number_of_nodes()

# Edge list in two forms: a 2-row NumPy array (source row, destination row)
# and the same pair stacked into a torch tensor on `device`.
arr_edge_index = np.vstack((dgl_graph.edges()[0].numpy(), dgl_graph.edges()[1].numpy()))
edge_index = torch.stack(dgl_graph.edges()).to(device)

# Densify the attribute matrix, then standardize it with StandardScaler.
node_f = features.toarray()

node_f = preprocessing.StandardScaler().fit_transform(node_f)

# node_feat is explicitly cast to float32 via .float().
node_feat = torch.from_numpy(node_f).float()
# NOTE(review): node_f is rebound from a NumPy array to a torch tensor here and,
# unlike node_feat, is not cast with .float() -- confirm the dtype is intended.
node_f = torch.from_numpy(node_f).to(device)

# The CSV's 'text' column as a NumPy array.
tit_list = text['text'].to_numpy()

# NOTE(review): `start` is not read by any of the visible cells.
start = time.perf_counter()



In [9]:
# Re-load the dataset; this rebinds adj, features, ano_label and text.
# NOTE(review): this repeats the load_mat call from the setup cell -- likely redundant.
adj, features, ano_label, text = load_mat(dataset) 

In [11]:
dataset = 'Citeseer'
adj_bad, attrs_sp, label, text = load_mat(dataset)

# NOTE: everything below assumes the edge list taken from dgl_graph.edges() is valid.
dgl_graph = adj_to_dgl_graph(adj_bad)
src = dgl_graph.edges()[0].numpy()
dst = dgl_graph.edges()[1].numpy()
arr_edge_index = np.vstack((src, dst))

# ---- 2) arr_edge_index -> A (symmetrized, zero diagonal) ----
src, dst = arr_edge_index
# Take the node count from the graph rather than from the largest edge
# endpoint: max(src, dst) + 1 undercounts when the highest-index nodes are
# isolated (A would then have fewer rows than the feature matrix) and raises
# on a graph with no edges.
num_nodes = dgl_graph.number_of_nodes()

A = sp.csr_matrix(
    (np.ones_like(src, dtype=np.float32), (src, dst)),
    shape=(num_nodes, num_nodes)
)
A = A.maximum(A.T)          # symmetrize (the graph is treated as undirected)
A = A.tolil()
A.setdiag(0.0)              # reset the diagonal to 0 (drop self-loops)
A = A.tocsr()

A_label = A + sp.eye(num_nodes, dtype=np.float32, format='csr')  # the target is A + I

# ---- 3) scipy.sparse → torch.sparse ----
def sp_to_torch_sparse(m):
    """Convert a SciPy sparse matrix into a torch sparse COO tensor.

    Indices are stored as int64 and values as float32; the result has the
    same shape as ``m`` and is not coalesced.
    """
    as_coo = m.tocoo()
    rows_and_cols = np.vstack([as_coo.row, as_coo.col])
    indices = torch.tensor(rows_and_cols, dtype=torch.long)
    values = torch.tensor(as_coo.data, dtype=torch.float32)
    shape = torch.Size(as_coo.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

# Coalesced torch sparse tensors for the adjacency A and the target A + I.
# NOTE(review): `adj` was a SciPy matrix in earlier cells and is rebound to a
# torch sparse tensor here.
adj = sp_to_torch_sparse(A).coalesce()
adj_label = sp_to_torch_sparse(A_label).coalesce()

# ---- 4) Feature matrix X: standardized dense tensor ----
# attrs_sp: (N, F) scipy.sparse
X_np = attrs_sp.toarray()
X_np = preprocessing.StandardScaler().fit_transform(X_np)
X = torch.tensor(X_np, dtype=torch.float32)




In [12]:
# Pull the CSV's 'text' column out as a NumPy array and display it.
tit_list = text['text'].to_numpy()
tit_list

array(["Argument in Multi-Agent Systems Multi-agent systems research is concerned both with the modelling of human and animal societies and with the development of principles for the design of practical distributed information management systems. This position paper will, rather than examine the various dierences in perspective within this area of research, discuss issues of communication and commitment that are of interest to multi-agent systems research in general. 1 Introduction A computational society is a collection of autonomous agents that are loosely dependent upon each other. The intentional stance [12] is often taken in describing the state of these agents. An agent may have beliefs, desires, intentions, and it may adopt a role or have relationships with others. Thus, multi-agent systems (MAS) as with most AI research is signi cantly inuenced, at least in its vocabulary, by philosophy and cognitive psychology. 1 So, what's the point? Computational societies are developed for 

In [13]:
from typing import Any, Union, List
from simple_tokenizer import SimpleTokenizer as _Tokenizer
# Module-level tokenizer instance shared by tokenize() below.
_tokenizer = _Tokenizer()

def tokenize(texts: Union[str, List[str]], context_length: int = 128, truncate: bool = True) -> torch.LongTensor:
    """
    Encode one or more strings into a zero-padded matrix of token ids.

    Each text is wrapped as ``<|startoftext|>`` + encoded tokens +
    ``<|endoftext|>`` and written left-aligned into its row; the remainder of
    the row stays 0.

    Parameters
    ----------
    texts : Union[str, List[str]]
        A single string or an iterable of strings to tokenize

    context_length : int
        Number of token slots per row (defaults to 128)

    truncate: bool
        If True, sequences longer than ``context_length`` are cut to fit and
        their last kept token is replaced with the end-of-text token; if
        False, such a sequence raises ``RuntimeError``

    Returns
    -------
    A two-dimensional long tensor, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    vocab = _tokenizer.encoder
    sot_token = vocab["<|startoftext|>"]
    eot_token = vocab["<|endoftext|>"]

    # Encode everything up front so the output matrix can be sized in one go.
    all_tokens = []
    for text in texts:
        all_tokens.append([sot_token] + _tokenizer.encode(text) + [eot_token])

    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        too_long = len(tokens) > context_length
        if too_long and not truncate:
            raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
        if too_long:
            # Keep the first context_length tokens and force the final one to
            # be the end-of-text marker.
            tokens = tokens[:context_length]
            tokens[-1] = eot_token

        result[i, :len(tokens)] = torch.tensor(tokens)

    return result

In [14]:
# Number of token slots per text passed to tokenize().
context_length = 128

In [15]:
# Tokenize every entry of tit_list into a (num_texts, context_length) id matrix.
text_token = tokenize(tit_list, context_length=context_length)

In [16]:
# Sanity check: one row per text, context_length columns.
text_token.shape

torch.Size([3186, 128])

In [17]:
from sentence_transformers import SentenceTransformer
import numpy as np, torch

# Load the "all-MiniLM-L6-v2" SentenceTransformer model used to embed the texts.
st = SentenceTransformer("all-MiniLM-L6-v2")



In [18]:
# Encode every text into a NumPy embedding matrix, requesting normalized embeddings.
emb = st.encode(tit_list.tolist(),
                    convert_to_numpy=True, normalize_embeddings=True)

In [19]:
# Sanity check: (number of texts, embedding dimension).
emb.shape

(3186, 384)

In [None]:
# 0. tokenizer??
# 1. 텍스트 인코더 (text encoder)
# 2. 텍스트 임베딩 (text embedding)
# 3. 기존 모델과 결합 -> concat (combine with the existing model via concatenation)