In [None]:
# 2. Import
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer

# 3. Load data
df = pd.read_csv('train.csv', skiprows=1, header=None)
df.columns = ['label', 'title', 'description']
df['content'] = df['title'] + ' ' + df['description']
df = df.head(120000)  

df['label'] = df['label'].astype(int)
labels = torch.tensor(df['label'].values) - 1

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 4. Load Sentence-BERT
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Model nhẹ, nhanh

# 5. Create document embeddings
embedding_file = 'doc_embeddings_sbert.pt'
if os.path.exists(embedding_file):
    print("Loading existing embeddings...")
    doc_embeddings = torch.load(embedding_file)
    if not isinstance(doc_embeddings, torch.Tensor):
        print(f"Error: doc_embeddings is {type(doc_embeddings)}, expected torch.Tensor. Regenerating embeddings...")
        os.remove(embedding_file)
        sentences = df['content'].tolist()
        embeddings = sbert_model.encode(sentences, show_progress_bar=True, convert_to_tensor=True)
        doc_embeddings = embeddings.cpu()
        torch.save(doc_embeddings, embedding_file)
        print(f"Saved SBERT embeddings to {embedding_file}")
else:
    print("Generating new embeddings with Sentence-BERT...")
    sentences = df['content'].tolist()
    embeddings = sbert_model.encode(sentences, show_progress_bar=True, convert_to_tensor=True)
    doc_embeddings = embeddings.cpu()
    torch.save(doc_embeddings, embedding_file)
    print(f"Saved SBERT embeddings to {embedding_file}")

In [None]:
# 6. Build vocabulary
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
X_counts = vectorizer.fit_transform(df['content'])
word_vocab = vectorizer.get_feature_names_out()
word2idx = {word: idx for idx, word in enumerate(word_vocab)}

# 7. Create word embeddings (using random init)
word_embeddings = torch.randn(len(word_vocab), doc_embeddings.size(1))  # phải cùng chiều với doc_embeddings

In [None]:
# 8. Create edges
tfidf = TfidfVectorizer(vocabulary=word_vocab)
X_tfidf = tfidf.fit_transform(df['content'])

# Word-to-Document edges (w2d) & Document-to-Word edges (d2w)
row, col, edge_weight = [], [], []
doc_offset = len(word_vocab)

for doc_idx, row_data in enumerate(X_tfidf):
    non_zero_indices = row_data.nonzero()[1]
    for word_idx in non_zero_indices:
        row.append(word_idx)
        col.append(doc_idx + doc_offset)
        edge_weight.append(X_tfidf[doc_idx, word_idx])
        row.append(doc_idx + doc_offset)
        col.append(word_idx)
        edge_weight.append(X_tfidf[doc_idx, word_idx])

In [None]:
# Word-to-Word edges (w2w) using PMI
word_count = defaultdict(int)
co_occur = defaultdict(int)
total_words = 0

for text in df['content']:
    words = text.split()
    total_words += len(words)
    for i, word in enumerate(words):
        word_count[word] += 1
        for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
            if i != j:
                co_occur[(word, words[j])] += 1

pmi_matrix = calculate_pmi(word_count, co_occur, total_words)

w2w_threshold = 0.2
for (word1, word2), pmi in pmi_matrix.items():
    if pmi > w2w_threshold:
        row.append(word2idx[word1])  # Chuyển từ thành chỉ mục
        col.append(word2idx[word2])
        edge_weight.append(pmi)
        row.append(word2idx[word2])
        col.append(word2idx[word1])
        edge_weight.append(pmi)

print(f"Number of edges created: {len(row)}")

In [None]:
# Document-to-Document edges (d2d) using document embeddings and cosine similarity
doc_embeddings_np = doc_embeddings.cpu().numpy()
doc_cos_sim = cosine_similarity(doc_embeddings_np)
d2d_threshold = 0.2
for i in range(doc_embeddings_np.shape[0]):
    for j in range(i + 1, doc_embeddings_np.shape[0]):  # Tránh lặp lại và cạnh tự nối
        if doc_cos_sim[i, j] > d2d_threshold:
            row.append(i + doc_offset)
            col.append(j + doc_offset)
            edge_weight.append(float(doc_cos_sim[i, j]))
            row.append(j + doc_offset)
            col.append(i + doc_offset)
            edge_weight.append(float(doc_cos_sim[i, j]))

print(f"Number of edges created: {len(row)}")

In [19]:
# 7. Create graph datab
edge_index = torch.tensor([row, col], dtype=torch.long)
edge_attr = torch.tensor(edge_weight, dtype=torch.float)
labels = torch.tensor(df['label'].astype(int).values) - 1  # Labels from 0 → 3

# Mask cho train/test (ở đây train hết cho đơn giản)
num_words = word_embeddings.size(0)
num_docs = doc_embeddings.size(0)

# Mask cho các node văn bản
train_mask = torch.zeros(num_words + num_docs, dtype=torch.bool)
train_mask[num_words:] = True  # Chỉ dùng doc nodes để train

# Tạo lại data
data = Data(x=torch.cat([word_embeddings, doc_embeddings], dim=0),
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=labels,
            train_mask=train_mask)

# 8. Define GCN model
class TextGCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(TextGCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight=edge_weight)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_weight=edge_weight)
        return x

# 9. Train model
input_dim = doc_embeddings.size(1)  # Thường là 384 với SBERT Mini
word_embeddings = torch.randn(len(word_vocab), input_dim)
x = torch.cat([word_embeddings, doc_embeddings], dim=0)

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr,
            y=labels, train_mask=train_mask)

model = TextGCN(input_dim, 128, 4).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(20):
    optimizer.zero_grad()
    out = model(data.x.to(device), data.edge_index.to(device), data.edge_attr.to(device))
    loss = criterion(out[data.train_mask], data.y.to(device))
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# 10. Evaluate
model.eval()
with torch.no_grad():
    logits = model(data.x.to(device), data.edge_index.to(device), data.edge_attr.to(device))
    preds = logits[data.train_mask].argmax(dim=1)
    acc = (preds == data.y.to(device)).float().mean()
    print(f'Training Accuracy: {acc:.4f}')

# 11. Save model
torch.save(model.state_dict(), 'textgcn_model.pth')

# 12. Load model
# model.load_state_dict(torch.load('textgcn_model.pth'))

Epoch 1, Loss: 1.3889
Epoch 2, Loss: 1.3380
Epoch 3, Loss: 1.2808
Epoch 4, Loss: 1.2090
Epoch 5, Loss: 1.1267
Epoch 6, Loss: 1.0389
Epoch 7, Loss: 0.9504
Epoch 8, Loss: 0.8648
Epoch 9, Loss: 0.7844
Epoch 10, Loss: 0.7111
Epoch 11, Loss: 0.6468
Epoch 12, Loss: 0.5926
Epoch 13, Loss: 0.5484
Epoch 14, Loss: 0.5128
Epoch 15, Loss: 0.4842
Epoch 16, Loss: 0.4619
Epoch 17, Loss: 0.4449
Epoch 18, Loss: 0.4322
Epoch 19, Loss: 0.4227
Epoch 20, Loss: 0.4155
Training Accuracy: 0.8631
