In [1]:
!pip install torch torchvision torchaudio
!pip install torch-geometric
!pip install sentence-transformers scikit-learn pandas tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [7]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np
from torch_geometric.utils import dense_to_sparse

# === Load and prepare the data ===
df = pd.read_csv('students_interests.csv')  # adjust to the location of your CSV
texts = df.iloc[:, 0].astype(str).tolist()   # first column is used as the input text
labels = df.iloc[:, 1].astype(str).tolist()  # second column is used as the class label

# Encode the string labels as integer class ids.
le = LabelEncoder()
y = torch.tensor(le.fit_transform(labels), dtype=torch.long)

# Text -> sentence embeddings (multilingual checkpoint; the original note says it
# was chosen for Russian support). The encoder has its own name so it is not lost
# when `model` is later bound to the GATv2 classifier.
encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = encoder.encode(texts, show_progress_bar=True)
X = torch.tensor(embeddings, dtype=torch.float)

# Build the graph: two samples are connected when the cosine similarity of their
# embeddings is strictly above the threshold.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(embeddings)
threshold = 0.6
adj_matrix = (similarity > threshold).astype(int)
np.fill_diagonal(adj_matrix, 0)  # a sample must not be linked to itself here

# dense_to_sparse returns (edge_index, edge_attr); only the edge index is needed.
edge_index = dense_to_sparse(torch.tensor(adj_matrix))[0]
data = Data(x=X, edge_index=edge_index, y=y)

# Stratified 80/20 split over node indices, stored on `data` as boolean node masks.
train_idx, test_idx = train_test_split(range(len(data.y)), test_size=0.2, stratify=data.y, random_state=42)
data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.train_mask[train_idx] = True
data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.test_mask[test_idx] = True

# === GATv2 model ===
class GATv2Net(torch.nn.Module):
    """Two-layer GATv2 network that returns one row of raw scores per node.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output size of the first attention layer.
        out_channels: Output size of the second layer.
        heads: Number of attention heads in the first layer; the second layer
            is sized for their concatenated output (``hidden_channels * heads``).
        dropout: Probability used for the feature dropout and also passed to
            both attention layers.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=4, dropout=0.6):
        super().__init__()
        first_layer_width = hidden_channels * heads
        self.dropout = torch.nn.Dropout(dropout)
        self.gat1 = GATv2Conv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.gat2 = GATv2Conv(first_layer_width, out_channels, heads=1, concat=False, dropout=dropout)

    def forward(self, x, edge_index):
        """Run dropout -> gat1 -> ELU -> dropout -> gat2 and return the result."""
        hidden = self.gat1(self.dropout(x), edge_index)
        hidden = self.dropout(F.elu(hidden))
        return self.gat2(hidden, edge_index)

# Model initialisation
# Seed torch's RNG first: weight initialisation and dropout are random, so without
# a seed the reported metrics change from run to run even though the train/test
# split is fixed (GPU kernels may still add small nondeterminism).
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 8 hidden channels per attention head; one output score per encoded class.
model = GATv2Net(data.num_node_features, 8, len(le.classes_)).to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training
def train():
    """Run one full-graph optimisation step and return the training loss as a float.

    Uses the module-level `model`, `optimizer` and `data`; the cross-entropy loss
    is computed only on the nodes selected by `data.train_mask`.
    """
    model.train()
    optimizer.zero_grad()

    train_nodes = data.train_mask
    logits = model(data.x, data.edge_index)
    step_loss = F.cross_entropy(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def test():
    """Evaluate the module-level `model` on the nodes selected by `data.test_mask`.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are macro-averaged over the classes.
    """
    model.eval()
    # Inference only: skip autograd bookkeeping so no graph is built or retained.
    with torch.no_grad():
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_pred = pred[data.test_mask].cpu().numpy()
    test_true = data.y[data.test_mask].cpu().numpy()
    acc = accuracy_score(test_true, test_pred)
    # zero_division=0 keeps the score of an undefined class at 0 (the value the
    # default already yields) without emitting UndefinedMetricWarning.
    prec = precision_score(test_true, test_pred, average='macro', zero_division=0)
    rec = recall_score(test_true, test_pred, average='macro', zero_division=0)
    f1 = f1_score(test_true, test_pred, average='macro', zero_division=0)
    return acc, prec, rec, f1

# Training loop: 100 epochs, reporting test metrics on every 10th epoch.
for epoch in range(1, 101):
    loss = train()
    if epoch % 10 != 0:
        continue  # not a reporting epoch
    acc, prec, rec, f1 = test()
    print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}')

# Final evaluation after the last epoch.
acc, prec, rec, f1 = test()
print("\nüìä Final Metrics (GATv2):")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 010, Loss: 1.0070, Acc: 0.9211, Precision: 0.9187, Recall: 0.9229, F1: 0.9185
Epoch 020, Loss: 0.8311, Acc: 0.8947, Precision: 0.9152, Recall: 0.8979, F1: 0.9018
Epoch 030, Loss: 0.6852, Acc: 0.8947, Precision: 0.9083, Recall: 0.8979, F1: 0.8990
Epoch 040, Loss: 0.5463, Acc: 0.8684, Precision: 0.8958, Recall: 0.8667, F1: 0.8731
Epoch 050, Loss: 0.6030, Acc: 0.8947, Precision: 0.9021, Recall: 0.8979, F1: 0.8955
Epoch 060, Loss: 0.6309, Acc: 0.8947, Precision: 0.9021, Recall: 0.8979, F1: 0.8955
Epoch 070, Loss: 0.5501, Acc: 0.8684, Precision: 0.8958, Recall: 0.8667, F1: 0.8731
Epoch 080, Loss: 0.6245, Acc: 0.8684, Precision: 0.8798, Recall: 0.8667, F1: 0.8656
Epoch 090, Loss: 0.5523, Acc: 0.8947, Precision: 0.9021, Recall: 0.8979, F1: 0.8955
Epoch 100, Loss: 0.5548, Acc: 0.8947, Precision: 0.9021, Recall: 0.8979, F1: 0.8955

üìä Final Metrics (GATv2):
Accuracy:  0.8947
Precision: 0.9021
Recall:    0.8979
F1 Score:  0.8955
