In [None]:
# Mount Google Drive at /content/drive (Colab-only); the later cells read the
# dataset files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the packages used by this notebook into the current runtime.
# NOTE(review): `nlp` is not imported by any cell visible here — confirm it is still needed.
!pip install torch numpy scikit-learn tqdm nlp


Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [None]:
# Restart the Colab session before running this cell.
# Force-reinstalls the newest available torch, torchvision and torchaudio wheels.
!pip install --upgrade --force-reinstall torch torchvision torchaudio


Collecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_

In [None]:

# Print the torch version that is actually importable in this runtime.
import torch
print(torch.__version__)

2.7.1+cu126


In [None]:
# Banner plus the imports needed by the Part 1 (most-frequent-tag) baseline.
print("--------- PART 1: MOST FREQUENT TAGGER ---------")
import json
from collections import defaultdict, Counter
from sklearn.metrics import classification_report, accuracy_score
import warnings
# This filter is process-wide: once this cell has run, UserWarning-category
# warnings are hidden in every later cell as well.
# NOTE(review): presumably meant to hide scikit-learn's undefined-metric
# warnings for tags with zero support — confirm.
warnings.filterwarnings("ignore", category=UserWarning)

def load_jsonlines(path):
    """Read a JSON Lines file and return its records as a list.

    The file is opened as UTF-8. Every line that is non-empty after
    whitespace stripping is parsed with ``json.loads``; blank lines are
    skipped. Records are returned in file order.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

def train_most_freq_tagger(train_path):
    """Build the most-frequent-tag lookup from a JSON Lines training file.

    Each record must provide parallel ``'words'`` and ``'labels'`` lists.

    Returns:
        (word_most_freq, fallback_tag) where ``word_most_freq`` maps every
        training word to the tag it was seen with most often, and
        ``fallback_tag`` is the tag used for words missing from that mapping.

    Note:
        The fallback is chosen by counting each distinct (word, tag) pair
        once, i.e. it is the tag attached to the largest number of word
        types, not the tag with the most token occurrences.

    Raises:
        IndexError: if the training file yields no (word, tag) pairs.
    """
    tag_counts_by_word = {}
    for sentence in load_jsonlines(train_path):
        for token, label in zip(sentence['words'], sentence['labels']):
            tag_counts_by_word.setdefault(token, Counter())[label] += 1

    word_most_freq = {}
    type_votes = Counter()
    for token, counts in tag_counts_by_word.items():
        word_most_freq[token] = counts.most_common(1)[0][0]
        # One vote per tag this word type was ever seen with.
        type_votes.update(counts.keys())

    fallback_tag = type_votes.most_common(1)[0][0]
    return word_most_freq, fallback_tag

def test_most_freq_tagger(test_path, word_most_freq, fallback_tag):
    test_data = load_jsonlines(test_path)
    y_true, y_pred = [], []
    for item in test_data:
        for word, tag in zip(item['words'], item['labels']):
            y_true.append(tag)
            y_pred.append(word_most_freq.get(word, fallback_tag))
    print(classification_report(y_true, y_pred, digits=3))
    print('Accuracy:', accuracy_score(y_true, y_pred))

# Part 1 driver: build the lookup from the training file, then print the
# baseline's metrics on the test file.
if __name__ == '__main__':
    # Both files live on the Google Drive mount created in the first cell.
    train_path = '/content/drive/MyDrive/nlp2/train.json'
    test_path = '/content/drive/MyDrive/nlp2/test.json'
    word_most_freq, fallback_tag = train_most_freq_tagger(train_path)
    test_most_freq_tagger(test_path, word_most_freq, fallback_tag)


--------- PART 1: MOST FREQUENT TAGGER ---------
              precision    recall  f1-score   support

           #      1.000     1.000     1.000        15
           $      1.000     1.000     1.000       329
          ''      1.000     0.986     0.993       208
           (      0.000     0.000     0.000         0
           )      0.000     0.000     0.000         0
           ,      1.000     1.000     1.000      1790
       -LRB-      0.000     0.000     0.000        53
      -NONE-      1.000     0.998     0.999      2486
       -RRB-      0.000     0.000     0.000        56
           .      1.000     1.000     1.000      1466
           :      1.000     1.000     1.000       158
          CC      0.996     0.994     0.995       829
          CD      0.996     0.828     0.904      1724
          DT      0.992     0.989     0.990      2979
          EX      0.935     1.000     0.967        29
          FW      1.000     1.000     1.000         1
          IN      0.945     0.99

In [None]:
# Banner plus the imports needed by the Part 2 (GRU) tagger. `json` and the
# sklearn metrics are imported again here (they also appear in the Part 1
# cell) so this cell does not depend on Part 1 having been run.
print("--------- PART 2: GRU-BASED TAGGER ---------")
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from tqdm import tqdm

# 1. Data Preprocessing
def load_jsonlines(filename):
    """Parse a UTF-8 JSON Lines file into a list of records.

    Lines that are empty after whitespace stripping are ignored; all other
    lines are decoded with ``json.loads`` and returned in file order.
    """
    with open(filename, 'r', encoding='utf-8') as stream:
        # filter(str.strip, ...) keeps only lines with non-whitespace content;
        # list() forces the lazy pipeline to run while the file is still open.
        return list(map(json.loads, filter(str.strip, stream)))


def build_vocab(data, min_freq=1):
    """Build word and tag index mappings from tagged sentences.

    Args:
        data: iterable of records, each with parallel ``'words'`` and
            ``'labels'`` lists.
        min_freq: minimum number of occurrences a word needs in ``data`` to
            receive its own index. Rarer words are left out, so
            ``encode_sentence`` maps them to ``'<UNK>'``. With the default of
            1 every word is kept; in that case ``'<UNK>'`` never appears in
            data encoded from this same corpus, so pass ``min_freq >= 2`` if
            the ``'<UNK>'`` embedding should receive training signal.

    Returns:
        (word2idx, tag2idx, idx2tag): ``word2idx`` reserves index 0 for
        ``'<PAD>'`` and 1 for ``'<UNK>'`` and numbers the remaining words in
        order of first appearance; ``tag2idx`` numbers tags in order of first
        appearance and ``idx2tag`` is its inverse as a list.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Materialise once: the records are walked twice (count, then index), and
    # a one-shot iterator would otherwise be exhausted by the counting pass.
    data = list(data)
    word_counts = Counter(w for item in data for w in item['words'])

    word2idx = {'<PAD>': 0, '<UNK>': 1}
    tag2idx = {}
    idx2tag = []
    for item in data:
        for w in item['words']:
            if w not in word2idx and word_counts[w] >= min_freq:
                word2idx[w] = len(word2idx)
        for t in item['labels']:
            if t not in tag2idx:
                tag2idx[t] = len(tag2idx)
                idx2tag.append(t)
    return word2idx, tag2idx, idx2tag

def encode_sentence(words, labels, word2idx, tag2idx):
    """Convert one sentence's words and tags to integer id lists.

    Words missing from ``word2idx`` are mapped to the ``'<UNK>'`` index.
    Tags are looked up directly, so a tag missing from ``tag2idx`` raises
    ``KeyError``.

    Returns:
        (word_ids, tag_ids) as two Python lists.
    """
    word_ids = []
    for token in words:
        word_ids.append(word2idx.get(token, word2idx['<UNK>']))

    tag_ids = []
    for label in labels:
        tag_ids.append(tag2idx[label])

    return word_ids, tag_ids

class PosDataset(Dataset):
    """Fixed-length tagged sentences for the tagger.

    Every sentence is encoded with ``encode_sentence`` and then brought to
    exactly ``max_len`` positions: shorter sentences are right-padded (word
    ids with the ``'<PAD>'`` index, tag ids with -1), longer ones are cut off
    after ``max_len`` tokens. ``lengths`` stores the number of real tokens
    kept for each sentence.
    """

    def __init__(self, data, word2idx, tag2idx, max_len=50):
        self.max_len = max_len
        self.sentences, self.labels, self.lengths = [], [], []

        for record in data:
            ids, tags = encode_sentence(record['words'], record['labels'], word2idx, tag2idx)
            self.lengths.append(min(len(ids), max_len))

            shortfall = max_len - len(ids)
            if shortfall > 0:
                # Right-pad; -1 marks label positions that must not be scored.
                ids = ids + [word2idx['<PAD>']] * shortfall
                tags = tags + [-1] * (max_len - len(tags))
            else:
                # Exactly max_len or longer: keep the first max_len tokens.
                ids, tags = ids[:max_len], tags[:max_len]

            self.sentences.append(ids)
            self.labels.append(tags)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return (word_ids, tag_ids, length) for sentence ``idx``."""
        word_ids = torch.tensor(self.sentences[idx], dtype=torch.long)
        tag_ids = torch.tensor(self.labels[idx], dtype=torch.long)
        return word_ids, tag_ids, self.lengths[idx]

# 2. GRU Model
class GRUPOS(nn.Module):
    """Bidirectional, multi-layer GRU tagger: embedding -> BiGRU -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output classes per token.
        embedding_dim: size of each word embedding.
        hidden_dim: GRU hidden size per direction (the linear layer therefore
            receives ``2 * hidden_dim`` features).
        num_layers: number of stacked GRU layers.
        dropout: dropout applied between stacked GRU layers.

    Embedding index 0 is declared as the padding index.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=128, hidden_dim=128, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.GRU only applies dropout between stacked layers and emits a
        # warning when dropout > 0 is combined with a single layer, so the
        # value is only forwarded when there is more than one layer.
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers,
                          batch_first=True, bidirectional=True,
                          dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, x, lengths=None):
        """Return per-token tag logits.

        Args:
            x: LongTensor of token ids laid out batch-first (batch, seq_len).
            lengths: optional per-sentence count of real (non-padding) tokens
                (list or 1-D tensor). When given, the batch is packed so the
                GRU never steps over padding; without it the backward
                direction starts from the padded tail and its states depend
                on how much padding follows the sentence.

        Returns:
            Tensor of shape (batch, seq_len, tagset_size).
        """
        embeds = self.embedding(x)
        if lengths is None:
            out, _ = self.gru(embeds)
        else:
            # pack_padded_sequence needs CPU int64 lengths in [1, seq_len];
            # zero-length rows are clamped to 1 (their output is padding and
            # is ignored by the label mask anyway).
            lengths_cpu = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            lengths_cpu = lengths_cpu.clamp(min=1, max=x.size(1))
            packed = nn.utils.rnn.pack_padded_sequence(
                embeds, lengths_cpu, batch_first=True, enforce_sorted=False)
            packed_out, _ = self.gru(packed)
            # total_length restores the original padded width so the output
            # lines up position-for-position with the label tensor.
            out, _ = nn.utils.rnn.pad_packed_sequence(
                packed_out, batch_first=True, total_length=x.size(1))
        logits = self.fc(out)
        return logits

# 3. Training and Evaluation Loop
def train_epoch(model, dataloader, optimizer, loss_fn, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: module called as ``model(x)`` that returns per-token logits.
        dataloader: iterable of ``(x, y, lengths)`` batches; label value -1
            marks padding positions. ``lengths`` is not used here.
        optimizer: optimiser over ``model``'s parameters.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar.
        device: device the batch tensors are moved to.

    Returns:
        Average of the per-batch losses over the batches that contained at
        least one labelled token; 0.0 if there was no such batch.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    for x, y, lengths in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        y = y.view(-1)
        mask = y != -1
        if not mask.any():
            # Nothing to learn from: a mean-reduced loss over zero tokens is
            # NaN and would turn the whole epoch average into NaN.
            continue
        optimizer.zero_grad()
        logits = model(x)
        # Flatten (batch, seq, tags) -> (batch*seq, tags) to align with y.
        logits = logits.view(-1, logits.shape[-1])
        loss = loss_fn(logits[mask], y[mask])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1
    # Guard the division so an empty loader does not raise ZeroDivisionError.
    return total_loss / n_batches if n_batches else 0.0

def evaluate(model, dataloader, idx2tag, device):
    """Print a per-tag classification report and the accuracy of ``model``.

    Args:
        model: module called as ``model(x)`` that returns per-token logits.
        dataloader: iterable of ``(x, y, lengths)`` batches; only the first
            ``lengths[i]`` positions of sentence ``i`` are scored.
        idx2tag: list mapping tag index -> tag name, used for the report rows.
        device: device the input ids are moved to before the forward pass.

    Nothing is returned; results are printed.
    """
    model.eval()
    gold_ids, pred_ids = [], []
    with torch.no_grad():
        for tokens, gold_batch, lengths in dataloader:
            tokens = tokens.to(device)
            pred_batch = model(tokens).argmax(dim=-1).cpu().numpy()
            for row in range(tokens.shape[0]):
                n_real = lengths[row]
                gold_ids.extend(gold_batch[row][:n_real].numpy())
                pred_ids.extend(pred_batch[row][:n_real])

    # Remove any -1 (PAD) labels from the results.
    gold_arr = np.array(gold_ids)
    pred_arr = np.array(pred_ids)
    keep = gold_arr != -1
    gold_arr, pred_arr = gold_arr[keep], pred_arr[keep]

    report = classification_report(
        gold_arr,
        pred_arr,
        labels=list(range(len(idx2tag))),
        target_names=idx2tag,
        digits=3,
    )
    print(report)
    print('Accuracy:', accuracy_score(gold_arr, pred_arr))


def main():
    """Train the bidirectional GRU tagger and print its test-set metrics.

    Loads the train/test JSON Lines files from the mounted Google Drive,
    builds the vocabulary from the training split, trains for a fixed number
    of epochs with Adam and cross-entropy (label -1 ignored), then prints the
    classification report and accuracy on the test split.
    """
    # Hyperparameters
    batch_size = 32
    embedding_dim = 128
    hidden_dim = 128
    num_layers = 2
    epochs = 5
    seed = 42

    # Seed torch so weight initialisation and batch shuffling start from the
    # same state on every run.
    torch.manual_seed(seed)

    # Data
    train_data = load_jsonlines('/content/drive/MyDrive/nlp2/train.json')
    test_data = load_jsonlines('/content/drive/MyDrive/nlp2/test.json')

    # Pad to the longest sentence of either split instead of a fixed 50:
    # PosDataset cuts sentences off at max_len, which silently removed the
    # tail tokens of long sentences from both training and the reported test
    # metrics (so the scores were not computed on the full test set).
    # Using one shared width keeps train and test padding identical.
    max_len = max(
        (len(item['words']) for split in (train_data, test_data) for item in split),
        default=50,
    )

    word2idx, tag2idx, idx2tag = build_vocab(train_data)
    train_set = PosDataset(train_data, word2idx, tag2idx, max_len=max_len)
    test_set = PosDataset(test_data, word2idx, tag2idx, max_len=max_len)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size)

    # Model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GRUPOS(len(word2idx), len(tag2idx), embedding_dim, hidden_dim, num_layers)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # -1 is the label written at padded positions by PosDataset.
    loss_fn = nn.CrossEntropyLoss(ignore_index=-1)

    # Training loop
    for epoch in range(epochs):
        loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
        print(f'Epoch {epoch+1} Loss: {loss:.4f}')

    # Evaluation
    print("Test set results:")
    evaluate(model, test_loader, idx2tag, device)

if __name__ == "__main__":
    main()


--------- PART 2: GRU-BASED TAGGER ---------


100%|██████████| 408/408 [00:03<00:00, 127.24it/s]


Epoch 1 Loss: 0.9484


100%|██████████| 408/408 [00:03<00:00, 134.91it/s]


Epoch 2 Loss: 0.3666


100%|██████████| 408/408 [00:02<00:00, 173.02it/s]


Epoch 3 Loss: 0.2383


100%|██████████| 408/408 [00:02<00:00, 169.71it/s]


Epoch 4 Loss: 0.1667


100%|██████████| 408/408 [00:02<00:00, 172.04it/s]


Epoch 5 Loss: 0.1192
Test set results:
              precision    recall  f1-score   support

          NN      0.929     0.913     0.921      5240
          IN      0.977     0.976     0.976      3680
          DT      0.976     0.996     0.986      2955
         VBZ      0.964     0.896     0.929       742
          RB      0.889     0.846     0.867       971
         VBN      0.789     0.893     0.837       811
          TO      1.000     1.000     1.000       840
          VB      0.914     0.942     0.928       982
          JJ      0.764     0.907     0.829      2157
         NNS      0.935     0.908     0.921      2336
         NNP      0.900     0.919     0.909      3186
           ,      1.000     1.000     1.000      1780
          CC      0.996     0.993     0.995       820
         POS      0.994     1.000     0.997       345
           .      1.000     1.000     1.000      1424
         VBP      0.905     0.830     0.866       448
         VBG      0.896     0.791     0.84

ner