In [1]:
from google.colab import drive


drive.mount("/gdrive")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
from pathlib import Path


p = Path("/gdrive/My Drive/thc")
p.is_dir()

True

In [3]:
import torch
import torch.nn as nn


print(
    f"Is cuda available? {torch.cuda.is_available()}."
)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Is cuda available? True.


In [4]:
from pathlib import Path


REPOSITORY_DIR = Path("/gdrive/My Drive/thc/")

In [5]:
import codecs
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Tuple, Union

from torch.utils.data import Dataset


class TweetsDataset(Dataset):
    def __init__(
        self,
        text_file: Union[str, Path],
        tags_file: Union[str, Path],
        text_open_params: Optional[Dict[str, Any]] = None,
        tags_open_params: Optional[Dict[str, Any]] = None,
        transform: Optional[Callable[[str], str]] = None,
    ) -> None:
        if text_open_params is None:
            text_open_params = {"mode": "r", "encoding": "utf-8"}
        if tags_open_params is None:
            tags_open_params = {"mode": "r"}

        with codecs.open(str(text_file), **text_open_params) as file:
            self.texts = file.read().splitlines()
        with codecs.open(str(tags_file), **tags_open_params) as file:
            self.tags = [int(tag) for tag in file]

        if len(self.texts) != len(self.tags):
            raise TypeError("Files length mismatch")

        self.transform = transform

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, index: int) -> Tuple[str, int]:
        sample = self.texts[index]

        if self.transform is not None:
            sample = self.transform(sample)

        return sample, self.tags[index]


In [6]:
!pip install emoji



In [7]:
from typing import Callable, Iterable, Optional, Set, Tuple, Union

import emoji


class Compose(object):
    def __init__(self, transforms: Iterable[Callable[[str], str]]) -> None:
        self.transforms = transforms

    def __call__(self, text: str) -> str:
        for transform in self.transforms:
            text = transform(text)

        return text


class Lowercase(object):
    def __call__(self, text: str) -> str:
        return text.lower()


class WordRemove(object):
    def __init__(self, words_to_remove: Union[str, Iterable[str]]) -> None:
        self._words_to_remove = None

        self.words_to_remove = words_to_remove

    def __call__(self, text: str) -> str:
        words = [
            word for word in text.split() if word not in self.words_to_remove
        ]

        return " ".join(words)

    @property
    def words_to_remove(self) -> Set[str]:
        return self._words_to_remove

    @words_to_remove.setter
    def words_to_remove(self, obj: Union[str, Iterable[str]]) -> None:
        if isinstance(obj, str):
            self._words_to_remove = {obj}
        else:
            self._words_to_remove = set(obj)


class Demojize(object):
    def __init__(
        self,
        use_aliases: bool = False,
        delimiters: Optional[Tuple[str, str]] = None,
    ) -> None:
        self.use_aliases = use_aliases

        if delimiters is None:
            delimiters = (" :", ":")
        self.delimiters = delimiters

    def __call__(self, text: str) -> str:
        return emoji.demojize(
            text, use_aliases=self.use_aliases, delimiters=self.delimiters
        )


In [8]:
!pip install transformers



In [9]:
from typing import Callable


TRANSFORMS: Callable[[str], str] = Compose(
    transforms=[
        Demojize(),
        WordRemove(words_to_remove="@anonymized_account"),
    ]
)

In [10]:
PROCESSED_DATA_DIR = REPOSITORY_DIR.joinpath("data", "processed")

In [11]:
from torch.utils.data import DataLoader


train_tweets = TweetsDataset(
    text_file=PROCESSED_DATA_DIR.joinpath("train_text.txt"),
    tags_file=PROCESSED_DATA_DIR.joinpath("train_tags.txt"),
    transform=TRANSFORMS,
)
valid_tweets = TweetsDataset(
    text_file=PROCESSED_DATA_DIR.joinpath("valid_text.txt"),
    tags_file=PROCESSED_DATA_DIR.joinpath("valid_tags.txt"),
    transform=TRANSFORMS,
)

train_dataloader = DataLoader(
    dataset=train_tweets,  batch_size=16, shuffle=True, num_workers=2
)
valid_dataloader = DataLoader(
    dataset=valid_tweets,  batch_size=16, shuffle=False, num_workers=2
)

In [12]:
from typing import Optional

import torch
import torch.nn as nn
from transformers import DistilBertModel


class DistilBertClassifier(nn.Module):
    def __init__(self, output_size: int) -> None:
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(
            "distilbert-base-multilingual-cased"
        )
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(self.distilbert.config.hidden_size, output_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        distilbert_outputs = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        pooled = distilbert_outputs[0][:, 0, :]
        output = self.fc(self.dropout(pooled))

        return output



In [13]:
model = DistilBertClassifier(output_size=3)

In [14]:
from transformers import DistilBertTokenizerFast
from transformers.tokenization_utils_base import PreTrainedTokenizerBase


TOKENIZER: PreTrainedTokenizerBase = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-multilingual-cased",
    do_lower_case=False,
    strip_accents=False,
)

In [15]:
DISTILBERT_TOKENS_MAX_LENGTH: int = 512

In [16]:
!pip install scikit-learn



In [17]:
from datetime import datetime
from pathlib import Path
from typing import Any, List, Mapping, NamedTuple, Optional, Union, TYPE_CHECKING

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from torch.utils.tensorboard import SummaryWriter
from transformers import DistilBertTokenizer, DistilBertTokenizerFast
from tqdm import tqdm


class EpochResults(NamedTuple):
    outputs: np.ndarray
    targets: np.ndarray
    average_loss: float


class TrainTestDataloaders(NamedTuple):
    train: torch.utils.data.DataLoader
    test: torch.utils.data.DataLoader

def train(
    model: DistilBertClassifier,
    dataloader: torch.utils.data.DataLoader,
    tokenizer: Union[DistilBertTokenizer, DistilBertTokenizerFast],
    device: torch.device,
    optimizer: optim.Optimizer,
    objective: nn.modules.loss._Loss,
) -> EpochResults:
    if objective.reduction != "mean":
        return ValueError(
            "`objective` parameter accepts only losses with `reduction='mean'`"
        )

    model = model.to(device)
    model.train()

    outputs = []
    targets = []
    average_loss = 0.0
    for input_batch, target_batch in tqdm(dataloader):
        encoding_batch = tokenizer.batch_encode_plus(
            list(input_batch),
            max_length=DISTILBERT_TOKENS_MAX_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        input_ids_batch = encoding_batch["input_ids"].to(device)
        attention_mask_batch = encoding_batch["attention_mask"].to(device)

        optimizer.zero_grad()

        output_batch = model(
            input_ids=input_ids_batch, attention_mask=attention_mask_batch
        )
        outputs.append(output_batch.cpu().detach().numpy())

        target_batch = target_batch.view(-1)
        targets.append(target_batch.cpu().detach().numpy())
        target_batch = target_batch.to(device)

        loss = objective(output_batch, target_batch.long())
        average_loss += loss.item() * len(input_batch)
        loss.backward()

        optimizer.step()

    average_loss /= len(dataloader)

    return EpochResults(
        outputs=np.concatenate(outputs, axis=0),
        targets=np.concatenate(targets, axis=0),
        average_loss=average_loss,
    )


def test(
    model: nn.Module,
    dataloader: torch.utils.data.DataLoader,
    tokenizer: Union[DistilBertTokenizer, DistilBertTokenizerFast],
    device: torch.device,
    objective: nn.modules.loss._Loss,
) -> EpochResults:
    if objective.reduction != "mean":
        return ValueError(
            "`objective` parameter accepts only losses with `reduction='mean'`"
        )

    model = model.to(device)
    model.eval()

    outputs = []
    targets = []
    average_loss = 0.0
    with torch.no_grad():
        for input_batch, target_batch in tqdm(dataloader):
            encoding_batch = tokenizer.batch_encode_plus(
                list(input_batch),
                max_length=DISTILBERT_TOKENS_MAX_LENGTH,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                return_attention_mask=True,
            )
            input_ids_batch = encoding_batch["input_ids"].to(device)
            attention_mask_batch = encoding_batch["attention_mask"].to(device)

            output_batch = model(
                input_ids=input_ids_batch, attention_mask=attention_mask_batch
            )
            outputs.append(output_batch.cpu().detach().numpy())

            target_batch = target_batch.view(-1)
            targets.append(target_batch.cpu().detach().numpy())
            target_batch = target_batch.to(device)

            average_loss += objective(
                output_batch, target_batch.long()
            ).item() * len(input_batch)

    average_loss /= len(dataloader)

    return EpochResults(
        outputs=np.concatenate(outputs, axis=0),
        targets=np.concatenate(targets, axis=0),
        average_loss=average_loss,
    )


def run_experiment(
    model: nn.Module,
    dataloaders: TrainTestDataloaders,
    tokenizer: Union[DistilBertTokenizer, DistilBertTokenizerFast],
    device: torch.device,
    optimizer: optim.Optimizer,
    objective: nn.modules.loss._Loss,
    epochs: int = 10,
    scheduler: Optional[optim.lr_scheduler._LRScheduler] = None,
    artifacts_dir: Optional[Union[str, Path]] = None,
    writer: Optional[SummaryWriter] = None,
) -> None:
    for epoch in tqdm(range(epochs)):
        train_epoch_results = train(
            model=model,
            dataloader=dataloaders.train,
            tokenizer=tokenizer,
            device=device,
            optimizer=optimizer,
            objective=objective,
        )
        test_epoch_results = test(
            model=model,
            dataloader=dataloaders.test,
            tokenizer=tokenizer,
            device=device,
            objective=objective,
        )
        if scheduler is not None:
            scheduler.step()

        if artifacts_dir is not None:
            checkpoint = {
                "epoch": epoch,
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            }
            now = datetime.now()
            torch.save(
                checkpoint,
                Path(artifacts_dir).joinpath(
                    "checkpoint.{}.pth".format(
                        now.strftime("%d-%m-%Y.%H_%M_%S")
                    )
                ),
            )

        if writer is not None:
            writer.add_scalar(
                "avg-loss-train", train_epoch_results.average_loss, epoch
            )
            writer.add_scalar(
                "avg-loss-test", test_epoch_results.average_loss, epoch
            )

            predictions_train = np.argmax(train_epoch_results.outputs, axis=1)
            predictions_test = np.argmax(test_epoch_results.outputs, axis=1)

            writer.add_scalar(
                "micro-f1-train",
                f1_score(
                    y_true=train_epoch_results.targets,
                    y_pred=predictions_train,
                    average="micro",
                ),
                epoch,
            )
            writer.add_scalar(
                "micro-f1-test",
                f1_score(
                    y_true=test_epoch_results.targets,
                    y_pred=predictions_test,
                    average="micro",
                ),
                epoch,
            )

            writer.add_scalar(
                "macro-f1-train",
                f1_score(
                    y_true=train_epoch_results.targets,
                    y_pred=predictions_train,
                    average="macro",
                ),
                epoch,
            )
            writer.add_scalar(
                "macro-f1-test",
                f1_score(
                    y_true=test_epoch_results.targets,
                    y_pred=predictions_test,
                    average="macro",
                ),
                epoch,
            )

            if scheduler is not None:
                writer.add_scalar(
                    "learning-rate", scheduler.get_last_lr()[0], epoch
                )
    if writer is not None:
        writer.close()




In [18]:
from typing import Dict

import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from transformers import get_linear_schedule_with_warmup


EPOCHS: int = 20
CLASS_WEIGHTS: Dict[int, float] = {0: 0.3642, 1: 13.2354, 2: 5.5911}


artifacts_dir = REPOSITORY_DIR.joinpath("models", "distilbert-fine-tuning")
artifacts_dir.mkdir(exist_ok=True)
logs_dir = REPOSITORY_DIR.joinpath("logs", "distilbert-fine-tuning")
logs_dir.mkdir(exist_ok=True)
optimizer = optim.AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=len(train_dataloader) * EPOCHS * 0.05,
    num_training_steps=len(train_dataloader) * EPOCHS,
)
# Apply balanced class weights
class_weights = torch.from_numpy(np.array([CLASS_WEIGHTS[0], CLASS_WEIGHTS[1], CLASS_WEIGHTS[2]])).float().to(DEVICE)
print(class_weights)
objective = nn.CrossEntropyLoss(weight=class_weights)
train_test_dataloaders = TrainTestDataloaders(train=train_dataloader, test=valid_dataloader)
writer = SummaryWriter(log_dir=logs_dir)

run_experiment(
    model=model,
    dataloaders=train_test_dataloaders,
    tokenizer=TOKENIZER,
    device=DEVICE,
    optimizer=optimizer,
    objective=objective,
    epochs=EPOCHS,
    scheduler=scheduler,
    artifacts_dir=artifacts_dir,
    writer=writer,
)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/440 [00:00<?, ?it/s][A

tensor([ 0.3642, 13.2354,  5.5911], device='cuda:0')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 13%|█▎        | 57/440 [00:45<05:04,  1.26it/s][A
 13%|█▎        | 58/440 [00:46<05:03,  1.26it/s][A
 13%|█▎        | 59/440 [00:47<05:02,  1.26it/s][A
 14%|█▎        | 60/440 [00:48<05:02,  1.26it/s][A
 14%|█▍        | 61/440 [00:49<05:01,  1.26it/s][A
 14%|█▍        | 62/440 [00:49<04:59,  1.26it/s][A
 14%|█▍        | 63/440 [00:50<04:59,  1.26it/s][A
 15%|█▍        | 64/440 [00:51<04:57,  1.26it/s][A
 15%|█▍        | 65/440 [00:52<04:56,  1.26it/s][A
 15%|█▌        | 66/440 [00:53<04:55,  1.26it/s][A
 15%|█▌        | 67/440 [00:53<04:54,  1.26it/s][A
 15%|█▌        | 68/440 [00:54<04:54,  1.27it/s][A
 16%|█▌        | 69/440 [00:55<04:52,  1.27it/s][A
 16%|█▌        | 70/440 [00:56<04:52,  1.27it/s][A
 16%|█▌        | 71/440 [00:56<04:52,  1.26it/s][A
 16%|█▋        | 72/440 [00:57<04:51,  1.26it/s][A
 17%|█▋        | 73/440 [00:58<04:50,  1.26it/s][A
 17%|█▋        | 74/440 [00:59<04:48,  1.27it/s][A