In [9]:
!pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [10]:
import gdown

url = "https://drive.google.com/file/d/1oP4ktwKPC7s4l_drptKiC1cERTAmjjPp/view?usp=sharing"

out = "acllmdb.zip"

# Download the file in google drive using Python
gdown.download(url, out, quiet=False,fuzzy=True)

# Download a folder in google drive using Python
# gdown.download_folder(url, quiet=True, use_cookies=False)

Downloading...
From (original): https://drive.google.com/uc?id=1oP4ktwKPC7s4l_drptKiC1cERTAmjjPp
From (redirected): https://drive.google.com/uc?id=1oP4ktwKPC7s4l_drptKiC1cERTAmjjPp&confirm=t&uuid=b46a242f-26c6-4ef1-ba82-46c6e39175b0
To: /kaggle/working/acllmdb.zip
100%|██████████| 42.8M/42.8M [00:01<00:00, 26.8MB/s]


'acllmdb.zip'

In [11]:
import zipfile
zip_ref = zipfile.ZipFile("/kaggle/working/acllmdb.zip", 'r')
zip_ref.extractall("/kaggle/working/")
zip_ref.close()

In [1]:
# utils
import os
import random
import numpy as np
import shutil

def clear_folder(folder):
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

def remove(path):
    os.remove(path)
    
# make archieve
def make_zip(
        output_filename = "/kaggle/working/images28",
        dir_name = "/kaggle/working/images"
):
    shutil.make_archive(output_filename, 'zip', dir_name)

In [21]:
make_zip(dir_name="/kaggle/working/dataset", output_filename="/kaggle/working/dataset")

In [None]:
import re
import string

def preprocess_text(text):
    new_text = text.lower().replace('<br />', '')
    
    new_text = re.sub("[%s]" % re.escape(string.punctuation), '', new_text)
    
    return new_text

test = "This may not be a memorable classic, but it is a touching romance with an important theme that stresses the importance of literacy in modern society and the devastating career and life consequences for any unfortunate individual lacking this vital skill.<br /><br />The story revolves around Iris, a widow who becomes acquainted with a fellow employee at her factory job, an illiterate cafeteria worker named Stanley. Iris discovers that Stanley is unable to read, and after he loses his job, she gives him reading lessons at home in her kitchen. Of course, as you might predict, the two, although initially wary of involvement, develop feelings for each other...<br /><br />Jane Fonda competently plays Iris, a woman with problems of her own, coping with a job lacking prospects, two teenage children (one pregnant), an unemployed sister and her abusive husband. However, Robert DeNiro is of course brilliant in his endearing portrayal of the intelligent and resourceful, but illiterate, Stanley, bringing a dignity to the role that commands respect. They aren't your typical charming young yuppie couple, as generally depicted in on screen romances, but an ordinary working class, middle aged pair with pretty down to earth struggles.<br /><br />I won't give the ending away, but it's a lovely, heartwarming romance and a personal look into the troubling issue of adult illiteracy, albeit from the perspective of a fictional character."

_test = preprocess_text(test)

print(_test)

In [8]:
target_string = "pos_12.txt"
pattern = r"(?<=\_)\d+(?=\.txt)"

result = re.search(pattern, target_string, re.M)

print(result.group())

12


In [2]:
from gensim.models import FastText
from gensim.test.utils import common_texts

# Example corpus (replace with your own corpus)
corpus = common_texts

# Training FastText model
fastText = FastText(sentences=corpus, vector_size=256, window=5, min_count=1, workers=4, sg=1)


In [2]:
import re
from torch.utils.data import Dataset, DataLoader
import string
from transformers import AlbertTokenizer, AlbertModel
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def preprocess_text(text):
    new_text = text.lower().replace('<br />', '')
    
    new_text = re.sub("[%s]" % re.escape(string.punctuation), '', new_text)
    
    return new_text


def preprocess_bert(text):
    new_text = text.lower().replace('<br />', '')
    
    new_text = re.sub("[%s]" % re.escape(string.punctuation), '', new_text)
    
    new_text = f"[CLS] {new_text}[SEP]"
    
    return new_text


def get_embeddings(text):
    # Example usage: getting embeddings for a word
    word_embedding = fastText.wv[text]
    return word_embedding


class Vectorizer(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        self.model = AlbertModel.from_pretrained('albert-base-v2').to(device)
        
        for param in self.model.parameters():
            param.requires_grad = False

    
    def forward(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        
        inputs = {key: value.to(device) for key, value in inputs.items()}

        outputs = self.model(**inputs)
        
        last_hidden_state = outputs.last_hidden_state  # Размер [batch_size, seq_len, hidden_size]

        # Маска внимания, чтобы исключить `<PAD>` токены
        attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()

        # Вычисление средних эмбеддингов для каждого предложения
        sum_embeddings = torch.sum(last_hidden_state * attention_mask, dim=1)
        sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)  # Чтобы избежать деления на ноль
        mean_pooled = sum_embeddings / sum_mask

        return mean_pooled.view(-1)    


def get_all_txt(dir_path):
    paths = []
    pattern = r"(?<=\_)\d+(?=\.txt)"
    
    for file_path in os.listdir(dir_path):
        # check if current file_path is a file
        file = os.path.join(dir_path, file_path)
        result = re.search(pattern, file_path)
        
        if os.path.isfile(file) and result is not None:

            paths.append(file)
    
    return paths

vectorizer = Vectorizer().to(device)

from sklearn.model_selection import train_test_split

class BaseDataset(Dataset):
    def __init__(self, dir_path, transform_text):
        paths = get_all_txt(dir_path)
        
        self.paths, _ = train_test_split(paths, test_size=0.3, random_state=42)
        
        self.transform = transform_text
        self.pattern = r"(?<=\_)\d+(?=\.txt)"

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        
        result = re.search(self.pattern, path)

        label = int(result.group()) - 1
        
        if label > 3:
            label -= 2

        with open(path, "r") as f:
            text = f.read()
            
        text = self.transform(text)
        
#         sample = get_embeddings(text)
        
# #         sample = torch.mean(sample, dim=1).view(-1)
        
#         y = torch.tensor(label, dtype=torch.long, device = sample.device)
        
        return text, label
    

class TextDataset(Dataset):
    def __init__(self, dir_path, transform_text):
        self.paths = get_all_txt(dir_path)
        self.transform = transform_text
        self.pattern = r"(?<=\_)\d+(?=\.txt)"

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        
        result = re.search(self.pattern, path)

        label = int(result.group()) - 1
        
        if label > 3:
            label -= 2

        with open(path, "r") as f:
            text = f.read()
            
        text = self.transform(text)
        
        sample = get_embeddings(text)
        
# #         sample = torch.mean(sample, dim=1).view(-1)
        
#         y = torch.tensor(label, dtype=torch.long, device = sample.device)
        
        return sample, label
    
    
def create_dataloader(dir_path, transform_text, batch_size, shuffle=False, pin_memory=False):

#     dataset = TextDataset(dir_path, transform_text)
    
    dataset = BaseDataset(dir_path, transform_text)
    
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=pin_memory)
    
    return dataloader



In [9]:
import torch
from torch.utils.data import DataLoader

# Предположим, что у вас уже есть датасет

dir_path = "/kaggle/working/dataset/test"

dataset = BaseDataset(dir_path, preprocess_text)

# Используем DataLoader для итерации по датасету
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Инициализируем переменные для хранения min и max значений
min_val = torch.inf
max_val = -torch.inf

# Проходим по всем батчам датасета
for batch in dataloader:
    _, inputs = batch  # inputs - это ваши векторные данные

    # Находим минимум и максимум в текущем батче
    batch_min = inputs.min()
    batch_max = inputs.max()

    # Обновляем общий минимум и максимум
    min_val = min(min_val, batch_min.item())
    max_val = max(max_val, batch_max.item())

# Выводим найденные значения
print(f"Minimum value in dataset: {min_val}")
print(f"Maximum value in dataset: {max_val}")


Minimum value in dataset: 0
Maximum value in dataset: 7


In [31]:
from transformers import AlbertTokenizer, AlbertModel

# Загрузка модели ALBERT
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')

# Пример текста
text = "I love programming in Python I love programming in Python"
inputs = tokenizer(_test, return_tensors="pt")
outputs = model(**inputs)

# Получение эмбеддингов
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

import torch

res = torch.mean(last_hidden_states, dim=(1))

print(res.shape)

# torch.Size([1, 247, 768])

torch.Size([1, 247, 768])
torch.Size([1, 768])


In [None]:
!pip install gensim

In [11]:
from gensim.models import FastText
from gensim.test.utils import common_texts

# Example corpus (replace with your own corpus)
corpus = common_texts

# Training FastText model
model = FastText(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Example usage: getting embeddings for a word
word_embedding = model.wv['computer']

print(word_embedding)

# # Most similar words to a given word
# similar_words = model.wv.most_similar('computer')

# print("Most similar words to 'computer':", similar_words)

[ 2.9685666e-04  3.3072010e-04 -8.7743282e-04  3.3974674e-04
 -5.0194800e-04 -2.0421152e-03 -1.2409585e-03 -1.9403716e-03
  1.3458645e-03 -2.4129902e-03  9.1810629e-04 -1.0314244e-03
 -7.6350034e-04  7.3201641e-05  1.3831169e-03  5.1904464e-04
 -2.9898007e-04 -1.1948631e-03 -1.1725607e-03 -6.0877239e-04
 -6.7762885e-04  3.9272508e-04  9.8973374e-05  8.1221922e-04
  5.8214430e-04  7.0246187e-04 -7.3612190e-04 -1.0396213e-03
 -6.2489061e-04 -2.4085796e-04 -1.1932147e-03 -2.6620671e-04
  7.3646189e-04 -7.2184735e-04 -1.2750521e-03  1.2425568e-04
  3.7789033e-04 -1.3312516e-03 -2.7341598e-03 -3.0504598e-04
  9.2881807e-04 -7.2819379e-04 -1.1289539e-03 -3.2218394e-04
 -2.0561849e-04 -1.0497047e-04 -6.2273885e-04 -1.6138694e-03
  9.9103095e-04  9.2174501e-05  3.6866101e-04 -5.3755875e-04
  1.1334866e-03  8.7075913e-04 -1.6392255e-03 -8.5598481e-04
 -6.3172285e-04  6.2362582e-04  8.4005587e-04 -1.1280770e-03
  1.2912388e-03 -3.4063371e-04 -1.1784503e-03 -1.6083722e-03
  1.5276625e-03  3.07625

In [25]:
from transformers import BertTokenizer, BertModel

# Загрузка модели TinyBERT
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
model = BertModel.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Пример текста
text = "I love programming in Python"
inputs = tokenizer(_test, return_tensors="pt")
outputs = model(**inputs)

# Получение эмбеддингов
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

torch.Size([1, 248, 312])


In [3]:
def save_model(model, path, name="reviewer"):
    torch.save(model.state_dict(), f"{path}/{name}.model")

def load_model(model, path, name="reviewer"):
    model.load_state_dict(torch.load(f"{path}/{name}.model", weights_only=True))

    model.eval()

In [12]:
import torch
import torch.nn as nn
import torchvision

class MovieReviewer(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.num_classes = 8
        self.vector_size = 256
        self.hidden_features_0 = 512
        self.hidden_features_1 = 128
        
        self.model = nn.Sequential(*[
            nn.Linear(self.vector_size, self.hidden_features_0),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.hidden_features_0, self.hidden_features_1),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.hidden_features_1, self.num_classes)
        ])
        
    def forward(self, x):
        # x.shape = torch.Size([b, 768])
        
        x = self.model(x)
        
        return x

In [13]:
import torchinfo

reviewer = MovieReviewer()

torchinfo.summary(reviewer, input_size=(1, 256), col_names = ("input_size", "output_size", "num_params"))

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
MovieReviewer                            [1, 256]                  [1, 8]                    --
├─Sequential: 1-1                        [1, 256]                  [1, 8]                    --
│    └─Linear: 2-1                       [1, 256]                  [1, 512]                  131,584
│    └─ReLU: 2-2                         [1, 512]                  [1, 512]                  --
│    └─Dropout: 2-3                      [1, 512]                  [1, 512]                  --
│    └─Linear: 2-4                       [1, 512]                  [1, 128]                  65,664
│    └─ReLU: 2-5                         [1, 128]                  [1, 128]                  --
│    └─Dropout: 2-6                      [1, 128]                  [1, 128]                  --
│    └─Linear: 2-7                       [1, 128]                  [1, 8]                    1,032
Total params: 198,280
T

In [None]:
class BertClassifier(nn.Module):
    def __init__(self):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        for param in self.bert.parameters():
            param.requires_grad = False

        # Добавляем полносвязный классификатор
        self.classifier = nn.Linear(self.bert.config.hidden_size, 8)  # Предположим, задача классификации на 2 класса

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # Получаем эмбеддинг для [CLS] токена
        logits = self.classifier(pooled_output)
        return logits

In [16]:
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 128


def main():
    
    model = MovieReviewer()

    model.to(device)

    for module in model.modules():
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, 0.0, 0.02)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
    #     if (isinstance(module, nn.BatchNorm2d)):
    #         nn.init.normal_(module.weight, 1.0, 0.02)
    #         if module.bias is not None:
    #             nn.init.constant_(module.bias, 0)
    #     if (isinstance(module, nn.Embedding)):
    #         nn.init.uniform_(module.weight, -0.1, 0.1)

    train_loader = create_dataloader("/kaggle/working/dataset/train", preprocess_text, batch_size, shuffle=True, pin_memory=False)
    test_loader = create_dataloader("/kaggle/working/dataset/test", preprocess_text, batch_size, shuffle=False, pin_memory=False)

    
    num_epochs = 10
    
    # Определение оптимизатора и функции потерь
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

    def train_one_epoch(epoch):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
        
        for batch_idx, (text, target) in loop:
            text = text.to(device)
            target = target.to(device)
            
            optimizer.zero_grad()

            outputs = model(text)

            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * text.size(0)
            _, predicted = torch.max(outputs, 1)
            
            correct += (predicted == target).sum().item()
            total += text.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct / total

        print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

        save_model(model, "/kaggle/working/")

    
    def evaluate(epoch):
        model.eval()
        correct = 0
        total = 0
        
        with torch.no_grad():
            loop = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
            for bathc_idx, (text, target) in loop:
                text = text.to(device)
                target = target.to(device)

                outputs = model(text)

                _, predicted = torch.max(outputs, 1)
                
                correct += (predicted == target).sum().item()
                total += text.size(0)
        
        accuracy = correct / total

        print(f'Test Accuracy: {accuracy:.4f}')

    
    for epoch in range(num_epochs):
        train_one_epoch(epoch)
        evaluate(epoch)

In [17]:
main()

                                                               

Epoch 1, Loss: 2.0607, Accuracy: 0.2034


                                                 

Test Accuracy: 0.2009


                                                               

Epoch 2, Loss: 2.0270, Accuracy: 0.2040


                                                 

Test Accuracy: 0.2009


                                                             

KeyboardInterrupt: 

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 10

class BertClassifier(nn.Module):
    def __init__(self, num_classes=8):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased') #'prajjwal1/bert-tiny')
        
        for param in self.bert.parameters():
            param.requires_grad = False
        
        self.relu = nn.ReLU()
        
        # Линейный слой для уменьшения размерности
        self.l1 = nn.Linear(self.bert.config.hidden_size, 64)
        
        # GRU слой с размером hidden state 100 и 10 слоями
        self.gru = nn.GRU(64, 100, num_layers=10, batch_first=True)
        
        # Финальный линейный слой для классификации
        self.out = nn.Linear(100, num_classes)
        

    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    
        pooled_output = outputs.last_hidden_state  # [batch_size, sequence_length, hidden_size]
        
        # Применяем линейный слой и ReLU
        x = self.relu(self.l1(pooled_output))  # [batch_size, sequence_length, 64]
        
        # Пропускаем через GRU слой
        gru_output, _ = self.gru(x)  # [batch_size, sequence_length, 100]
        
        # Используем последний hidden state (последний токен)
        gru_last_hidden = gru_output[:, -1, :]  # [batch_size, 100]
        
        # Финальная классификация через линейный слой
        logits = self.out(gru_last_hidden) 
        
        return logits

# Функция для обучения одной эпохи
def train_one_epoch(epoch, model, train_loader, optimizer, scheduler, criterion):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    loop = tqdm(enumerate(train_loader), total=len(train_loader))
    loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
    
    for batch_idx, (text, target) in loop:
        # Токенизация и подготовка данных
        inputs = tokenizer(text, 
                           return_tensors="pt", 
                           max_length=512, 
                           truncation=True, 
                           padding=True,
                          return_attention_mask=True).to(device)
        
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        target = target.to(device)

        optimizer.zero_grad()
        
        # Прямой проход
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Используем CrossEntropy Loss (не требуется one-hot encoding)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * len(text)
        _, predicted = torch.max(outputs, 1)

        correct += (predicted == target).sum().item()
        total += len(text)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    
    scheduler.step(epoch_loss)

    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Пример использования
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #'prajjwal1/bert-tiny')
model = BertClassifier().to(device)

train_loader = create_dataloader("/kaggle/working/dataset/train", preprocess_text, batch_size, shuffle=True, pin_memory=False)
test_loader = create_dataloader("/kaggle/working/dataset/test", preprocess_text, batch_size, shuffle=False, pin_memory=False)


# Оптимизатор и критерий
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=False)
criterion = nn.CrossEntropyLoss()


for i in range(10):
    # Вызов функции для тренировки одной эпохи
    train_one_epoch(i, model, train_loader, optimizer, scheduler, criterion)


Epoch [1/10]: 100%|██████████| 137/137 [07:27<00:00,  3.26s/it]


Epoch 1, Loss: 2.0032, Accuracy: 0.2277


Epoch [2/10]: 100%|██████████| 137/137 [07:28<00:00,  3.28s/it]


Epoch 2, Loss: 1.7977, Accuracy: 0.3274


Epoch [3/10]: 100%|██████████| 137/137 [07:28<00:00,  3.28s/it]


Epoch 3, Loss: 1.7293, Accuracy: 0.3394


Epoch [4/10]: 100%|██████████| 137/137 [07:29<00:00,  3.28s/it]


Epoch 4, Loss: 1.6738, Accuracy: 0.3511


Epoch [5/10]: 100%|██████████| 137/137 [07:29<00:00,  3.28s/it]


Epoch 5, Loss: 1.6194, Accuracy: 0.3611


Epoch [6/10]: 100%|██████████| 137/137 [07:29<00:00,  3.28s/it]


Epoch 6, Loss: 1.6145, Accuracy: 0.3643


Epoch [7/10]: 100%|██████████| 137/137 [07:29<00:00,  3.28s/it]


Epoch 7, Loss: 1.5941, Accuracy: 0.3712


Epoch [8/10]: 100%|██████████| 137/137 [07:29<00:00,  3.28s/it]


Epoch 8, Loss: 1.5786, Accuracy: 0.3742


Epoch [9/10]:  14%|█▍        | 19/137 [01:02<06:31,  3.32s/it]


KeyboardInterrupt: 

In [24]:
save_model(model, path="/kaggle/working/", name="bert_cl")

In [12]:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AlbertForSequenceClassification
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 10
batch_size = 32

model = AlbertForSequenceClassification.from_pretrained(
                                                    "textattack/albert-base-v2-imdb", 
                                                      num_labels=8, ignore_mismatched_sizes=True,
                                                     problem_type="multi_label_classification").to(device)

# Функция для обучения одной эпохи
def train_one_epoch(epoch, model, train_loader, optimizer, scheduler, criterion):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    loop = tqdm(enumerate(train_loader), total=len(train_loader))
    loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
    
    for batch_idx, (text, target) in loop:
        # Токенизация и подготовка данных
        inputs = tokenizer(text, 
                           return_tensors="pt", 
                           max_length=128, 
                           truncation=True, 
                           padding=True,
                           return_attention_mask=True).to(device)
        
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        target = target.to(device)

        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Логиты находятся в outputs.logits
        logits = outputs.logits

        # Используем CrossEntropy Loss
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * len(text)
        _, predicted = torch.max(logits, 1)

        correct += (predicted == target).sum().item()
        total += len(text)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    
    scheduler.step(epoch_loss)

    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')
    
    model.save_pretrained("/kaggle/working/AlbertForClassification")

# Пример использования
tokenizer = AutoTokenizer.from_pretrained('textattack/albert-base-v2-imdb')

train_loader = create_dataloader("/kaggle/working/dataset/train", preprocess_text, batch_size, shuffle=True, pin_memory=False)
test_loader = create_dataloader("/kaggle/working/dataset/test", preprocess_text, batch_size, shuffle=False, pin_memory=False)

# Оптимизатор и критерий
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-05, weight_decay=1e-2)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=False)
criterion = nn.CrossEntropyLoss()

for i in range(10):
    # Вызов функции для тренировки одной эпохи
    train_one_epoch(i, model, train_loader, optimizer, scheduler, criterion)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at textattack/albert-base-v2-imdb and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch [1/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 1, Loss: 1.4651, Accuracy: 0.4118


Epoch [2/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 2, Loss: 1.3114, Accuracy: 0.4608


Epoch [3/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 3, Loss: 1.2168, Accuracy: 0.4968


Epoch [4/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 4, Loss: 1.1090, Accuracy: 0.5385


Epoch [5/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 5, Loss: 0.9767, Accuracy: 0.5936


Epoch [6/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 6, Loss: 0.8378, Accuracy: 0.6601


Epoch [7/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 7, Loss: 0.6819, Accuracy: 0.7277


Epoch [8/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 8, Loss: 0.5578, Accuracy: 0.7808


Epoch [9/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 9, Loss: 0.4276, Accuracy: 0.8393


Epoch [10/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 10, Loss: 0.3269, Accuracy: 0.8812


In [None]:
tokenizer.save_pretrained("/kaggle/working/AlbertForClassification")

In [13]:
for i in range(10, 20):
    # Вызов функции для тренировки одной эпохи
    train_one_epoch(i, model, train_loader, optimizer, scheduler, criterion)

Epoch [11/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 11, Loss: 0.2604, Accuracy: 0.9082


Epoch [12/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 12, Loss: 0.2121, Accuracy: 0.9266


Epoch [13/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.74it/s]


Epoch 13, Loss: 0.1886, Accuracy: 0.9335


Epoch [14/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 14, Loss: 0.1821, Accuracy: 0.9360


Epoch [15/10]: 100%|██████████| 1094/1094 [03:50<00:00,  4.75it/s]


Epoch 15, Loss: 0.1470, Accuracy: 0.9495


Epoch [16/10]:  41%|████▏     | 452/1094 [01:35<02:15,  4.74it/s]


KeyboardInterrupt: 

In [19]:
def evaluate(epoch):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        loop = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
        for bathc_idx, (text, target) in loop:
            target = target.to(device)

            inputs = tokenizer(text, 
                           return_tensors="pt", 
                           max_length=128, 
                           truncation=True, 
                           padding=True,
                           return_attention_mask=True).to(device)
            
            outputs = model(**inputs)
        
            # Логиты находятся в outputs.logits
            logits = outputs.logits

            _, predicted = torch.max(logits, 1)

            correct += (predicted == target).sum().item()
            total += len(text)

    accuracy = correct / total

    print(f'Test Accuracy: {accuracy:.4f}')

In [None]:
evaluate(0)

 28%|██▊       | 304/1094 [00:22<00:58, 13.44it/s]