In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head(10)


Unnamed: 0,text,label
0,и что мы бы назвали сложными модусными перспек...,1
1,"Эзофастома бокаловилная, вооружена более крупн...",0
2,", 2008; Durlak et al",1
3,где c0 - концентрация раствора,1
4,При сгруктурном исследовании минерала обнаруже...,0
5,Эпимеральная область (рис,0
6,Человек рассматривается как «рефлексивное живо...,1
7,", на верхушке чуть закругленные (рис",0
8,"Тычинок 2, со свободными, волосисто опушенными...",0
9,"Длина нерок 2.9-3.2 мм, отношение их длины к д...",0


In [4]:
texts = df['text'].tolist()  # Convert to lists
labels = df['label'].tolist()

In [5]:

model_name = "./ofline_models/rubert-tiny"
# 1. Load Model and Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # num_classes: number of your classes


# 3. Tokenize and Create Input Tensors
encoded_data = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']
labels_tensor = torch.tensor(labels)

# 4. Split Data into Training and Validation Sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, labels_tensor, test_size=0.2, random_state=42  # Adjust test_size as needed
)
train_masks, val_masks, _, _ = train_test_split(
    attention_mask, input_ids, test_size=0.2, random_state=42
)


# 5. Set up Optimizer
learning_rate = 5e-5 
optimizer = AdamW(model.parameters(), lr=learning_rate)  # Adjust learning rate

# 6. Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Move the model to the device
batch_size = 8  # Adjust batch size
num_epochs = 3  # Adjust the number of epochs
gradient_accumulation_steps = 2  # Simula un batch size de 32 si batch_size es 16

for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(train_inputs), batch_size):
        batch_inputs = train_inputs[i:i+batch_size].to(device)
        batch_labels = train_labels[i:i+batch_size].to(device)
        batch_masks = train_masks[i:i+batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss / gradient_accumulation_steps  # Escala la pérdida
        loss.backward()
        optimizer.step()
        if (i + 1) % gradient_accumulation_steps == 0:  # Acumula gradientes
            optimizer.step()
            optimizer.zero_grad()
            
    # 7. Validation
    model.eval()
    val_preds = []
    with torch.no_grad():
        for i in range(0, len(val_inputs), batch_size):
            batch_inputs = val_inputs[i:i+batch_size].to(device)
            batch_labels = val_labels[i:i+batch_size].to(device)
            batch_masks = val_masks[i:i+batch_size].to(device)

            outputs = model(batch_inputs, attention_mask=batch_masks)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            val_preds.extend(preds.cpu().numpy())

    val_accuracy = accuracy_score(val_labels.cpu().numpy(), np.array(val_preds))
    print(f"Epoch {epoch+1}: Validation Accuracy: {val_accuracy}")

# 8. Save the Fine-Tuned Model
model.save_pretrained("fine_tuned_rubert_tiny")
tokenizer.save_pretrained("fine_tuned_rubert_tiny")

2025-02-21 22:28:35.522344: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./ofline_models/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy: 0.9319727891156463
Epoch 2: Validation Accuracy: 0.9387755102040817
Epoch 3: Validation Accuracy: 0.9387755102040817


('fine_tuned_rubert_tiny/tokenizer_config.json',
 'fine_tuned_rubert_tiny/special_tokens_map.json',
 'fine_tuned_rubert_tiny/vocab.txt',
 'fine_tuned_rubert_tiny/added_tokens.json',
 'fine_tuned_rubert_tiny/tokenizer.json')