In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the spell-corrected train/test splits saved by the earlier preprocessing step.
train_path = './Checkpoints/train_spell_correcred.csv'
test_path = './Checkpoints/test_spell_correcred.csv'
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# --- Configuration -----------------------------------------------------------
MODEL_NAME = "DeepPavlov/rubert-base-cased-sentence"
OUTPUT_DIR = "finetuned_model"
SEED = 42
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
num_epochs = 3  # You can adjust the number of training epochs

# Seed before the model is built: the freshly added classification head is
# randomly initialised, and the DataLoader shuffles the training rows.
torch.manual_seed(SEED)

# --- Labels ------------------------------------------------------------------
# Map 'generalized_work_class' labels to contiguous integer ids. The mapping is
# built from train_data itself (the notebook never defines a frame named `df`).
label_to_id = {label: i for i, label in enumerate(train_data['generalized_work_class'].unique())}
train_data['label_id'] = train_data['generalized_work_class'].map(label_to_id)

# Every training row must have received an id, otherwise the label tensor
# below would contain NaN and the loss computation would fail obscurely.
assert train_data['label_id'].notna().all(), "some training rows have no label id"

# --- Tokenizer and model -----------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The classification head must have one output per class; without `num_labels`
# the head defaults to 2 outputs regardless of how many classes the data has.
# id2label / label2id are stored in the config so the saved checkpoint can
# translate predicted ids back to class names.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_to_id),
    id2label={i: str(label) for label, i in label_to_id.items()},
    label2id={str(label): i for label, i in label_to_id.items()},
)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- Data --------------------------------------------------------------------
# Tokenize the training data (padded to the longest sequence, truncated to the
# model's maximum length).
tokenized_texts = tokenizer(train_data['work_name_corrected'].tolist(), truncation=True, padding=True, return_tensors="pt")

# Convert labels to an integer (long) tensor, as expected for class indices.
labels = torch.tensor(train_data['label_id'].tolist(), dtype=torch.long)

# Create a DataLoader
dataset = TensorDataset(tokenized_texts['input_ids'], tokenized_texts['attention_mask'], labels)
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# --- Optimisation ------------------------------------------------------------
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to keep the hyper-parameters the deprecated
# optimizer used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

# NOTE: the model returns its own loss when `labels` is passed, so `loss_fn`
# is not used by the loop below; it is kept in case later cells reference it.
loss_fn = nn.CrossEntropyLoss()

# --- Training loop -----------------------------------------------------------
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        # Batches are created on the CPU; move them next to the model.
        input_ids, attention_mask, label_ids = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_ids)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Loss: {average_loss}")

# --- Save --------------------------------------------------------------------
# Save the fine-tuned model together with its tokenizer so the output
# directory can be loaded on its own.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
