In [1]:
!pip install tqdm



In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

file_path = '/content/titles_dataset.csv'
df = pd.read_csv(file_path)
# Split the dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [3]:
# Unique labels
unique_labels = df['Label'].unique()

# Create a mapping from label string to ID
str_to_id = {label: idx for idx, label in enumerate(unique_labels)}

# Create a mapping from ID to label string
id_to_str = {idx: label for idx, label in enumerate(unique_labels)}

In [4]:
id_to_str

{0: 'فناوری', 1: 'سلامت', 2: 'ورزش', 3: 'خارجی', 4: 'هنر'}

In [30]:
# Define a PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['Title']
        label = self.dataframe.iloc[idx]['Label']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(str_to_id[label])
        }

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/distilbert-fa-zwnj-base')
model = BertForSequenceClassification.from_pretrained('HooshvareLab/distilbert-fa-zwnj-base', num_labels=5, id2label=id_to_str)

# Define training parameters
batch_size = 16
max_length = 64
epochs = 1
learning_rate = 2e-5

# Prepare the data loaders
train_dataset = CustomDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CustomDataset(val_df, tokenizer, max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)

    # Validation loop
    model.eval()
    val_loss = 0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct_preds / total_preds

    print(f'Epoch {epoch+1}/{epochs}:')
    print(f'Training Loss: {avg_train_loss:.4f} | Validation Loss: {avg_val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}')

# Save the fine-tuned model
model.save_pretrained("fine_tuned_bert_model")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.at

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1/1:
Training Loss: 1.1357 | Validation Loss: 0.5495 | Validation Accuracy: 0.8025


In [31]:
from transformers import pipeline

pipe = pipeline("text-classification", model=model.cpu(), tokenizer=tokenizer)

In [32]:
id_to_str

{0: 'فناوری', 1: 'سلامت', 2: 'ورزش', 3: 'خارجی', 4: 'هنر'}

In [33]:
pipe('فیلم و تلویزیون')

[{'label': 'هنر', 'score': 0.9243751168251038}]

In [None]:
pipe(list(val_df['Title']))