In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the labelled corpus from the working directory; the trailing expression
# makes the cell display the first eight rows.
df = pd.read_csv("final_train.csv")
df.head(n=8)

Unnamed: 0,text,class,PP_Text_2,length,num_tokens
0,\nاز سوي كميسيون ماده 10 \nاحزابكارگزاران سازن...,Politics,کمیسیون ماده ۱۰ احزابکارگزاران سازندگی مجوز فع...,344,56
1,\nخصوصي سازي كارخانه هاي چاي سال آينده \nآغاز ...,Economy,خصوصی کارخانه چای آینده آغاز مدیرعامل سازمان چ...,687,113
2,\nتوضيح يك مطلب \nسوم بهمن ماه امسال گزارش طرح...,Science and Culture,توضیح مطلب بهمن ماه امسال گزارش طرح بررسی شناس...,1120,196
3,\nبهاي گازوئيل و بنزين در \nفرانسه افزايش يافت...,Economy,بهای گازوئیل بنزین فرانسه افزایش پاریس خبرگزار...,396,70
4,\nرئيس سازمان حج و زيارت اعلام كرد \n شهريور 1...,Science and Culture,رئیس سازمان حج زیارت اعلام شهریور ۱۶ زمان ثبت ...,495,95
5,\nهمزمان با روز جهاني موزه و هفته ميراث \nجديد...,Science and Culture,همزمان روز جهانی موزه هفته میراث کشور افتتاح ف...,838,150
6,\nنگراني درلوشامپيونا \nسهيلا قاسمي \nبازار نق...,Sport,نگران درلوشامپیونا سهیلا قاسم بازار نقل انتقال...,3010,510
7,\nرئيس كل بانك مركزي: مدعيان چاپ اسكناس \nبدون...,Economy,رئیس بانک مرکزی مدعیان چاپ اسکناس پشتوانه نگرا...,1182,201


In [3]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis="index", how="any", inplace=True)

In [6]:
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Every document is encoded on its own and padded/truncated to exactly 128
# tokens, so the per-document tensors can be concatenated afterwards.
encode_options = dict(padding='max_length', truncation=True, max_length=128, return_tensors="pt")
tokenized_texts = [tokenizer(document, **encode_options) for document in df['PP_Text_2']]

In [10]:
# Stack the per-document encodings along dim 0 into single tensors; row i
# corresponds to the i-th entry of tokenized_texts.
input_ids = torch.cat([text['input_ids'] for text in tokenized_texts], dim=0)
attention_mask = torch.cat([text['attention_mask'] for text in tokenized_texts], dim=0)

# Encode labels to integers; label_encoder.classes_ keeps the original names
# so predictions can be mapped back later.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['class'])

# Mini-batch size used when the DataLoader objects are created.
batch_size = 32

# Hold out 20% of the documents for evaluation. The split is stratified on the
# encoded labels so train and test keep the same class proportions, and
# random_state pins the shuffle so the split is repeatable.
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [12]:
def _build_loader(inputs, masks, targets, shuffle):
    """Wrap one split in a TensorDataset and return (dataset, DataLoader).

    Targets are converted to a torch.long tensor; batches come out as
    (inputs, masks, targets) tuples of size ``batch_size``.
    """
    dataset = TensorDataset(inputs, masks, torch.tensor(targets, dtype=torch.long))
    return dataset, DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Training batches are reshuffled on every pass; the test loader keeps the
# split order.
train_data, train_dataloader = _build_loader(train_inputs, train_masks, train_labels, shuffle=True)
test_data, test_dataloader = _build_loader(test_inputs, test_masks, test_labels, shuffle=False)

In [18]:
# Seed torch's global RNG (same value as the split's random_state) so the newly
# initialised classification head and the DataLoader shuffling are repeatable.
torch.manual_seed(42)

# One output unit per class found by the label encoder.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_encoder.classes_),
)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are given explicitly to keep the deprecated optimizer's defaults
# rather than silently switching to PyTorch's.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        optimizer.zero_grad()
        # Batch-local names: the notebook-level input_ids / attention_mask /
        # labels created before the split must not be overwritten here.
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean batch loss so progress is visible from the cell output.
    print(f"Epoch {epoch + 1}/{num_epochs} - mean training loss: {epoch_loss / max(len(train_dataloader), 1):.4f}")



Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




  0%|                                                                                         | 0/1500 [00:00<?, ?it/s][A[A[A[A



  0%|                                                                              | 1/1500 [00:51<21:15:28, 51.05s/it][A[A[A[A



  0%|                                                                              | 2/1500 [01:13<14:09:08, 34.01s/it][A[A[A[A



  0%|▏                                                                             | 3/1500 [01:35<11:58:37, 28.80s/it][A[A[A[A



  0%|▏                                                                             | 4/1500 [01:57<10:49:49, 26.06s/it][A[A[A[A



  0%|▎           

KeyboardInterrupt: 

In [None]:
# Evaluation
model.eval()
test_preds = []
test_labels = []

# Gradients are not needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # Batch-local names keep the notebook-level input_ids / attention_mask /
        # labels from being overwritten.
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # Predicted class = index of the largest logit in each row.
        test_preds.extend(outputs.logits.argmax(dim=1).tolist())
        # Reference labels are only collected, so they are read straight from
        # the batch without a device transfer.
        test_labels.extend(batch[2].tolist())

# Calculate and print classification report.
# Class names come from the fitted encoder, so row i of the report is the class
# encoded as integer i. Passing labels= explicitly keeps names and rows aligned
# even if some class never occurs in the test split.
class_names = [str(name) for name in label_encoder.classes_]
report = classification_report(
    test_labels,
    test_preds,
    labels=list(range(len(class_names))),
    target_names=class_names,
)
print(report)