<a href="https://colab.research.google.com/github/nguyenducminh2206/NLP-Projects/blob/main/Text_Classification_CMC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pyarrow

In [2]:
!pip install datasets

In [None]:
!pip install accelerate -U

In [None]:
!pip install scikit-learn

In [4]:
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

In [5]:
from datasets import Dataset

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
def train(data_path):
  """Fine-tune ``xlm-roberta-base`` as a text classifier and save the result.

  The JSON file at ``data_path`` is loaded into a DataFrame that must provide
  a ``text`` column (the input strings) and a ``label`` column (the class of
  each row). Labels are mapped to consecutive integer ids in order of first
  appearance, the rows are split 80/20 into train/validation sets, and the
  model is trained for 3 epochs with evaluation and checkpointing after each
  epoch.

  Args:
    data_path: Path (or anything ``pandas.read_json`` accepts) of the
      labelled dataset.

  Side effects:
    Writes checkpoints to ``./results``, logs to ``./logs`` and the final
    model plus tokenizer to ``./trained_model_xlmroberta``.
  """
  df = pd.read_json(data_path)

  # label -> id in order of first appearance, and the true inverse id -> label.
  # ``label2idx.items()`` yields (label, idx) pairs, so they must be unpacked
  # in that order; otherwise the "inverse" is just a copy of label2idx and the
  # model config ends up with id2label keyed by label names.
  label2idx = {label: idx for idx, label in enumerate(df['label'].unique())}
  idx2label = {idx: label for label, idx in label2idx.items()}

  # Replace the raw labels by their integer class ids.
  df['label'] = df['label'].map(label2idx)

  # Hold out 20% of the rows for validation (fixed seed for reproducibility).
  train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

  tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

  def tokenize(examples):
    # Only truncate here; padding is deferred to the data collator so that
    # each batch is padded to its own longest sequence instead of always 128.
    return tokenizer(examples['text'], truncation=True, max_length=128)

  train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
  val_dataset = Dataset.from_pandas(val_df).map(tokenize, batched=True)

  # Expose only the model inputs, as PyTorch tensors.
  model_columns = ['input_ids', 'attention_mask', 'label']
  train_dataset.set_format(type="torch", columns=model_columns)
  val_dataset.set_format(type="torch", columns=model_columns)

  # Classification head sized to the number of distinct labels; the mappings
  # are stored in the config so predictions can be decoded after reloading.
  model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label2idx))
  model.config.id2label = idx2label
  model.config.label2id = label2idx

  # Pads every batch dynamically to the longest sequence it contains.
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # NOTE(review): recent transformers releases rename `evaluation_strategy`
  # to `eval_strategy` -- confirm against the installed version.
  training_args = TrainingArguments(
      output_dir='./results',
      evaluation_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      num_train_epochs=3,
      weight_decay=0.01,
      logging_dir='./logs',
      save_strategy="epoch"
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      data_collator=data_collator
  )

  trainer.train()

  # Persist the fine-tuned model, and the tokenizer next to it so the output
  # directory is self-contained for later inference.
  trainer.save_model('./trained_model_xlmroberta')
  tokenizer.save_pretrained('./trained_model_xlmroberta')


In [None]:
# Run the fine-tuning pipeline on the labelled JSON dataset at this path.
train('/content/final_data_category.json')

Epoch,Training Loss,Validation Loss
