This notebook trains a model that classifies a piece of text into 6 classes. To achieve even better results, the model can be trained for more epochs.


In [None]:
!pip install pyvi
!pip install datasets evaluate transformers[sentenpiece]
!pip install transformers[torch]
!pip install accelerate -U

!pip install sentencepiece
!pip install emot
!pip install wandb

In [3]:
#Importing relevant libraries
import pandas as pd
import numpy as np
import codecs
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch.nn as nn
import torch.optim as optim
import torch
from sklearn.model_selection import train_test_split
from pyvi.ViTokenizer import tokenize
from transformers import AutoModel,AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
import re
import unicodedata
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from datasets import load_dataset
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import DataCollatorWithPadding
import json
import wandb
from sklearn.metrics import classification_report
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')
# Do not truncate long cell values when pandas displays a DataFrame.
pd.set_option('display.max_colwidth',None)


In [5]:
# Logging into wandb.
# Do not paste the API token into the notebook (it would be saved and shared with the file).
# Create a wandb project, then either export WANDB_API_KEY before starting Jupyter or
# enter the token at the interactive prompt; wandb.login() picks up the environment
# variable / stored credentials automatically when no key is passed.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmethreh[0m ([33mml_prj[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/dslab/.netrc


True

In [6]:
import torch
import pandas as pd
import numpy as np
import re
import unicodedata
from emoji import UNICODE_EMOJI

# Preparing dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset wrapping tokenizer encodings and optional labels.

    Each item is a dict of tensors, one entry per key in ``encodings``, plus a
    ``'labels'`` tensor when labels were supplied.
    """

    def __init__(self, encodings, labels=None):
        """
        Stores the encodings and optional labels.

        Args:
            encodings (dict): Mapping of feature name to an indexable sequence of
                per-example values. Must contain the key ``'input_ids'``, which
                ``__len__`` uses to determine the dataset size.
            labels (sequence, optional): Per-example labels (list, NumPy array,
                tensor, ...). Defaults to None.
        """
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Returns True when a non-empty label container was supplied."""
        # An explicit None/length check is used instead of truthiness because
        # ``bool()`` of a multi-element NumPy array or tensor raises an error.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """
        Builds the example at position ``idx``.

        Args:
            idx (int): The index of the item.

        Returns:
            dict: One tensor per encoding key, plus ``'labels'`` if labels exist.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Gets the length of the dataset.

        Returns:
            int: The number of entries under ``encodings['input_ids']``.
        """
        return len(self.encodings['input_ids'])

class Preprocessing:
    """Cleans raw text/label pairs before they are fed to the classifier.

    The pipeline lower-cases the text, strips links, mention/hash tags, emojis,
    digits and punctuation, collapses repeated characters, removes stopwords and
    finally applies Unicode NFC normalisation.
    """

    # Class-level defaults so the helpers also work on an instance that was
    # created without running __init__.
    stopwords_path = "vnstopwords.txt"
    _stopwords = None

    def __init__(self, text, label, stopwords_path="vnstopwords.txt"):
        """
        Builds the internal dataframe from parallel text and label sequences.

        Args:
            text (list): The list of text data.
            label (list): The list of label data.
            stopwords_path (str): Path of the stopword file, one stopword per
                line. Defaults to "vnstopwords.txt".
        """
        self.text = np.array(text)
        self.label = np.array(label)
        self.data = pd.DataFrame({'text': self.text, 'label': self.label})
        # Rows with a missing text or label are discarded up front.
        self.data.dropna(inplace=True)
        self.stopwords_path = stopwords_path
        self._stopwords = None

    def _load_stopwords(self):
        """Reads the stopword file once and caches it as a set of '_'-joined words."""
        if self._stopwords is None:
            with open(self.stopwords_path, "r", encoding="utf-8") as f:
                entries = (line.strip().replace(" ", "_") for line in f)
                # Blank lines would otherwise register the empty string as a stopword.
                self._stopwords = {entry for entry in entries if entry}
        return self._stopwords

    def remove_stopwords(self, text):
        """
        Removes one-word and two-word stopwords from the text.

        Multi-word stopwords are stored with '_' between the words, so a pair of
        adjacent tokens is looked up as ``first + "_" + second``.

        Args:
            text (str): The text from which to remove stopwords.

        Returns:
            str: The remaining words joined by single spaces.
        """
        stopword = self._load_stopwords()

        words = text.split()
        # Remove 2-word stopwords
        for i in range(1, len(words)):
            if words[i-1] + "_" + words[i] in stopword:
                words[i-1] = ""
                words[i] = ""
        # Remove 1-word stopwords and drop the blanked slots so that no stray
        # whitespace is left where words were removed.
        kept = [word for word in words if word and word not in stopword]
        return " ".join(kept)

    def remove_links(self, text):
        """
        Removes http(s) URLs, "www." addresses and e-mail addresses.

        Args:
            text (str): The text from which to remove links.

        Returns:
            str: The text without links (surrounding whitespace is left as is).
        """
        pattern = r'http(s)?://[^\s]*|www\.[^\s]*|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
        text = re.sub(pattern, "", text)
        return text

    def remove_emoji(self, text):
        """
        Replaces every key of the module-level ``UNICODE_EMOJI`` with a space.

        NOTE(review): this assumes the keys of ``UNICODE_EMOJI`` are the emoji
        characters themselves; confirm this holds for the package/version the
        notebook actually imports the name from.

        Args:
            text (str): The text from which to remove emojis.

        Returns:
            str: The text without emojis, with runs of spaces collapsed.
        """
        for emot in UNICODE_EMOJI:
            text = str(text).replace(emot, ' ')
        text = re.sub(' +', ' ', text).strip()
        return text

    def special_character(self, text):
        """
        Replaces digits, punctuation and line breaks with spaces.

        Args:
            text (str): The text from which to remove special characters and digits.

        Returns:
            str: The cleaned text with runs of spaces collapsed.
        """
        text = re.sub(r'\d+', " ", text)
        # Raw string: same character set as before, without invalid string escapes.
        text = re.sub(r'[~!@#$%^&*()_+{}“”|:"<>?`´\-=\[\];\\/.,]', " ", text)
        # A line break separates words, so it becomes a space rather than
        # being deleted (deleting it would glue the neighbouring words together).
        text = re.sub(r'\n', " ", text)
        text = re.sub('  +', ' ', text).strip()
        return text

    def repeated_character(self, text):
        """
        Collapses any run of the same word character into a single character.

        Args:
            text (str): The text in which to normalize repeated characters.

        Returns:
            str: The text with repeated characters collapsed.
        """
        text = re.sub(r'(\w)\1+', r'\1', text)
        text = re.sub('  +', ' ', text).strip()
        return text

    def tag(self, text):
        """
        Removes tokens introduced by '@', '#' or '://' (mentions, hashtags).

        Args:
            text (str): The text from which to remove tags.

        Returns:
            str: The text without tags, with runs of spaces collapsed.
        """
        text = re.sub(r"(?:\@|\#|\://)\S+", " ", text)
        text = re.sub('  +', ' ', text).strip()
        return text

    def text_normalize(self, text):
        """
        Applies Unicode NFC normalisation.

        Args:
            text (str): The text to normalize.

        Returns:
            str: The normalized text.
        """
        text = unicodedata.normalize('NFC', text)
        return text

    def clean(self, text):
        """
        Runs the full cleaning pipeline on one text.

        Args:
            text (str): The text to clean.

        Returns:
            str: The cleaned text.
        """
        text = text.lower().strip()
        text = self.remove_links(text)
        # Tags must be removed while '@' and '#' are still present; once
        # special_character() has stripped those symbols the tag pattern can no
        # longer recognise a mention or hashtag.
        text = self.tag(text)
        text = self.remove_emoji(text)
        text = self.special_character(text)
        text = self.repeated_character(text)
        text = self.remove_stopwords(text)
        text = self.text_normalize(text)
        return text

    def return_cleaned_text(self):
        """
        Returns a copy of the data with the 'text' column cleaned.

        Returns:
            pd.DataFrame: The dataframe with cleaned text; the stored data is not modified.
        """
        new = self.data.copy()
        new['text'] = new['text'].apply(self.clean)
        new.dropna(inplace=True)
        return new


In [10]:
import torch
import pandas as pd
import numpy as np
import re
import unicodedata
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate
import wandb

# Preparing for training
class Model:
    """Prepares and fine-tunes a Hugging Face sequence-classification model.

    The class loads train/test CSV files, tokenizes their 'text' column, builds
    a ``Trainer`` for a 6-class classifier and optionally reports to wandb.
    """

    def __init__(self, name, train_file, test_file, report=True):
        """
        Loads the datasets and tokenizer for the given model card.

        Args:
            name (str): The model card name.
            train_file (str): The file path to the training data (CSV).
            test_file (str): The file path to the test data (CSV).
            report (bool): Whether to report to wandb. Defaults to True.
        """
        self.name = name
        # load_dataset("csv", ...) returns a DatasetDict whose only split is 'train',
        # which is why the test data is later accessed as self.test_dataset['train'].
        self.dftrain = load_dataset("csv", data_files=train_file)
        self.dftest = load_dataset("csv", data_files=test_file)
        self.report = report
        self.label2id = {"Reactionary": 0, "Hate and violence": 1, "Discrimination": 2, "Self-harm": 3, "Gambling": 4, "Prostitution": 5}
        self.id2label = {0: "Reactionary", 1: "Hate and violence", 2: "Discrimination", 3: "Self-harm", 4: "Gambling", 5: "Prostitution"}
        # Maximum sequence length used when truncating inputs.
        self.max_length = 256
        self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False, model_max_length=self.max_length)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        self.model = None
        self.metrics = evaluate.load("accuracy")
        self.train_dataset = None
        self.test_dataset = None
        self.training_args = None
        self.output_dir = "Testing_model"
        self.trainer = None
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(device)

    def tokenize(self):
        """Tokenizes the 'text' column of the raw training and test datasets in place (no truncation)."""
        self.dftrain = self.dftrain.map(lambda examples: self.tokenizer(examples['text']), batched=True)
        self.dftest = self.dftest.map(lambda examples: self.tokenizer(examples['text']), batched=True)

    def segment(self, pd):
        """
        Applies the tokenizer to every text and converts labels to int32.

        The dataframe is modified in place and also returned. Note that the
        parameter name shadows the pandas module inside this method.

        Args:
            pd (pd.DataFrame): The dataframe containing 'text' and 'label' columns.

        Returns:
            pd.DataFrame: The same dataframe with 'text' replaced by tokenizer
            outputs and 'label' cast to int32.
        """
        pd['text'] = pd['text'].apply(lambda x: self.tokenizer(x))
        pd['label'] = pd['label'].astype('int32')
        return pd

    def preprocess_function(self, examples):
        """
        Tokenizes a batch of examples with truncation to ``self.max_length``.

        Args:
            examples (dict): The batch containing a 'text' column.

        Returns:
            dict: The tokenized examples.
        """
        # Values are coerced to str so that non-string cells (e.g. numbers or
        # missing values read from the CSV) do not break the tokenizer.
        texts = [str(e) for e in examples['text']]
        return self.tokenizer(texts, truncation=True, max_length=self.max_length)

    def mapping_dataset(self):
        """Maps the preprocessing function over the training and test datasets."""
        self.train_dataset = self.dftrain.map(self.preprocess_function, batched=True)
        self.test_dataset = self.dftest.map(self.preprocess_function, batched=True)

    def compute_metrics(self, eval_pred):
        """
        Computes the evaluation metrics.

        Args:
            eval_pred (tuple): The logits and labels from the evaluation.

        Returns:
            dict: The metrics computed by ``self.metrics`` on the argmax predictions.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        return self.metrics.compute(predictions=predictions, references=labels)

    def train_args(self, output_dir="Testing_model", learning_rate=2e-5, batch_size=32, epochs=10, weight_decay=0.01, evaluation_strategy='epoch', save_strategy='epoch'):
        """
        Sets the training arguments.

        Args:
            output_dir (str): The output directory for the model. Defaults to "Testing_model".
            learning_rate (float): The learning rate for training. Defaults to 2e-5.
            batch_size (int): The per-device train and eval batch size. Defaults to 32.
            epochs (int): The number of training epochs. Defaults to 10.
            weight_decay (float): The weight decay for optimization. Defaults to 0.01.
            evaluation_strategy (str): The evaluation strategy. Defaults to 'epoch'.
            save_strategy (str): The save strategy. Defaults to 'epoch'.
        """
        self.output_dir = output_dir
        self.training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=learning_rate,
            evaluation_strategy=evaluation_strategy,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            # Forward the documented weight_decay argument to the optimizer config.
            weight_decay=weight_decay,
            save_strategy=save_strategy,
            load_best_model_at_end=True,
            # Honour the `report` flag: "none" disables all logging integrations.
            report_to="wandb" if self.report else "none"
        )

    def training(self):
        """
        Prepares the trainer and model for training.

        Returns:
            Trainer: The Hugging Face Trainer object.
        """
        if self.training_args is None:
            # Fall back to the default arguments when train_args() was not called first.
            self.train_args()
        self.mapping_dataset()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.name,
            num_labels=len(self.id2label),
            id2label=self.id2label,
            label2id=self.label2id,
            ignore_mismatched_sizes=True
        )
        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset['train'],
            eval_dataset=self.test_dataset['train'],
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self.compute_metrics,
        )
        return self.trainer

    def train_with_wandb(self, evaluate_train=False, evaluate_test=False, evaluation_strategy="epoch", epochs=5, output_dir="Testing_model", batch_size=16, max_length=256):
        """
        Trains the model, optionally inside a wandb run, and prints accuracies.

        Args:
            evaluate_train (bool): Whether to evaluate on the training set. Defaults to False.
            evaluate_test (bool): Whether to evaluate on the test set. Defaults to False.
            evaluation_strategy (str): The evaluation strategy. Defaults to "epoch".
            epochs (int): The number of epochs for training. Defaults to 5.
            output_dir (str): The output directory for the model. Defaults to "Testing_model".
            batch_size (int): The batch size for training. Defaults to 16.
            max_length (int): The maximum sequence length used for truncation. Defaults to 256.
        """
        self.max_length = max_length
        self.train_args(output_dir=output_dir, learning_rate=2e-5, batch_size=batch_size, epochs=epochs, evaluation_strategy=evaluation_strategy)
        trainer = self.training()

        if self.report:
            wandb.init(project="text-classification", name="Testing " + self.name)
        try:
            trainer.train()

            # The final evaluations run while the wandb run is still open, because
            # the trainer's reporting callback logs their metrics as well.
            if evaluate_train:
                training_results = trainer.evaluate(self.train_dataset['train'])
                training_accuracy = training_results['eval_accuracy']
                print(f"Training Accuracy: {training_accuracy}")

            if evaluate_test:
                testing_results = trainer.evaluate(self.test_dataset['train'])
                testing_accuracy = testing_results['eval_accuracy']
                print(f"Testing Accuracy: {testing_accuracy}")
        finally:
            # Always close the run, even when training or evaluation raises.
            if self.report:
                wandb.finish()


In [11]:
# Locations of the pre-segmented train/test CSV splits, relative to the notebook.
train_link, test_link = (
    'dataset/segmented_train.csv',
    'dataset/segmented_test.csv',
)

In [12]:
# Training the model.
# The class defined above is `Model` (capital M); the lowercase name is undefined on a fresh kernel.
# report=False trains without opening a wandb run.
hello = Model('vinai/phobert-base-v2', train_link, test_link, report=False)
hello.train_with_wandb(evaluate_train=True, evaluate_test=True, epochs=10)



cuda


Map: 100%|██████████| 2541/2541 [00:00<00:00, 8826.14 examples/s]
Map: 100%|██████████| 610/610 [00:00<00:00, 7056.02 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.856173,0.791803
2,No log,0.500632,0.895082
3,No log,0.300216,0.945902
4,No log,0.204434,0.955738
5,No log,0.219303,0.937705
6,No log,0.197832,0.947541
7,0.512100,0.1574,0.955738
8,0.512100,0.147933,0.963934
9,0.512100,0.155896,0.960656
10,0.512100,0.171682,0.94918


Checkpoint destination directory Testing_model/checkpoint-80 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing_model/checkpoint-160 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing_model/checkpoint-240 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing_model/checkpoint-320 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing_model/checkpoint-400 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Training Accuracy: 0.9956709956709957
Testing Accuracy: 0.9639344262295082


In [None]:
# Saving the tokenizer and the fine-tuned model.
# Replace "Your_link" with the target directory.
hello.tokenizer.save_pretrained("Your_link")
# Trainer exposes save_model(), not save_pretrained().
hello.trainer.save_model("Your_link")
