In [1]:
def preprocess_dataset(path):
    """ Remove unnecessary characters and encode the sentiment labels.

    The type of preprocessing required changes based on the dataset. For the
    IMDb dataset, the review texts contains HTML break tags (<br/>) leftover
    from the scraping process, and some unnecessary whitespace, which are
    removed. Finally, encode the sentiment labels as 0 for "negative" and 1 for
    "positive". This method assumes the dataset file contains the headers
    "review" and "sentiment".

    Parameters:
        path (str): A path to a dataset file containing the sentiment analysis
            dataset. The structure of the file should be as follows: one column
            called "review" containing the review text, and one column called
            "sentiment" containing the ground truth label. The label options
            should be "negative" and "positive".

    Returns:
        df_dataset (pd.DataFrame): A DataFrame containing the raw data
            loaded from the self.dataset path. In addition to the expected
            "review" and "sentiment" columns, are:

            > review_cleaned - a copy of the "review" column with the HTML
                break tags and unnecessary whitespace removed

            > sentiment_encoded - a copy of the "sentiment" column with the
                "negative" values mapped to 0 and "positive" values mapped
                to 1
    """
    df_dataset = pd.read_csv(path)

    df_dataset['review_cleaned'] = df_dataset['review'].\
        apply(lambda x: x.replace('<br />', ''))

    df_dataset['review_cleaned'] = df_dataset['review_cleaned'].\
        replace('\s+', ' ', regex=True)

    df_dataset['sentiment_encoded'] = df_dataset['sentiment'].\
        apply(lambda x: 0 if x == 'negative' else 1)

    return df_dataset

  replace('\s+', ' ', regex=True)


In [2]:
import numpy as np
import pandas as pd

dataset = preprocess_dataset("C:/Users/Lenovo/Desktop/NLP/Final_project/IMDB Dataset.csv")

print(dataset.head(10))  # ดูตัวอย่าง 5 แถวแรก


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   
5  Probably my all-time favorite movie, a story o...  positive   
6  I sure would like to see a resurrection of a u...  positive   
7  This show was an amazing, fresh & innovative i...  negative   
8  Encouraged by the positive comments about this...  negative   
9  If you like original gut wrenching laughter yo...  positive   

                                      review_cleaned  sentiment_encoded  
0  One of the other reviewers has mentioned that ...                  1  
1  A wonderful little production. The filming tec...                  1  
2  I thought this was a wonderful way to spend ti..

In [None]:
'''
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    get_linear_schedule_with_warmup)


class FineTuningPipeline:

    def __init__(
            self,
            dataset,
            tokenizer,
            model,
            optimizer,
            loss_function = nn.CrossEntropyLoss(),
            val_size = 0.1,
            epochs = 4,
            seed = 42):

        self.df_dataset = dataset
        self.tokenizer = tokenizer
        self.model = model
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.val_size = val_size
        self.epochs = epochs
        self.seed = seed

        # Check if GPU is available for faster training time
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        else:
            self.device = torch.device('cpu')

        # Perform fine-tuning
        self.model.to(self.device)
        self.set_seeds()
        self.token_ids, self.attention_masks = self.tokenize_dataset()
        self.train_dataloader, self.val_dataloader = self.create_dataloaders()
        self.scheduler = self.create_scheduler()
        self.fine_tune()

    def tokenize(self, text):
        """ Tokenize input text and return the token IDs and attention mask.

        Tokenize an input string, setting a maximum length of 512 tokens.
        Sequences with more than 512 tokens will be truncated to this limit,
        and sequences with less than 512 tokens will be supplemented with [PAD]
        tokens to bring them up to this limit. The datatype of the returned
        tensors will be the PyTorch tensor format. These return values are
        tensors of size 1 x max_length where max_length is the maximum number
        of tokens per input sequence (512 for BERT).

        Parameters:
            text (str): The text to be tokenized.

        Returns:
            token_ids (torch.Tensor): A tensor of token IDs for each token in
                the input sequence.

            attention_mask (torch.Tensor): A tensor of 1s and 0s where a 1
                indicates a token can be attended to during the attention
                process, and a 0 indicates a token should be ignored. This is
                used to prevent BERT from attending to [PAD] tokens during its
                training/inference.
        """
        batch_encoder = self.tokenizer.encode_plus(
            text,
            max_length = 128,
            #max_length = 512,
            padding = 'max_length',
            truncation = True,
            return_tensors = 'pt')

        token_ids = batch_encoder['input_ids']
        attention_mask = batch_encoder['attention_mask']

        return token_ids, attention_mask

    def tokenize_dataset(self):
        """ Apply the self.tokenize method to the fine-tuning dataset.

        Tokenize and return the input sequence for each row in the fine-tuning
        dataset given by self.dataset. The return values are tensors of size
        len_dataset x max_length where len_dataset is the number of rows in the
        fine-tuning dataset and max_length is the maximum number of tokens per
        input sequence (512 for BERT).

        Parameters:
            None.

        Returns:
            token_ids (torch.Tensor): A tensor of tensors containing token IDs
            for each token in the input sequence.

            attention_masks (torch.Tensor): A tensor of tensors containing the
                attention masks for each sequence in the fine-tuning dataset.
        """
        token_ids = []
        attention_masks = []

        for review in self.df_dataset['review_cleaned']:
            tokens, masks = self.tokenize(review)
            token_ids.append(tokens)
            attention_masks.append(masks)

        token_ids = torch.cat(token_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        return token_ids, attention_masks

    def create_dataloaders(self):
        """ Create dataloaders for the train and validation set.

        Split the tokenized dataset into train and validation sets according to
        the self.val_size value. For example, if self.val_size is set to 0.1,
        90% of the data will be used to form the train set, and 10% for the
        validation set. Convert the "sentiment_encoded" column (labels for each
        row) to PyTorch tensors to be used in the dataloaders.

        Parameters:
            None.

        Returns:
            train_dataloader (torch.utils.data.dataloader.DataLoader): A
                dataloader of the train data, including the token IDs,
                attention masks, and sentiment labels.

            val_dataloader (torch.utils.data.dataloader.DataLoader): A
                dataloader of the validation data, including the token IDs,
                attention masks, and sentiment labels.

        """
        train_ids, val_ids = train_test_split(
                        self.token_ids,
                        test_size=self.val_size,
                        shuffle=False)

        train_masks, val_masks = train_test_split(
                                    self.attention_masks,
                                    test_size=self.val_size,
                                    shuffle=False)

        labels = torch.tensor(self.df_dataset['sentiment_encoded'].values)
        train_labels, val_labels = train_test_split(
                                        labels,
                                        test_size=self.val_size,
                                        shuffle=False)

        train_data = TensorDataset(train_ids, train_masks, train_labels)
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32) # batch_size = 16 before
        val_data = TensorDataset(val_ids, val_masks, val_labels)
        val_dataloader = DataLoader(val_data, batch_size=32) # batch_size = 16 before

        return train_dataloader, val_dataloader

    def create_scheduler(self):
        """ Create a linear scheduler for the learning rate.

        Create a scheduler with a learning rate that increases linearly from 0
        to a maximum value (called the warmup period), then decreases linearly
        to 0 again. num_warmup_steps is set to 0 here based on an example from
        Hugging Face:

        https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2
        d008813037968a9e58/examples/run_glue.py#L308

        Read more about schedulers here:

        https://huggingface.co/docs/transformers/main_classes/optimizer_
        schedules#transformers.get_linear_schedule_with_warmup
        """
        num_training_steps = self.epochs * len(self.train_dataloader)
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps)

        return scheduler

    def set_seeds(self):
        """ Set the random seeds so that results are reproduceable.

        Parameters:
            None.

        Returns:
            None.
        """
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

    def fine_tune(self):
        """Train the classification head on the BERT model.

        Fine-tune the model by training the classification head (linear layer)
        sitting on top of the BERT model. The model trained on the data in the
        self.train_dataloader, and validated at the end of each epoch on the
        data in the self.val_dataloader. The series of steps are described
        below:

        Training:

        > Create a dictionary to store the average training loss and average
          validation loss for each epoch.
        > Store the time at the start of training, this is used to calculate
          the time taken for the entire training process.
        > Begin a loop to train the model for each epoch in self.epochs.

        For each epoch:

        > Switch the model to train mode. This will cause the model to behave
          differently than when in evaluation mode (e.g. the batchnorm and
          dropout layers are activated in train mode, but disabled in
          evaluation mode).
        > Set the training loss to 0 for the start of the epoch. This is used
          to track the loss of the model on the training data over subsequent
          epochs. The loss should decrease with each epoch if training is
          successful.
        > Store the time at the start of the epoch, this is used to calculate
          the time taken for the epoch to be completed.
        > As per the BERT authors' recommendations, the training data for each
          epoch is split into batches. Loop through the training process for
          each batch.

        For each batch:

        > Move the token IDs, attention masks, and labels to the GPU if
          available for faster processing, otherwise these will be kept on the
          CPU.
        > Invoke the zero_grad method to reset the calculated gradients from
          the previous iteration of this loop.
        > Pass the batch to the model to calculate the logits (predictions
          based on the current classifier weights and biases) as well as the
          loss.
        > Increment the total loss for the epoch. The loss is returned from the
          model as a PyTorch tensor so extract the float value using the item
          method.
        > Perform a backward pass of the model and propagate the loss through
          the classifier head. This will allow the model to determine what
          adjustments to make to the weights and biases to improve its
          performance on the batch.
        > Clip the gradients to be no larger than 1.0 so the model does not
          suffer from the exploding gradients problem.
        > Call the optimizer to take a step in the direction of the error
          surface as determined by the backward pass.

        After training on each batch:

        > Calculate the average loss and time taken for training on the epoch.

        Validation step for the epoch:

        > Switch the model to evaluation mode.
        > Set the validation loss to 0. This is used to track the loss of the
          model on the validation data over subsequent epochs. The loss should
          decrease with each epoch if training was successful.
        > Store the time at the start of the validation, this is used to
          calculate the time taken for the validation for this epoch to be
          completed.
        > Split the validation data into batches.

        For each batch:

        > Move the token IDs, attention masks, and labels to the GPU if
          available for faster processing, otherwise these will be kept on the
          CPU.
        > Invoke the no_grad method to instruct the model not to calculate the
          gradients since we wil not be performing any optimization steps here,
          only inference.
        > Pass the batch to the model to calculate the logits (predictions
          based on the current classifier weights and biases) as well as the
          loss.
        > Extract the logits and labels from the model and move them to the CPU
          (if they are not already there).
        > Increment the loss and calculate the accuracy based on the true
          labels in the validation dataloader.
        > Calculate the average loss and accuracy, and add these to the loss
          dictionary.
        """

        loss_dict = {
            'epoch': [i+1 for i in range(self.epochs)],
            'average training loss': [],
            'average validation loss': []
        }

        t0_train = datetime.now()

        for epoch in range(0, self.epochs):

            # Train step
            self.model.train()
            training_loss = 0
            t0_epoch = datetime.now()

            print(f'{"-"*20} Epoch {epoch+1} {"-"*20}')
            print('\nTraining:\n---------')
            print(f'Start Time:       {t0_epoch}')

            for batch in self.train_dataloader:

                batch_token_ids = batch[0].to(self.device)
                batch_attention_mask = batch[1].to(self.device)
                batch_labels = batch[2].to(self.device)

                self.model.zero_grad()

                loss, logits = self.model(
                    batch_token_ids,
                    token_type_ids = None,
                    attention_mask=batch_attention_mask,
                    labels=batch_labels,
                    return_dict=False)

                training_loss += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()
                self.scheduler.step()

            average_train_loss = training_loss / len(self.train_dataloader)
            time_epoch = datetime.now() - t0_epoch

            print(f'Average Loss:     {average_train_loss}')
            print(f'Time Taken:       {time_epoch}')

            # Validation step
            self.model.eval()
            val_loss = 0
            val_accuracy = 0
            t0_val = datetime.now()

            print('\nValidation:\n---------')
            print(f'Start Time:       {t0_val}')

            for batch in self.val_dataloader:

                batch_token_ids = batch[0].to(self.device)
                batch_attention_mask = batch[1].to(self.device)
                batch_labels = batch[2].to(self.device)

                with torch.no_grad():
                    (loss, logits) = self.model(
                        batch_token_ids,
                        attention_mask = batch_attention_mask,
                        labels = batch_labels,
                        token_type_ids = None,
                        return_dict=False)

                logits = logits.detach().cpu().numpy()
                label_ids = batch_labels.to('cpu').numpy()
                val_loss += loss.item()
                val_accuracy += self.calculate_accuracy(logits, label_ids)


            average_val_accuracy = val_accuracy / len(self.val_dataloader)
            average_val_loss = val_loss / len(self.val_dataloader)
            time_val = datetime.now() - t0_val

            print(f'Average Loss:     {average_val_loss}')
            print(f'Average Accuracy: {average_val_accuracy}')
            print(f'Time Taken:       {time_val}\n')

            loss_dict['average training loss'].append(average_train_loss)
            loss_dict['average validation loss'].append(average_val_loss)

        print(f'Total training time: {datetime.now()-t0_train}')

    def calculate_accuracy(self, preds, labels):
        """ Calculate the accuracy of model predictions against true labels.

        Parameters:
            preds (np.array): The predicted label from the model
            labels (np.array): The true label

        Returns:
            accuracy (float): The accuracy as a percentage of the correct
                predictions.
        """
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        accuracy = np.sum(pred_flat == labels_flat) / len(labels_flat)

        return accuracy

    def predict(self, dataloader):
        """Return the predicted probabilities of each class for input text.
        
        Parameters:
            dataloader (torch.utils.data.DataLoader): A DataLoader containing
                the token IDs and attention masks for the text to perform
                inference on.
        
        Returns:
            probs (PyTorch.Tensor): A tensor containing the probability values
                for each class as predicted by the model.

        """

        self.model.eval()
        all_logits = []

        for batch in dataloader:

            batch_token_ids, batch_attention_mask = tuple(t.to(self.device) \
                for t in batch)[:2]

            with torch.no_grad():
                outputs = self.model(batch_token_ids, attention_mask=batch_attention_mask)
                logits = outputs.logits

                #logits = self.model(batch_token_ids, batch_attention_mask)

            all_logits.append(logits)

        all_logits = torch.cat(all_logits, dim=0)

        probs = F.softmax(all_logits, dim=1).cpu().numpy()
        return probs


'''        


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import transformers
print(transformers.__version__)


4.51.3


# Fine tune deberta-v3

In [8]:
#pip install sentencepiece

# Fine tune + Lora model

In [None]:
#pip install peft

Collecting peftNote: you may need to restart the kernel to use updated packages.

  Downloading peft-0.16.0-py3-none-any.whl.metadata (14 kB)
Downloading peft-0.16.0-py3-none-any.whl (472 kB)
Installing collected packages: peft
Successfully installed peft-0.16.0


In [5]:
# 🚀 FineTuningPipeline (LoRA-ready)
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    get_linear_schedule_with_warmup)


class FineTuningPipeline:
    def __init__(self, dataset, tokenizer, model, optimizer,
                 loss_function=nn.CrossEntropyLoss(), val_size=0.1,
                 epochs=4, seed=42):

        self.df_dataset = dataset
        self.tokenizer = tokenizer
        self.model = model
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.val_size = val_size
        self.epochs = epochs
        self.seed = seed

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.set_seeds()
        self.token_ids, self.attention_masks = self.tokenize_dataset()
        self.train_dataloader, self.val_dataloader = self.create_dataloaders()
        self.scheduler = self.create_scheduler()
        self.fine_tune()

    def tokenize(self, text):
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return encoded['input_ids'], encoded['attention_mask']

    def tokenize_dataset(self):
        token_ids, attention_masks = [], []
        for review in self.df_dataset['review_cleaned']:
            ids, mask = self.tokenize(review)
            token_ids.append(ids)
            attention_masks.append(mask)
        return torch.cat(token_ids, dim=0), torch.cat(attention_masks, dim=0)

    def create_dataloaders(self):
        from sklearn.model_selection import train_test_split
        labels = torch.tensor(self.df_dataset['sentiment_encoded'].values)
        train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(
            self.token_ids, self.attention_masks, labels, test_size=self.val_size, shuffle=False)

        train_data = TensorDataset(train_ids, train_masks, train_labels)
        val_data = TensorDataset(val_ids, val_masks, val_labels)

        return DataLoader(train_data, shuffle=True, batch_size=32), DataLoader(val_data, batch_size=32)

    def create_scheduler(self):
        total_steps = self.epochs * len(self.train_dataloader)
        return get_linear_schedule_with_warmup(self.optimizer, 0, total_steps)

    def set_seeds(self):
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

    def fine_tune(self):
        from datetime import datetime
        print(f"🔍 Model type: {type(self.model)}")
        t0_train = datetime.now()
        
        for epoch in range(self.epochs):
            print(f"\n===== Epoch {epoch+1}/{self.epochs} =====")

            # Training
            self.model.train()
            train_loss = 0
            for batch in self.train_dataloader:
                ids, mask, labels = [x.to(self.device) for x in batch]
                self.model.zero_grad()
                outputs = self.model(input_ids=ids, attention_mask=mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits
                loss.backward()
                train_loss += loss.item()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()
                self.scheduler.step()

            print(f"✅ Avg Train Loss: {train_loss / len(self.train_dataloader):.4f}")

            # Validation
            self.model.eval()
            val_loss, val_accuracy = 0, 0
            t0_val = datetime.now()
            for batch in self.val_dataloader:
                ids, mask, labels = [x.to(self.device) for x in batch]
                with torch.no_grad():
                    outputs = self.model(input_ids=ids, attention_mask=mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits
                val_loss += loss.item()
                val_accuracy += self.calculate_accuracy(logits.cpu().numpy(), labels.cpu().numpy())

            val_time = datetime.now() - t0_val
            print(f"🧪 Avg Val Loss:  {val_loss / len(self.val_dataloader):.4f}")
            print(f"🎯 Val Accuracy: {val_accuracy / len(self.val_dataloader):.4f}")
            print(f"🕒 Val Time:      {val_time}")

        print(f"\n✅ Total training time: {datetime.now() - t0_train}")


    def calculate_accuracy(self, preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        return np.sum(preds_flat == labels.flatten()) / len(labels)
    
    def predict(self, dataloader):
        """Return the predicted probabilities of each class for input text.
        
        Parameters:
            dataloader (torch.utils.data.DataLoader): A DataLoader containing
                the token IDs and attention masks for the text to perform
                inference on.
        
        Returns:
            probs (PyTorch.Tensor): A tensor containing the probability values
                for each class as predicted by the model.

        """

        self.model.eval()
        all_logits = []

        for batch in dataloader:

            batch_token_ids, batch_attention_mask = tuple(t.to(self.device) \
                for t in batch)[:2]

            with torch.no_grad():
                outputs = self.model(batch_token_ids, attention_mask=batch_attention_mask)
                logits = outputs.logits

                #logits = self.model(batch_token_ids, batch_attention_mask)

            all_logits.append(logits)

        all_logits = torch.cat(all_logits, dim=0)

        probs = F.softmax(all_logits, dim=1).cpu().numpy()
        return probs
    

In [None]:
'''
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

# ✅ 1) ใช้โมเดลแม่นกว่า ModernBERT (DeBERTa-v3)
model_name = "microsoft/deberta-v3-base"

# ✅ 2) ใช้ dataset ครึ่งเดียว train เร็วขึ้น
dataset = preprocess_dataset('C:/Users/Lenovo/Desktop/NLP/Final_project/IMDB Dataset.csv')
dataset = dataset.sample(15000, random_state=42)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

'''

In [6]:
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW


# ✅ 1) ใช้โมเดลแม่นกว่า ModernBERT (DeBERTa-v3)
model_name = "microsoft/deberta-v3-base"

# ✅ 2) ใช้ dataset ครึ่งเดียว train เร็วขึ้น
dataset = preprocess_dataset('C:/Users/Lenovo/Desktop/NLP/Final_project/IMDB Dataset.csv')
dataset = dataset.sample(25000, random_state=42)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# โหลด base model


base_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base", num_labels=2
)

# กำหนด LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_proj", "value_proj"],  # สำหรับ DeBERTa
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# สร้างโมเดล LoRA
model = get_peft_model(base_model, peft_config)


optimizer = AdamW(model.parameters(), lr=2e-4)

fine_tuned_model = FineTuningPipeline(
    dataset=dataset,
    tokenizer=tokenizer,
    model=model,
    optimizer=optimizer,
    val_size=0.1,
    epochs=3,
    seed=42
)



Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔍 Model type: <class 'peft.peft_model.PeftModelForSequenceClassification'>

===== Epoch 1/3 =====
✅ Avg Train Loss: 0.3065
🧪 Avg Val Loss:  0.2136
🎯 Val Accuracy: 0.9130
🕒 Val Time:      0:00:23.439491

===== Epoch 2/3 =====
✅ Avg Train Loss: 0.2144
🧪 Avg Val Loss:  0.1979
🎯 Val Accuracy: 0.9229
🕒 Val Time:      0:01:07.591575

===== Epoch 3/3 =====
✅ Avg Train Loss: 0.1976
🧪 Avg Val Loss:  0.2015
🎯 Val Accuracy: 0.9233
🕒 Val Time:      0:00:32.793094

✅ Total training time: 1:08:21.390231


In [None]:
'''
merged_model = model.merge_and_unload()
merged_model.save_pretrained("merged_deberta_Lora_model")
tokenizer.save_pretrained("merged_deberta_Lora_model")


'''

('merged_deberta_Lora_model\\tokenizer_config.json',
 'merged_deberta_Lora_model\\special_tokens_map.json',
 'merged_deberta_Lora_model\\spm.model',
 'merged_deberta_Lora_model\\added_tokens.json')

In [8]:

# ✅ Merge LoRA weights → base model
merged_model = model.merge_and_unload()

# ✅ Save model + tokenizer
save_path = "./fine_tuned_deberta_Lora_imdb"
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



('./fine_tuned_deberta_Lora_imdb\\tokenizer_config.json',
 './fine_tuned_deberta_Lora_imdb\\special_tokens_map.json',
 './fine_tuned_deberta_Lora_imdb\\spm.model',
 './fine_tuned_deberta_Lora_imdb\\added_tokens.json')

In [9]:
# ✅ Merge LoRA weights → base model
merged_model = model.merge_and_unload()

# ✅ Save model + tokenizer
save_path = 'C:/Users/Lenovo/Desktop/NLP/Final_project/fine_tuned_deberta_Lora_imdb'
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('C:/Users/Lenovo/Desktop/NLP/Final_project/fine_tuned_deberta_Lora_imdb\\tokenizer_config.json',
 'C:/Users/Lenovo/Desktop/NLP/Final_project/fine_tuned_deberta_Lora_imdb\\special_tokens_map.json',
 'C:/Users/Lenovo/Desktop/NLP/Final_project/fine_tuned_deberta_Lora_imdb\\spm.model',
 'C:/Users/Lenovo/Desktop/NLP/Final_project/fine_tuned_deberta_Lora_imdb\\added_tokens.json')

In [16]:
# ✅ Merge LoRA weights → base model
merged_model = model.merge_and_unload()

# ✅ Save model + tokenizer
save_path = './fine_tuned_deberta_Lora_imdb'
merged_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./fine_tuned_deberta_Lora_imdb\\tokenizer_config.json',
 './fine_tuned_deberta_Lora_imdb\\special_tokens_map.json',
 './fine_tuned_deberta_Lora_imdb\\spm.model',
 './fine_tuned_deberta_Lora_imdb\\added_tokens.json')

In [None]:
'''
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

# ✅ 1) ใช้โมเดลแม่นกว่า ModernBERT (DeBERTa-v3)
model_name = "microsoft/deberta-v3-base"

# ✅ 2) ใช้ dataset ครึ่งเดียว train เร็วขึ้น
dataset = preprocess_dataset('C:/Users/Lenovo/Desktop/NLP/Final_project/IMDB Dataset.csv')
dataset = dataset.sample(15000, random_state=42)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# ✅ 3) ใช้ learning rate ที่เหมาะกับ Transformer
optimizer = AdamW(model.parameters(), lr=2e-5)

# ✅ 4) Fine-tune แค่ 3 epoch + validation 10%
fine_tuned_model = FineTuningPipeline(
    dataset=dataset,
    tokenizer=tokenizer,
    model=model,
    optimizer=optimizer,
    val_size=0.1,
    epochs=3,
    seed=42
)

'''

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-------------------- Epoch 1 --------------------

Training:
---------
Start Time:       2025-07-24 16:51:03.816008
Average Loss:     0.287409131958092
Time Taken:       1:00:32.755621

Validation:
---------
Start Time:       2025-07-24 17:51:36.602769
Average Loss:     0.20962147541502688
Average Accuracy: 0.9207826747720365
Time Taken:       0:00:08.150164

-------------------- Epoch 2 --------------------

Training:
---------
Start Time:       2025-07-24 17:51:44.754942
Average Loss:     0.16626481108049646
Time Taken:       0:28:47.246521

Validation:
---------
Start Time:       2025-07-24 18:20:32.004835
Average Loss:     0.20655634182881802
Average Accuracy: 0.9220174772036475
Time Taken:       0:00:09.060500

-------------------- Epoch 3 --------------------

Training:
---------
Start Time:       2025-07-24 18:20:41.067574
Average Loss:     0.10255819945529951
Time Taken:       0:36:45.293263

Validation:
---------
Start Time:       2025-07-24 18:57:26.360837
Average Loss:     0

In [11]:
# ทำนายผลบน validation set
predictions = fine_tuned_model.predict(fine_tuned_model.val_dataloader)

# แปลง probability → label 0/1
predicted_labels = np.argmax(predictions, axis=1)
print(predicted_labels[:10])

[1 1 1 1 1 0 1 1 1 1]


In [12]:
# 1) ให้โมเดล predict บน validation set
predictions = fine_tuned_model.predict(fine_tuned_model.val_dataloader)

# 2) แปลงเป็น label 0/1
predicted_labels = np.argmax(predictions, axis=1)

# 3) ดึง label จริงของ validation set
true_labels = fine_tuned_model.df_dataset['sentiment_encoded'][-len(predicted_labels):].values

# 4) คำนวณ accuracy
val_accuracy = np.mean(predicted_labels == true_labels)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Accuracy: 0.9228


# Save model

In [None]:
'''
save_path = "./fine_tuned_deberta_imdb"

# ✅ save model weights + config
fine_tuned_model.model.save_pretrained(save_path)

# ✅ save tokenizer ที่ใช้ตอน fine-tune
fine_tuned_model.tokenizer.save_pretrained(save_path)
'''

('./fine_tuned_deberta_imdb\\tokenizer_config.json',
 './fine_tuned_deberta_imdb\\special_tokens_map.json',
 './fine_tuned_deberta_imdb\\spm.model',
 './fine_tuned_deberta_imdb\\added_tokens.json')

# Text prediction

In [13]:
from torch.utils.data import TensorDataset, DataLoader

text = "Encouraged by the positive comments about this film on here I was looking forward to watching this film. Bad mistake"

# tokenize text
tokens, masks = fine_tuned_model.tokenize(text)

# สร้าง DataLoader สำหรับข้อความเดียว
dataloader = DataLoader(TensorDataset(tokens, masks), batch_size=1)

# ใช้ fine_tuned_model.predict()
probs = fine_tuned_model.predict(dataloader)
print("Review Movie:", text)
print("Probabilities:", probs)
print("Predicted label:", np.argmax(probs))  # 1 = positive, 0 = negative

Review Movie: Encouraged by the positive comments about this film on here I was looking forward to watching this film. Bad mistake
Probabilities: [[0.97148424 0.02851575]]
Predicted label: 0


In [14]:
from torch.utils.data import TensorDataset, DataLoader

text = "Taut and organically gripping, Edward Dmytryk's Crossfire is a distinctive suspense thriller, an unlikely movie using the look and devices of the noir cycle."

# tokenize text
tokens, masks = fine_tuned_model.tokenize(text)

# สร้าง DataLoader สำหรับข้อความเดียว
dataloader = DataLoader(TensorDataset(tokens, masks), batch_size=1)

# ใช้ fine_tuned_model.predict()
probs = fine_tuned_model.predict(dataloader)
print("Review Movie:", text)
print("Probabilities:", probs)
print("Predicted label:", np.argmax(probs))  # 1 = positive, 0 = negative



Review Movie: Taut and organically gripping, Edward Dmytryk's Crossfire is a distinctive suspense thriller, an unlikely movie using the look and devices of the noir cycle.
Probabilities: [[0.00285796 0.997142  ]]
Predicted label: 1


In [15]:
from torch.utils.data import TensorDataset, DataLoader

text = "Protocol is an implausible movie whose only saving grace is that it stars Goldie Hawn along with a good cast of supporting actors. The story revolves around a ditzy cocktail waitress who becomes famous after inadvertently saving the life of an Arab dignitary. The story goes downhill halfway through the movie and Goldie's charm just doesn't save this movie. Unless you are a Goldie Hawn fan don't go out of your way to see this film."


# tokenize text
tokens, masks = fine_tuned_model.tokenize(text)

# สร้าง DataLoader สำหรับข้อความเดียว
dataloader = DataLoader(TensorDataset(tokens, masks), batch_size=1)

# ใช้ fine_tuned_model.predict()
probs = fine_tuned_model.predict(dataloader)
print("Review Movie:", text)
print("Probabilities:", probs)
print("Predicted label:", np.argmax(probs))  # 1 = positive, 0 = negative


Review Movie: Protocol is an implausible movie whose only saving grace is that it stars Goldie Hawn along with a good cast of supporting actors. The story revolves around a ditzy cocktail waitress who becomes famous after inadvertently saving the life of an Arab dignitary. The story goes downhill halfway through the movie and Goldie's charm just doesn't save this movie. Unless you are a Goldie Hawn fan don't go out of your way to see this film.
Probabilities: [[0.9945844  0.00541563]]
Predicted label: 0


In [18]:
from torch.utils.data import TensorDataset, DataLoader

#text = "Protocol is an implausible movie whose only saving grace is that it stars Goldie Hawn along with a good cast of supporting actors. The story revolves around a ditzy cocktail waitress who becomes famous after inadvertently saving the life of an Arab dignitary. The story goes downhill halfway through the movie and Goldie's charm just doesn't save this movie. Unless you are a Goldie Hawn fan don't go out of your way to see this film."

text = "Helen (Kate Capshaw) owns a bookstore in the sleepy, coastal town of Loblolly by the Sea. Divorced, Helen has a young daughter who is going to camp for the summer, giving mother a bit more freedom"

# tokenize text
tokens, masks = fine_tuned_model.tokenize(text)

# สร้าง DataLoader สำหรับข้อความเดียว
dataloader = DataLoader(TensorDataset(tokens, masks), batch_size=1)

# ใช้ fine_tuned_model.predict()
probs = fine_tuned_model.predict(dataloader)
print("Review Movie:", text)
print("Probabilities:", probs)
print("Predicted label:", np.argmax(probs))  # 1 = positive, 0 = negative


Review Movie: Helen (Kate Capshaw) owns a bookstore in the sleepy, coastal town of Loblolly by the Sea. Divorced, Helen has a young daughter who is going to camp for the summer, giving mother a bit more freedom
Probabilities: [[0.24761915 0.75238085]]
Predicted label: 1


# Import model local

In [1]:
model_path = "./fine_tuned_deberta_Lora_imdb"

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)



# Predict Classification

In [18]:
text = "Protocol is an implausible movie whose only saving grace is that it stars Goldie Hawn along with a good cast of supporting actors"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

import torch
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("Review:" , text)
print("Predicted:", "Positive ✅" if probs.argmax() == 1 else "Negative ❌")

Review: Protocol is an implausible movie whose only saving grace is that it stars Goldie Hawn along with a good cast of supporting actors
Predicted: Negative ❌


In [None]:
import pandas as pd

# ✅ โหลด CSV
df = pd.read_csv("reviews.csv")

# ✅ tokenize ทั้งหมด
inputs = tokenizer(df["review"].tolist(), padding=True, truncation=True, return_tensors="pt")

# ✅ predict
outputs = model(**inputs)
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
pred_labels = probs.argmax(axis=1)

# ✅ เพิ่มคอลัมน์ผลลัพธ์
df["predicted_sentiment"] = ["positive" if p == 1 else "negative" for p in pred_labels]

# ✅ ดูตัวอย่าง
print(df.head())

# ✅ ถ้าอยากบันทึกผล
df.to_csv("reviews_with_predictions.csv", index=False)