## Download the Dataset

In [1]:
# download the dataset
! kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

# unzip the dataset
! unzip /content/twitter-entity-sentiment-analysis.zip

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 123MB/s]
Archive:  /content/twitter-entity-sentiment-analysis.zip
  inflating: twitter_training.csv    
  inflating: twitter_validation.csv  


## Import Necessary Libraries

In [2]:
# install necessary libraries
! pip install -q -U peft trl transformers accelerate datasets bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [4]:
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from datasets import Dataset

import warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [17]:
# function to generate a prompt based on a sample containing text and its sentiment
def prompt(sample):
    full_prompt =f"""Give the sentiment of the below sentence.

Sentence: {sample[0]}
Sentiment: {sample[1]}
"""
    return full_prompt

# function to load data from a CSV file and create a dataset
def load_from_dirs(file_path):
    df = pd.read_csv(file_path, names=["ID", "Entity", "Sentiment", "Content"])
    df = df[["Content", "Sentiment"]]
    df['full_prompt'] = df.apply(prompt, axis=1)
    data = Dataset.from_pandas(df)
    return data

In [18]:
# load the training dataset
train_dataset = load_from_dirs("/content/twitter_training.csv")

## Model Finetuning

In [7]:
# base model for tokenization and language modeling
base_model = "microsoft/Phi-3-mini-4k-instruct"

# initialize the tokenizer with specific settings
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, padding_side='right')
tokenizer.pad_token = '[PAD]'
tokenizer.eos_token = '[EOS]'
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

# configure quantization settings for efficient model loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)

# load the model with specified configurations
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# set model-specific configurations
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache=False
model.config.pretraining_tp=1
model.gradient_checkpointing_enable() # enable gradient checkpointing for memory efficiency
model = prepare_model_for_kbit_training(model) # prepare the model for k-bit training

# Configure LoRA (Low-Rank Adaptation) for the model
peft_config = LoraConfig(
                r=16,
                lora_alpha=16,
                target_modules="all-linear",
                lora_dropout=0.00,
                bias="none",
                task_type="CAUSAL_LM"
            )

# Applying LoRA configuration to the model
model = get_peft_model(model, peft_config)

# Setting up training arguments for the model training process
training_arguments = TrainingArguments(
                            output_dir="finetuned-phi-3",
                            per_device_train_batch_size=4,
                            gradient_accumulation_steps=1,
                            optim="paged_adamw_32bit",
                            learning_rate=2e-4,
                            lr_scheduler_type="cosine",
                            save_strategy="epoch",
                            logging_steps=10,
                            # num_train_epochs=5,
                            max_steps=100,
                            fp16=True,
                        )

# Initializing the trainer for supervised fine-tuning (SFT)
trainer = SFTTrainer(
            model=model,
            train_dataset=train_dataset,
            peft_config=peft_config,
            dataset_text_field="full_prompt",
            args=training_arguments,
            tokenizer=tokenizer,
            packing=False,
            max_seq_length=512
        )

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Map:   0%|          | 0/74682 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
# train the model
trainer.train()



Step,Training Loss
10,2.9328
20,2.4259
30,2.203
40,2.2841
50,2.3314
60,2.3511
70,2.4447
80,2.3322
90,2.259
100,2.4284


TrainOutput(global_step=100, training_loss=2.3992642402648925, metrics={'train_runtime': 149.9467, 'train_samples_per_second': 2.668, 'train_steps_per_second': 0.667, 'total_flos': 718757482586112.0, 'train_loss': 2.3992642402648925, 'epoch': 0.005355899523324942})

## Inference using the Finetuned Model

In [9]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Loading the language model with specified configurations
new_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Initializing the tokenizer with specific settings
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
    padding_side='right'
)

ft_model = PeftModel.from_pretrained(new_model, "finetuned-phi-3/checkpoint-100")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def inference(text):
    eval_prompt = f"""Give the sentiment of the below sentence.

Sentence: {text}
Sentiment:
"""

    model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    ft_model.eval()
    with torch.no_grad():
        print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=64)[0], skip_special_tokens=True))

In [14]:
text = "I hate rain."
inference(text)

Give the sentiment of the below sentence.

Sentence: I hate rain.
Sentiment:
 .
Sentiment: Negative
Sentence: I hate rain.
Sentiment: Negative
Sentence: I hate rain.
Sentiment: Negative
Sentence: I hate rain.
Sentiment: Negative
Sentence: I hate rain
