In [1]:
import pandas as pd
import torch

In [2]:
# Read the preprocessed, combined review table from disk.
RAW_CSV_PATH = "data/preprocessed/combined.csv"
df = pd.read_csv(RAW_CSV_PATH)

In [7]:
# Keep only the first occurrence of each distinct review text.
df = df.drop_duplicates(subset="review_text")

In [9]:
# Keep exactly the two columns the model needs, in a fixed order (text first,
# label second), then give them friendlier names. ReviewDataset reads columns
# by position, so selecting explicitly (instead of dropping "app_id"/"app_name"
# and keeping whatever is left) pins the layout it relies on.
df = df[["review_text", "review_score"]].rename(
    columns={"review_text": "Review", "review_score": "Rating"}
)

In [11]:
df.head()

Unnamed: 0,Review,Rating
0,bored.,0
1,due to you the fact means that this is just th...,0
2,"i didnt play up it much, and my friend is shar...",0
3,"this game is absolutely awful, the controls ar...",0
4,really are poor work from the devs who just se...,0


In [13]:
from torch.utils.data import DataLoader, Dataset

In [65]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item for sequence classification.

    Args:
        df: DataFrame holding the review text and its integer label.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that accepts
            the keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Every example is truncated/padded to exactly this many tokens.
        text_column: Position (int) or name (str) of the review-text column.
            Defaults to 0, i.e. the first column.
        label_column: Position (int) or name (str) of the label column.
            Defaults to 1, i.e. the second column.
    """

    def __init__(self, df, tokenizer, max_length, text_column=0, label_column=1):
        self.dataset = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Resolve column names to positions once so __getitem__ can keep using
        # plain positional indexing.
        self._text_pos = self._column_position(df, text_column)
        self._label_pos = self._column_position(df, label_column)

    @staticmethod
    def _column_position(df, column):
        """Return the integer position of ``column`` (a name or already a position)."""
        if isinstance(column, str):
            return df.columns.get_loc(column)
        return column

    def __len__(self):
        """Number of reviews in the underlying DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label of row ``idx``."""
        review_text = self.dataset.iloc[idx, self._text_pos]
        labels = self.dataset.iloc[idx, self._label_pos]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            review_text,
            add_special_tokens=True,   # wrap the text in the model's special tokens
            max_length=self.max_length,
            padding='max_length',      # every item has the same fixed length
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review_text,
            # The tokenizer returns a batch of one; drop that leading dimension.
            'input_ids': encoding['input_ids'].squeeze(0),
            # Padding mask (1 = real token, 0 = padding), not self-attention weights.
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

In [68]:
# Checkpoint id passed to both AutoTokenizer and AutoModelForSequenceClassification below.
model_name = "bert-base-uncased"

In [70]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [72]:
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig

In [74]:
# Every review is padded/truncated to this many tokens.
MAX_SEQ_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name)
review_dataset = ReviewDataset(df=df, tokenizer=tokenizer, max_length=MAX_SEQ_LENGTH)

In [75]:
review_dataset[1]

{'review_text': 'due to you the fact means that this is just the trouble with clones is dlc = for All saints row the third, it just has massive balancing issues, is ♥♥♥♥easy, has elements that there is absolutely no point to (i.e. the vehicle customisation), and if youve played sr3 there is absolutely nothing new to do. tldr: buy the same trouble with clones dlc for 5 saints row the six third.',
 'input_ids': tensor([  101,  2349,  2000,  2017,  1996,  2755,  2965,  2008,  2023,  2003,
          2074,  1996,  4390,  2007, 24418,  2003, 21469,  2278,  1027,  2005,
          2035,  6586,  5216,  1996,  2353,  1010,  2009,  2074,  2038,  5294,
         20120,  3314,  1010,  2003,  1625, 30152, 30152, 30152,  5243,  6508,
          1010,  2038,  3787,  2008,  2045,  2003,  7078,  2053,  2391,  2000,
          1006,  1045,  1012,  1041,  1012,  1996,  4316,  7661,  6648,  1007,
          1010,  1998,  2065,  2017,  3726,  2209,  5034,  2509,  2045,  2003,
          7078,  2498,  2047,  2000

In [78]:
tokenizer.decode(review_dataset[0]['input_ids'])

'[CLS] bored. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [80]:
from torch.utils.data import random_split

In [82]:
# 80/20 train/test split. The split is seeded so that the held-out set (and
# therefore every reported accuracy) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Sizes are derived from the dataset that is actually being split.
train_size = int(0.8 * len(review_dataset))
val_size = len(review_dataset) - train_size
train_dataset, test_dataset = random_split(
    review_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [84]:
len(train_dataset), len(test_dataset)

(18524, 4632)

In [86]:
# Plain PyTorch loaders over the two splits; only the training loader shuffles.
LOADER_BATCH_SIZE = 16
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=LOADER_BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, batch_size=LOADER_BATCH_SIZE)

In [130]:
# 4-bit NF4 quantization settings for loading the base model with bitsandbytes.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

In [132]:
bnb_config

BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "float16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [134]:
# Load the base checkpoint with a 2-label classification head, quantized
# according to bnb_config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    quantization_config=bnb_config,
    device_map={"": 0},  # put the whole model on device index 0
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [135]:
# LoRA adapter settings: rank-8 adapters on the attention query and key projections.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "key"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [136]:
# Wrap the loaded model with the adapters described by lora_config.
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
model = get_peft_model(model, lora_config)

In [140]:
# Names of the parameters that will receive gradients right after PEFT wrapping.
trainable_params1 = []
for param_name, tensor in model.named_parameters():
    if tensor.requires_grad:
        trainable_params1.append(param_name)
print("Trainable parameters:", trainable_params1)

Trainable parameters: ['base_model.model.bert.encoder.layer.0.attention.self.query.lora_A.default.weight', 'base_model.model.bert.encoder.layer.0.attention.self.query.lora_B.default.weight', 'base_model.model.bert.encoder.layer.0.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.0.attention.self.key.lora_B.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.query.lora_A.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.query.lora_B.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.key.lora_B.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.query.lora_A.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.query.lora_B.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.key.lora_

In [142]:
# Make sure the classification head is trained together with the LoRA adapters.
# NOTE(review): with task_type="SEQ_CLS", PEFT is expected to wrap the head and
# keep a frozen `original_module` copy next to the trainable one. A plain
# substring match on "classifier" would also unfreeze that unused copy, so it
# is skipped explicitly. Confirm against the printed parameter names.
for name, param in model.named_parameters():
    if "classifier" in name and "original_module" not in name:
        param.requires_grad = True

In [144]:
# Names of the trainable parameters after the classifier head was unfrozen.
trainable_params2 = []
for param_name, tensor in model.named_parameters():
    if tensor.requires_grad:
        trainable_params2.append(param_name)
print("Trainable parameters:", trainable_params2)

Trainable parameters: ['base_model.model.bert.encoder.layer.0.attention.self.query.lora_A.default.weight', 'base_model.model.bert.encoder.layer.0.attention.self.query.lora_B.default.weight', 'base_model.model.bert.encoder.layer.0.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.0.attention.self.key.lora_B.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.query.lora_A.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.query.lora_B.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.1.attention.self.key.lora_B.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.query.lora_A.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.query.lora_B.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.2.attention.self.key.lora_

In [146]:
import numpy as np

In [148]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict with a single ``"accuracy"`` entry.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

In [150]:
# training_arguments = TrainingArguments(
#     output_dir="/results",
#     num_train_epochs=1,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=1,
#     optim = "paged_adamw_32bit",
#     save_steps=0,
#     logging_steps=25,
#     learning_rate=2e-4,
#     weight_decay=0.001,
#     fp16=False,
#     bf16=False,
#     max_grad_norm=0.3,
#     max_steps=-1,
#     warmup_ratio=0.03,
#     group_by_length=True,
#     lr_scheduler_type="cosine",
#     report_to="tensorboard"
# )

In [152]:
# Hyperparameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory for Trainer artifacts
    eval_strategy="epoch",           # evaluate on eval_dataset after every epoch
    # No intermediate checkpoints (previously expressed as save_steps=0);
    # the adapter is saved explicitly after training.
    save_strategy="no",
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    fp16=False,
    bf16=True,
    group_by_length=True,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    # Only warmup_steps is kept: the previous warmup_ratio=0.03 was overridden
    # by warmup_steps and therefore had no effect.
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    max_grad_norm=0.3,
    weight_decay=0.001,              # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    # The string "none" disables experiment trackers; Python None does not
    # (in transformers v4 it falls back to reporting to all installed integrations).
    report_to="none",
)

In [155]:
# Wire the PEFT model, the hyperparameters, the two dataset splits and the
# accuracy metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,   # 80% split produced by random_split
    eval_dataset=test_dataset,     # remaining 20%, evaluated per eval_strategy
)

In [157]:
# Run fine-tuning with the settings in training_args; the returned TrainOutput is displayed below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4597,0.536461,0.876943
2,0.68,0.480269,0.892703
3,0.187,0.40389,0.910406
4,0.1011,0.449211,0.913644
5,0.319,0.432544,0.9212
6,0.7442,0.421449,0.9212
7,0.314,0.422607,0.923575
8,0.3922,0.421526,0.924439
9,0.496,0.425579,0.923791
10,0.4408,0.425634,0.923359


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=46310, training_loss=0.3719568388247382, metrics={'train_runtime': 11456.1675, 'train_samples_per_second': 16.169, 'train_steps_per_second': 4.042, 'total_flos': 4.89073889255424e+16, 'train_loss': 0.3719568388247382, 'epoch': 10.0})

In [167]:
from tqdm.notebook import tqdm
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [169]:
device

device(type='cuda')

In [171]:
# Manual accuracy check over test_loader, independent of the Trainer's own evaluation.
model.eval()
total_correct = 0
total = 0

for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # No gradients are needed for evaluation.
    with torch.inference_mode():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # The predicted class is the index of the largest logit in each row.
    predictions = logits.argmax(dim=1)

    total_correct += (predictions == labels).sum().item()
    total += labels.size(0)  # number of samples in this batch

accuracy = total_correct / total
print(f'Test Accuracy: {accuracy:.4f}')


  0%|          | 0/290 [00:00<?, ?it/s]

Test Accuracy: 0.9234


In [173]:
# Persist the trained adapter and the tokenizer side by side under artifacts/bert_lora.
model.save_pretrained("artifacts/bert_lora/model")
tokenizer.save_pretrained("artifacts/bert_lora/tokenizer")

('artifacts/bert_lora/tokenizer\\tokenizer_config.json',
 'artifacts/bert_lora/tokenizer\\special_tokens_map.json',
 'artifacts/bert_lora/tokenizer\\vocab.txt',
 'artifacts/bert_lora/tokenizer\\added_tokens.json',
 'artifacts/bert_lora/tokenizer\\tokenizer.json')

In [175]:
from huggingface_hub import login
# Opens the interactive Hugging Face login widget; run before the push_to_hub cell below.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [177]:
# Upload the adapter and the tokenizer to the same Hub repository.
HUB_REPO_ID = "CoolMan67/bert_lora_clf"
model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)

adapter_model.safetensors:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CoolMan67/bert_lora_clf/commit/56537d3f9b19d5e8459e46050889eb3bbe01ef4e', commit_message='Upload tokenizer', commit_description='', oid='56537d3f9b19d5e8459e46050889eb3bbe01ef4e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CoolMan67/bert_lora_clf', endpoint='https://huggingface.co', repo_type='model', repo_id='CoolMan67/bert_lora_clf'), pr_revision=None, pr_num=None)

In [None]:
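# Loading the saved PEFT model
# from transformers import AutoModelForSequenceClassification
# from peft import PeftModel, PeftConfig
#
# # Load the base model with the same 2-label classification head used for training
# # (a bare AutoModel has no classifier, so the saved SEQ_CLS adapter would not match it)
# base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
#
# # Attach the saved LoRA adapter weights to the base model
# peft_model_path = "artifacts/bert_lora/model"
# model = PeftModel.from_pretrained(base_model, peft_model_path)
# model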