In [1]:
pip install transformers==4.30



In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments

In [3]:
#Load data
df = pd.read_csv('/content/drive/MyDrive/NLP/cleaned.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,review_body,product_category,sentiment,count
0,If this product was in petite i would get the...,Blouses,2,60
1,Love this dress! it's sooo pretty. i happene...,Dresses,2,62
2,I had such high hopes for this dress and reall...,Dresses,1,98
3,I love love love this jumpsuit. it's fun fl...,Pants,2,22
4,This shirt is very flattering to all due to th...,Blouses,2,36
...,...,...,...,...
22615,I was very happy to snag this dress at such a ...,Dresses,2,28
22616,It reminds me of maternity clothes. soft stre...,Knits,1,38
22617,This fit well but the top was very see throug...,Dresses,1,42
22618,I bought this dress for a wedding i have this ...,Dresses,1,86


In [4]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

##Load model
model_name = "distilbert-base-uncased"  # Replace with a valid model name from Hugging Face's model hub
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

In [5]:
df

Unnamed: 0,review_body,product_category,sentiment,count
0,If this product was in petite i would get the...,Blouses,2,60
1,Love this dress! it's sooo pretty. i happene...,Dresses,2,62
2,I had such high hopes for this dress and reall...,Dresses,1,98
3,I love love love this jumpsuit. it's fun fl...,Pants,2,22
4,This shirt is very flattering to all due to th...,Blouses,2,36
...,...,...,...,...
22615,I was very happy to snag this dress at such a ...,Dresses,2,28
22616,It reminds me of maternity clothes. soft stre...,Knits,1,38
22617,This fit well but the top was very see throug...,Dresses,1,42
22618,I bought this dress for a wedding i have this ...,Dresses,1,86


In [6]:
texts = df['review_body'].to_list()
labels = df['sentiment'].to_list()


## Train test split

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=0, shuffle=True)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=0, shuffle=True)



In [94]:
test_data = pd.DataFrame(test_texts)
test_data.to_csv('./test_dataset.csv')

In [95]:
test_label = pd.DataFrame(test_labels)
test_label.to_csv('./test_datalabels.csv')

In [8]:
len(train_texts), len(val_texts),len(test_texts)

(18322, 2262, 2036)

In [9]:
class ClothDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self,idx):
    item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)




In [10]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [11]:
# Set the maximum sequence length
max_length = 128

train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

train_dataset = ClothDataset(train_encodings, train_labels)
val_dataset = ClothDataset(val_encodings, val_labels)
test_dataset = ClothDataset(test_encodings, test_labels)

In [12]:
pip install accelerate



In [86]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [87]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

In [88]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("GPU is available and PyTorch is using it.")
else:
    print("No GPU available. PyTorch is using CPU.")

model = model.to(device)



GPU is available and PyTorch is using it.


In [89]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [90]:
trainer.train()




  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,1.0139
20,1.0035
30,0.9268
40,0.8723
50,0.7915
60,0.7208
70,0.7386
80,0.6913
90,0.7746
100,0.6649


  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}

TrainOutput(global_step=11460, training_loss=0.1571187130199708, metrics={'train_runtime': 2214.252, 'train_samples_per_second': 82.746, 'train_steps_per_second': 5.176, 'total_flos': 6067777403704320.0, 'train_loss': 0.1571187130199708, 'epoch': 10.0})

In [96]:
import numpy as np
from sklearn.metrics import mean_squared_error
from datasets import load_metric


def compute_metrics(eval_pred):
    # load the metrics to use
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculate the mertic using the predicted and true value
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": accuracy, "f1score": f1}

In [97]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,

)

In [98]:
# Launch the final evaluation
trainer.evaluate()

  item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}


Trainer is attempting to log a value of "{'accuracy': 0.8443025540275049}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.8499167140525662}" of type <class 'dict'> for key "eval/f1score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 1.1677284240722656,
 'eval_accuracy': {'accuracy': 0.8443025540275049},
 'eval_f1score': {'f1': 0.8499167140525662},
 'eval_runtime': 9.5485,
 'eval_samples_per_second': 213.228,
 'eval_steps_per_second': 3.351}

In [99]:
#to login to hugging face
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
Your token has been saved in your

In [102]:
import os
# Load the fine-tuned model and tokenizer
model.save_pretrained("/content/results/models")
tokenizer.save_pretrained("/content/results/models")
os.chdir('/content/results/models')

# Push the model to the Hub
model.push_to_hub(repo_id="ongaunjie/distilbert-sentiment-cloths-v2", commit_message="End of model")

# Push the tokenizer to the Hub
tokenizer.push_to_hub(repo_id="ongaunjie/distilbert-sentiment-cloths-v2", commit_message="End of tokenizer")



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/ongaunjie/distilbert-sentiment-cloths-v2/commit/5b8008d58f3fcc84da5c8d3edddf2a4284ea9047', commit_message='End of tokenizer', commit_description='', oid='5b8008d58f3fcc84da5c8d3edddf2a4284ea9047', pr_url=None, pr_revision=None, pr_num=None)