In [2]:
#import packages
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from langdetect import detect

In [3]:
def Remove1(txt):
    # Apply lowercasing
    txt = str(txt).lower()
    # Remove mentions
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    # Remove hashtags
    txt = re.sub(r'#', '', txt)
    # Remove retweets:
    txt = re.sub(r'RT : ', '', txt)
    # Remove urls
    txt = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', txt)
    # Remove punctuations
    txt = re.sub(r'[^\w\s]', '', txt)
    #re
    txt = txt.replace(",000,000", " m").replace(",000", " k").replace("′", "'").replace("’", "'")\
                           .replace("won't", " will not").replace("cannot", " can not").replace("can't", " can not")\
                           .replace("n't", " not").replace("what's", " what is").replace("it's", " it is")\
                           .replace("'ve", " have").replace("'m", " am").replace("'re", " are")\
                           .replace("he's", " he is").replace("she's", " she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will").replace("how's"," how has").replace("y'all"," you all")\
                           .replace("o'clock"," of the clock").replace("ne'er"," never").replace("let's"," let us")\
                           .replace("finna"," fixing to").replace("gonna"," going to").replace("gimme"," give me").replace("gotta"," got to").replace("'d"," would")\
                           .replace("daresn't"," dare not").replace("dasn't"," dare not").replace("e'er"," ever").replace("everyone's"," everyone is")\
                           .replace("'cause'"," because")
    return txt

In [4]:
# import files and apply text preprocessing
df = pd.read_csv('Product_Reviews.csv')
df['content'] = df['content'].apply(Remove1)
df1 = df[['content', 'act_label','score']].copy()
df1

Unnamed: 0,content,act_label,score
0,i stream on my phone a lot and the videos wont...,negative,1
1,78755rf de deidnxnzsni you are a a,neutral,3
2,good,positive,5
3,ive been a customer since i rented dvds throug...,negative,1
4,i have no downloads on any device and it still...,negative,1
...,...,...,...
27995,why design ui so bad very bad,negative,1
27996,very poor experience bcoz even after being a p...,negative,1
27997,complete easy to watch,positive,5
27998,i like it,positive,5


In [5]:
#function to detect language
def lange(row):
    """try and except block for catching exception errors"""
    x = None
    # the try will run when everything is ok
    try:
        text = str(row)
        x = detect(text)
        return x

    # this will catch all the errors that occur
    except:
        return x


In [6]:
#filtering dataset to only english
%%time
df1['lang'] = df1['content'].apply(lange)
df1['lang'].value_counts()


options = ['en', 'af','so']

# selecting rows based on condition
df2 = df1[df1['lang'].isin(options)]

df2

CPU times: total: 1min 22s
Wall time: 2min 25s


Unnamed: 0,content,act_label,score,lang
0,i stream on my phone a lot and the videos wont...,negative,1,en
2,good,positive,5,so
3,ive been a customer since i rented dvds throug...,negative,1,en
4,i have no downloads on any device and it still...,negative,1,en
5,its good but movies i search ont it are 98 not...,neutral,3,en
...,...,...,...,...
27994,whats the sense of buying a subscription if yo...,neutral,3,en
27995,why design ui so bad very bad,negative,1,en
27996,very poor experience bcoz even after being a p...,negative,1,en
27997,complete easy to watch,positive,5,en


In [8]:
# Getting numerical labels
def mgg(x):
    n = None
    if x > 3.5: #positive = 2
        n = 2
    elif x == 3: # neutral = 1
        n = 1
    else: # negative = 0
        n = 0
    return n

In [9]:
# RoBERTa trained on numerical labels -> convert labels into integers
df2['label'] = df2['score'].apply(mgg)
df2 = df2[['content','label']]
df2 = df2.dropna()
df2['label'] = df2['label'].astype(int)
df2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['label'] = df2['score'].apply(mgg)


Unnamed: 0,content,label
0,i stream on my phone a lot and the videos wont...,0
2,good,1
3,ive been a customer since i rented dvds throug...,0
4,i have no downloads on any device and it still...,0
7,baibhabi,1
...,...,...
27993,good but not so good,1
27995,why design ui so bad very bad,0
27996,very poor experience bcoz even after being a p...,0
27997,complete easy to watch,1


In [10]:
#splitting data into text and labels separately
X = list(df2['content'])
y = list(df2['label'])

In [11]:
#splitting dataset for train, valid and test for labels and text separately
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X,y, train_size = 0.7, random_state =123)
X_valid, X_test, y_valid, y_test = tts(X_test,y_test, train_size = 0.5, random_state =123)

In [15]:
#importing transformers for RoBERTa model and Tokenizer
import transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
                checkpoint)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Tokenizing text with truncation, padding and returning it in PyTorch format
train_encodings = tokenizer(X_train, truncation = True, padding = True, return_tensors = "pt")
valid_encodings = tokenizer(X_valid, truncation = True, padding = True, return_tensors = "pt")
test_encodings = tokenizer(X_test, truncation = True, padding = True, return_tensors = "pt")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [19]:
#Importing pytorch and dependencies
import torch
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction


"\ndef multi_label_metrics(predictions, labels, threshold=0.5):\n    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)\n    sigmoid = torch.nn.Sigmoid()\n    probs = sigmoid(torch.Tensor(predictions))\n    # next, use threshold to turn them into integer predictions\n    y_pred = np.zeros(probs.shape)\n    y_pred[np.where(probs >= threshold)] = 1\n    # finally, compute metrics\n    y_true = labels\n    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')\n    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')\n    accuracy = accuracy_score(y_true, y_pred)\n    # return as dictionary\n    metrics = {'f1': f1_micro_average,\n               'roc_auc': roc_auc,\n               'accuracy': accuracy}\n    return metrics\n\ndef compute_metrics(p: EvalPrediction):\n    preds = p.predictions[0] if isinstance(p.predictions,\n            tuple) else p.predictions\n    result = multi_label_metrics(\n        predictions=preds,\n        l

In [20]:
#function for combining encoded text and respective labels together 
class PTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [21]:
#combining respective sets together
train_dataset = PTDataset(train_encodings, y_train)
val_dataset = PTDataset(valid_encodings, y_valid)
test_dataset = PTDataset(test_encodings, y_test)

In [27]:
#Checking for GPU functionality
print(torch.cuda.device_count())


1


In [28]:
#Checking for name of device
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3050 Laptop GPU


In [29]:
#Opting to use Laptop GPU for processing
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [36]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

#Function to compute metrics
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
# Define Trainer arguments and model
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],)

# Train pre-trained model
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 14744
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5529
  Number of trainable parameters = 124647939
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.3152,0.268276,0.91962,0.896552,0.911585,0.904006
1000,0.2777,0.238256,0.926266,0.925138,0.894817,0.909725
1500,0.2714,0.292202,0.921203,0.91621,0.891768,0.903824
2000,0.2639,0.347031,0.929747,0.94237,0.884909,0.912736
2500,0.1826,0.284449,0.925633,0.900372,0.923018,0.911554


***** Running Evaluation *****
  Num examples = 3160
  Batch size = 8
Saving model checkpoint to output\checkpoint-500
Configuration saved in output\checkpoint-500\config.json
Model weights saved in output\checkpoint-500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 3160
  Batch size = 8
Saving model checkpoint to output\checkpoint-1000
Configuration saved in output\checkpoint-1000\config.json
Model weights saved in output\checkpoint-1000\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 3160
  Batch size = 8
Saving model checkpoint to output\checkpoint-1500
Configuration saved in output\checkpoint-1500\config.json
Model weights saved in output\checkpoint-1500\pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num exampl

TrainOutput(global_step=2500, training_loss=0.2621680206298828, metrics={'train_runtime': 22033.886, 'train_samples_per_second': 2.007, 'train_steps_per_second': 0.251, 'total_flos': 2744190723960000.0, 'train_loss': 0.2621680206298828, 'epoch': 1.36})

In [37]:
#saving model for future use
model.save_pretrained("Robert_RV")

Configuration saved in Robert_RV\config.json
Model weights saved in Robert_RV\pytorch_model.bin


In [38]:

pt_model = AutoModelForSequenceClassification.from_pretrained("Robert_RV")

loading configuration file Robert_RV\config.json
Model config RobertaConfig {
  "_name_or_path": "Robert_RV",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [42]:
#predicting on test data set
trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 3160
  Batch size = 8
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


PredictionOutput(predictions=array([[-0.31398493,  4.509699  , -5.9656744 ],
       [-0.23284057,  4.4621243 , -6.0920906 ],
       [ 3.9470286 , -0.2997004 , -6.8963494 ],
       ...,
       [ 3.7583184 ,  0.5465431 , -7.8876867 ],
       [ 3.9502933 , -0.43528107, -6.696942  ],
       [ 3.9619007 , -0.18131112, -7.131274  ]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 0, 0], dtype=int64), metrics={'test_loss': 0.22356122732162476, 'test_accuracy': 0.930379746835443, 'test_precision': 0.9226430298146656, 'test_recall': 0.9022852639873916, 'test_f1': 0.9123505976095617, 'test_runtime': 207.9191, 'test_samples_per_second': 15.198, 'test_steps_per_second': 1.9})