In [1]:
import numpy as np
import pandas as pd
from datasets import load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Tokenizer for 'bert-base-uncased'; it is reused for encoding the reviews and
# for padding batches inside the Trainer.
# No bare BertModel is loaded here: `model` is bound to
# BertForSequenceClassification before its first use, so loading the plain
# encoder as well only cost load time and memory.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Read the JSON-Lines review dump and discard the metadata columns that are not
# needed downstream; the preview shows the rating, reviewer id and review text
# that remain.
df = (
    pd.read_json('AMAZON_FASHION_5.json', lines=True)
    .drop(columns=[
        'vote', 'image', 'summary', 'style', 'reviewerName',
        'unixReviewTime', 'reviewTime', 'verified', 'asin',
    ])
)
df.head()

Unnamed: 0,overall,reviewerID,reviewText
0,5,ALJ66O1Y6SLHA,Great product and price!
1,5,ALJ66O1Y6SLHA,Great product and price!
2,5,ALJ66O1Y6SLHA,Great product and price!
3,5,ALJ66O1Y6SLHA,Great product and price!
4,5,ALJ66O1Y6SLHA,Great product and price!


In [4]:
# Remove rows with any missing value first, then collapse exact duplicate rows.
df = df.dropna().drop_duplicates()

In [5]:
# Binary target: ratings of 4 and above map to 1, everything else to 0.
# A vectorised comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the integer dtype shown by the label check below.
df['label'] = df['overall'].ge(4).astype('int64')

In [6]:
# Sanity check: list the distinct label values present after binarisation.
df['label'].unique()

array([1, 0], dtype=int64)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# split repeatable across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['reviewText'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

In [9]:
# Tokenize with truncation only. Padding is deliberately left to the
# DataCollatorWithPadding handed to the Trainer, which pads each batch to the
# longest sequence in that batch. Padding here as well (padding=True) stretched
# every review to the longest one in the whole split, so every batch carried
# full-length sequences for no benefit.
train_encodings = tokenizer(train_texts, truncation=True)
test_encodings = tokenizer(test_texts, truncation=True)

In [10]:
import torch

In [11]:
class AmazonDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to a sequence holding one entry per
            example (anything exposing ``.items()`` whose values can be indexed).
        labels: Sequence with one label per example; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor under its own key, and
        the label is added last under the key ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [12]:
# Wrap the tokenized splits and their labels so they can be passed to the
# Trainer as train/eval datasets.
train_dataset = AmazonDataset(train_encodings, train_labels)
test_dataset = AmazonDataset(test_encodings, test_labels)

In [13]:
# Load BERT with a sequence-classification head. The classifier weights are not
# in the checkpoint and are newly initialised (see the warning printed below),
# so the model has to be trained before its predictions are usable.
# NOTE(review): num_labels is left at the library default — presumably 2,
# which would match the binary label; confirm.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Fine-tuning configuration: 3 epochs, evaluation after every epoch,
# checkpoints under ./results and logs under ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    # A single forward/backward pass over 100 BERT-base sequences needs far
    # more memory than most machines have, and the recorded run never got past
    # step 0 (NOTE(review): presumably an out-of-memory crash — confirm).
    # Micro-batches of 10 accumulated over 10 passes keep the effective batch
    # size at 100 while cutting peak memory per pass.
    per_device_train_batch_size=10,
    gradient_accumulation_steps=10,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [15]:
def compute_metrics(eval_pred):
    """Convert the Trainer's evaluation output into classification metrics.

    Args:
        eval_pred: The (logits, label_ids) pair the Trainer passes to this
            callback after an evaluation loop.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'; the last three
        are computed for the label-1 class.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit for each example.
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 yields 0.0 instead of a warning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# Without compute_metrics the Trainer reports only the evaluation loss; wiring
# the callback in makes every evaluation also report accuracy/precision/recall/F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)


In [16]:
# Run fine-tuning with the configured arguments; evaluation_strategy='epoch'
# means the eval dataset is scored at the end of each epoch.
trainer.train()

  0%|          | 0/12 [00:00<?, ?it/s]

: 

In [None]:
# Score the model on the eval dataset and show the resulting metrics dict as
# the cell output, so the result is visible instead of only being stored.
eval_results = trainer.evaluate()
eval_results

In [None]:
# Load metric objects by name through datasets.load_metric. None of them is
# used in the cells visible here.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — verify against the installed version.
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
# NOTE(review): bertscore and bleu compare generated text against references;
# presumably they do not apply to this binary classifier — confirm before use.
bert_score = load_metric("bertscore")
bleu_score = load_metric("bleu")