The Amazon product review data is from https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/. The json is parsed with to a csv file. The column heads is the reviewerID, overall, reviewText_LUKE, and reviewText. overall holds the Amazon 1-5 rating. reviewText is the review text that goes with the rating. The reviewText_LUKE is the reviewText with name entities masked with <ENT>.

In [1]:
import json
import csv
import spacy
import torch
from tqdm import trange
from transformers import LukeTokenizer, LukeForEntitySpanClassification

# File name of Amazon product review
file_name = "Office_Products.json"

# File name to save csv file
save_file = "Office_Products Test.csv"

# Load the model checkpoint
model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
model.eval()

# Load the tokenizer
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

# Masks name entity of the text based on LUKE identified name entitiy
def sentence_LUKE_replace(text, token_str):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    entity_spans = []
    original_word_spans = []
    for token_start in doc:
        for token_end in doc[token_start.i:]:
            entity_spans.append((token_start.idx, token_end.idx + len(token_end)))
            original_word_spans.append((token_start.i, token_end.i + 1))

    inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    max_logits, max_indices = logits[0].max(dim=1)

    predictions = []
    for logit, index, span in zip(max_logits, max_indices, original_word_spans):
        if index != 0:  # the span is not NIL
            predictions.append((logit, span, model.config.id2label[int(index)]))

    # construct an IOB2 label sequence
    predicted_sequence = ["O"] * len(doc)
    for _, span, label in sorted(predictions, key=lambda o: o[0], reverse=True):
        if all([o == "O" for o in predicted_sequence[span[0] : span[1]]]):
            predicted_sequence[span[0]] = "B-" + label
            if span[1] - span[0] > 1:
                predicted_sequence[span[0] + 1 : span[1]] = ["I-" + label] * (span[1] - span[0] - 1)

    lst_nnp_words = []

    row_continue = False
    for token, label in zip(doc, predicted_sequence):
        if label != "O":
            if row_continue:
                lst_nnp_words[len(lst_nnp_words) - 1] = str(lst_nnp_words[len(lst_nnp_words) - 1]) + " " + str(token)
            else:
                lst_nnp_words.append(str(token))
            row_continue = True
        else:
            row_continue = False

    for word in lst_nnp_words:
        text = text.replace(word, token_str)
    return text

f = open(file_name)
lines = f.readlines()

with open(save_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["reviewerID", "overall", "reviewText_LUKE", "reviewText"])

# Go through each review text, mask the review text, and create a row in csv file
for line in lines:
    data = json.loads(line)
    if "reviewText" in data and "overall" in data and "reviewerID" in data and len(data["reviewText"]) < 150:
        with open(save_file, "a", newline='') as file:
            writer = csv.writer(file)
            writer.writerow([str(data["reviewerID"]), str(data["overall"]), sentence_LUKE_replace(data["reviewText"], "<ENT>"), data["reviewText"]])

f.close()

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

Set the torch seed to get the same results

In [None]:
torch.manual_seed(1324224321)

In [5]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_name = "Toys_and_Games.csv"
model_name = "albert-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_LUKE = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
model_luke = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)

# Read amazon product review from csv generated.
def read_amazon_product_review(file_name):
    df = pd.read_csv(file_name, sep=",", header=0)
    dic_review = {}
    dic_review_LUKE = {}
    df1 = df[["reviewerID", "overall", "reviewText_LUKE", "reviewText"]]
    for index in range(len(df1)):
        rating = df.loc[index, "overall"]
        label = int(rating) - 1
        
        review_id = df.loc[index, "reviewerID"]
        if label in dic_review:
            dic_review[label][review_id] = df.loc[index, "reviewText"]
        else:
            dic_review[label] = {review_id: df.loc[index, "reviewText"]}
        if label in dic_review_LUKE:
            dic_review_LUKE[label][review_id] = df.loc[index, "reviewText_LUKE"]
        else:
            dic_review_LUKE[label] = {review_id: df.loc[index, "reviewText_LUKE"]}

    return dic_review, dic_review_LUKE

# Splits data to 80% training, 10% validation, 10% testing
def split_data(amazon_data):
    training = {}
    validation = {}
    test = {}

    for label in amazon_data:
        temp_dic = amazon_data[label]
        lst_amazon_ids = list(temp_dic.keys())
        train_length = int(len(lst_amazon_ids) * 0.8)
        train_ids = lst_amazon_ids[:train_length]
        remaining = lst_amazon_ids[train_length:]
        test_lenght = int(len(remaining) * 0.5)
        test_ids = remaining[:test_lenght]
        validation_id = remaining[test_lenght:]

        for amazon_id in train_ids:
            training[temp_dic[amazon_id]] = label
        for amazon_id in validation_id:
            validation[temp_dic[amazon_id]] = label
        for amazon_id in test_ids:
            test[temp_dic[amazon_id]] = label

    return training, validation, test

dic_review, dic_review_LUKE = read_amazon_product_review(file_name)
training, validation, test = split_data(dic_review)
training_luke, validation_luke, test_luke = split_data(dic_review)

class AmazonProductReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(list(training.keys()), truncation=True, padding=True)
val_encodings = tokenizer(list(validation.keys()), truncation=True, padding=True)
test_encodings = tokenizer(list(test.keys()), truncation=True, padding=True)
train_encodings_luke = tokenizer_LUKE(list(training_luke.keys()), truncation=True, padding=True)
val_encodings_luke = tokenizer_LUKE(list(validation_luke.keys()), truncation=True, padding=True)
test_encodings_luke = tokenizer_LUKE(list(test_luke.keys()), truncation=True, padding=True)
train_dataset = AmazonProductReviewDataset(train_encodings, list(training.values()))
val_dataset = AmazonProductReviewDataset(val_encodings, list(validation.values()))
test_dataset = AmazonProductReviewDataset(test_encodings, list(test.values()))
train_dataset_luke = AmazonProductReviewDataset(train_encodings_luke, list(training.values()))
val_dataset_luke = AmazonProductReviewDataset(val_encodings_luke, list(validation.values()))
test_dataset_luke = AmazonProductReviewDataset(test_encodings_luke, list(test.values()))

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import numpy as np
from datasets import load_metric

model_name = "fine_tune_amazon_Office_Original"
model_name_luke = "fine_tune_amazon_Office_LUKE"

# Keeps the training values the same
learning_rate = 2e-5
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
num_train_epochs = 1
save_strategy = "no"

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   return {"accuracy": accuracy}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator_luke = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
   output_dir=model_name,
   learning_rate=learning_rate,
   per_device_train_batch_size=per_device_train_batch_size,
   per_device_eval_batch_size=per_device_eval_batch_size,
   num_train_epochs=num_train_epochs,
   save_strategy = save_strategy
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=val_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

training_args_luke = TrainingArguments(
   output_dir=model_name_luke,
   learning_rate=learning_rate,
   per_device_train_batch_size=per_device_train_batch_size,
   per_device_eval_batch_size=per_device_eval_batch_size,
   num_train_epochs=num_train_epochs,
   save_strategy = save_strategy
)

trainer_luke = Trainer(
   model=model_luke,
   args=training_args_luke,
   train_dataset=train_dataset_luke,
   eval_dataset=val_dataset_luke,
   tokenizer=tokenizer,
   data_collator=data_collator_luke,
   compute_metrics=compute_metrics,
)

print("Training Original Amazon Product Review Data")
trainer.train()
trainer.save_model()
print(trainer.evaluate())
# Test dataset
trainer.eval_dataset = test_dataset

print("Testing Original Amazon Product Review Data")
print(trainer.evaluate())
trainer.predict(test_dataset)


print("Training LUKE Amazon Product Review Data")
trainer_luke.train()
trainer_luke.save_model()
print(trainer_luke.evaluate())
trainer_luke.eval_dataset = test_dataset

print("Testing LUKE Amazon Product Review Data")
print(trainer_luke.evaluate())
trainer_luke.predict(test_dataset)

Training Original Amazon Product Review Data
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




  0%|          | 0/676 [00:00<?, ?it/s]

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.5327, 'learning_rate': 5.207100591715976e-06, 'epoch': 0.74}
{'train_runtime': 407.4634, 'train_samples_per_second': 26.535, 'train_steps_per_second': 1.659, 'train_loss': 0.5154461155276326, 'epoch': 1.0}


  0%|          | 0/88 [00:00<?, ?it/s]

  load_accuracy = load_metric("accuracy")


{'eval_loss': 0.4499070942401886, 'eval_accuracy': 0.851109520400859, 'eval_runtime': 18.4843, 'eval_samples_per_second': 75.578, 'eval_steps_per_second': 4.761, 'epoch': 1.0}
Testing Original Amazon Product Review Data


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.4362846314907074, 'eval_accuracy': 0.8445706174591909, 'eval_runtime': 19.6091, 'eval_samples_per_second': 71.854, 'eval_steps_per_second': 4.539, 'epoch': 1.0}


  0%|          | 0/89 [00:00<?, ?it/s]

Training LUKE Amazon Product Review Data




  0%|          | 0/676 [00:00<?, ?it/s]

{'loss': 0.5308, 'learning_rate': 5.207100591715976e-06, 'epoch': 0.74}
{'train_runtime': 407.6152, 'train_samples_per_second': 26.525, 'train_steps_per_second': 1.658, 'train_loss': 0.5226578345665565, 'epoch': 1.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.45164045691490173, 'eval_accuracy': 0.8539727988546886, 'eval_runtime': 21.9879, 'eval_samples_per_second': 63.535, 'eval_steps_per_second': 4.002, 'epoch': 1.0}
Testing LUKE Amazon Product Review Data


  0%|          | 0/89 [00:00<?, ?it/s]

Testing Amazon product review model onto other category

In [9]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import numpy as np
from datasets import load_metric
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_name = "all_beauty.csv"

model_name = "fine_tune_amazon_Video_Games_Original"
model_name_luke = "fine_tune_amazon_Video_Games_LUKE"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_luke = AutoTokenizer.from_pretrained(model_name_luke)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)
model_luke = AutoModelForSequenceClassification.from_pretrained(model_name_luke, num_labels=5).to(device)

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   return {"accuracy": accuracy}

def read_amazon_product_review(file_name):
    df = pd.read_csv(file_name, sep=",", header=0)
    dic_review = {}
    dic_review_LUKE = {}
    df1 = df[["reviewerID", "overall", "reviewText_LUKE", "reviewText"]]
    for index in range(len(df1)):
        rating = df.loc[index, "overall"]
        label = int(rating) - 1
        
        review_id = df.loc[index, "reviewerID"]
        if label in dic_review:
            dic_review[label][review_id] = df.loc[index, "reviewText"]
        else:
            dic_review[label] = {review_id: df.loc[index, "reviewText"]}
        if label in dic_review_LUKE:
            dic_review_LUKE[label][review_id] = df.loc[index, "reviewText_LUKE"]
        else:
            dic_review_LUKE[label] = {review_id: df.loc[index, "reviewText_LUKE"]}

    return dic_review, dic_review_LUKE

def data_for_testing(amazon_data):
    test = {}

    for label in amazon_data:
        temp_dic = amazon_data[label]
        lst_amazon_ids = list(temp_dic.keys())

        for amazon_id in lst_amazon_ids:
            test[temp_dic[amazon_id]] = label

    return test

dic_review, dic_review_LUKE = read_amazon_product_review(file_name)
test = data_for_testing(dic_review)
test_luke = data_for_testing(dic_review_LUKE)

class AmazonProductReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

test_encodings = tokenizer(list(test.keys()), truncation=True, padding=True)
test_encodings_luke = tokenizer(list(test_luke.keys()), truncation=True, padding=True)
luke_test_encodings = tokenizer_luke(list(test.keys()), truncation=True, padding=True)
luke_test_encodings_luke = tokenizer_luke(list(test_luke.keys()), truncation=True, padding=True)
test_dataset = AmazonProductReviewDataset(test_encodings, list(test.values()))
test_dataset_luke = AmazonProductReviewDataset(test_encodings_luke, list(test.values()))
luke_test_dataset = AmazonProductReviewDataset(luke_test_encodings, list(test.values()))
luke_test_dataset_luke = AmazonProductReviewDataset(luke_test_encodings_luke, list(test.values()))

def testing_model(model, tokenizer, model_name, data_type_str):
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=model_name,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        save_strategy = "no"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.eval_dataset = test_dataset

    print("Test " + model_name + " -> " + file_name + " " + data_type_str + " Data")
    trainer.predict(test_dataset)
    print(trainer.evaluate())

testing_model(model, tokenizer, model_name, "Original")
testing_model(model_luke, tokenizer, model_name_luke, "Original")
testing_model(model, tokenizer_luke, model_name, "LUKE")
testing_model(model_luke, tokenizer_luke, model_name_luke, "LUKE")

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Test fine_tune_amazon_Video_Games_Original -> all_beauty.csv Original Data


  0%|          | 0/546 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]

{'eval_loss': 0.6952040195465088, 'eval_accuracy': 0.791704857928506, 'eval_runtime': 145.3211, 'eval_samples_per_second': 60.06, 'eval_steps_per_second': 3.757}
Test fine_tune_amazon_Video_Games_LUKE -> all_beauty.csv Original Data


  0%|          | 0/546 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.1054517030715942, 'eval_accuracy': 0.768102658111824, 'eval_runtime': 148.2484, 'eval_samples_per_second': 58.874, 'eval_steps_per_second': 3.683}
Test fine_tune_amazon_Video_Games_Original -> all_beauty.csv LUKE Data


  0%|          | 0/546 [00:00<?, ?it/s]