In [2]:
import torch
import evaluate
import numpy as np
import pandas as pd


from os.path import join
from datasets import load_dataset

from nltk.stem import WordNetLemmatizer
import gensim.parsing.preprocessing as gensim_preprocessing
import gensim.utils as gensim_utils

from tqdm.notebook import tqdm
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import BertTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bare expression so the notebook displays the selected device.
device

device(type='cpu')

### Preprocessing Training Data & Testing Data

In [4]:
# Directory that holds the raw train/test JSON files.
data_dir = "data/"

# Map each split name to its file, then load the two files as separate
# datasets (train first, test second).
train_data_files = dict(train=join(data_dir, "train.json"))
test_data_files = dict(test=join(data_dir, "test.json"))

train_dataset = load_dataset("json", data_files=train_data_files)
test_dataset = load_dataset("json", data_files=test_data_files)

Generating train split: 35000 examples [00:00, 287297.40 examples/s]
Generating test split: 35000 examples [00:00, 133812.53 examples/s]


In [5]:
def preprocess_text(text):
    """Normalize a raw review string with a fixed chain of gensim filters.

    The input is lower-cased, stripped of surrounding whitespace and converted
    to unicode, then passed through each gensim preprocessing filter in the
    order listed below.

    Args:
        text: Raw review text (a string).

    Returns:
        The cleaned text after all filters have been applied.
    """
    cleaned = gensim_utils.to_unicode(text.lower().strip())
    # Order matters: each filter receives the output of the previous one.
    pipeline = (
        gensim_preprocessing.strip_tags,
        gensim_preprocessing.strip_punctuation,
        gensim_preprocessing.strip_multiple_whitespaces,
        gensim_preprocessing.strip_numeric,
        gensim_preprocessing.remove_stopwords,
        gensim_preprocessing.strip_short,
    )
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

def lemmatize(text):
    """Lemmatize every whitespace-separated token of ``text``.

    ``WordNetLemmatizer.lemmatize`` works on a single word. Passing it a whole
    review treats the sentence as one (unknown) word and returns it unchanged,
    so the text is split into tokens, each token is lemmatized, and the result
    is joined back with single spaces.

    Args:
        text: Already-cleaned text whose words are separated by whitespace.

    Returns:
        The text with each token replaced by its lemma. Empty input yields an
        empty string.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in text.split())

In [6]:
def preprocess_train(b):
    """Prepare one training example: one-hot label vector plus cleaned text.

    Args:
        b: Example mapping with at least a 'rating' and a 'text' entry.

    Returns:
        The same mapping with a 5-element 'labels' list (all zeros except a 1
        at position ``rating - 1``) and 'text' replaced by its cleaned,
        lemmatized form.
    """
    one_hot = [0.] * 5
    # Ratings are 1-based, list positions are 0-based.
    one_hot[int(b['rating'])-1] = 1
    b['labels'] = one_hot
    cleaned = preprocess_text(b['text'])
    b['text'] = lemmatize(cleaned)
    return b

def preprocess_test(b):
    """Prepare one test example by cleaning and lemmatizing its text.

    Args:
        b: Example mapping with a 'text' entry.

    Returns:
        The same mapping with 'text' replaced by its processed form; no label
        is added.
    """
    cleaned = preprocess_text(b['text'])
    b['text'] = lemmatize(cleaned)
    return b

In [7]:
# Run the per-example preprocessing over both datasets. The train variant also
# adds the one-hot `labels` column. Note that both names are rebound to the
# processed datasets, so re-running this cell processes already-processed text.
train_dataset = train_dataset.map(preprocess_train)
test_dataset = test_dataset.map(preprocess_test)

Map: 100%|██████████| 35000/35000 [00:02<00:00, 12491.59 examples/s]
Map: 100%|██████████| 35000/35000 [00:01<00:00, 17694.79 examples/s]


### Tokenize

In [8]:
# Pretrained checkpoint name; reused later when the classification model is
# loaded, so the tokenizer and the model come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def combime_title_and_text(b):
    """Tokenize one example as a single "title + review" prompt.

    The title and the (preprocessed) review text are merged into one string,
    which is tokenized with truncation to at most 128 tokens.

    Args:
        b: Example mapping with 'title' and 'text' entries.

    Returns:
        The tokenizer output for the combined string.
    """
    prompt = f"title:{b['title']}\nreview:{b['text']}"
    encoded = tokenizer(prompt, padding=True, truncation=True, max_length=128)
    return encoded


# Raw columns dropped once each example has been tokenized. The train list
# additionally drops 'rating'; the 'labels' column created during
# preprocessing is not in either list and is therefore kept.
test_drop_columns = ["verified_purchase", "title", "text", "helpful_vote"]
train_drop_columns = ["verified_purchase", "title", "text", "helpful_vote", "rating"]

tokenized_train_dataset = train_dataset.map(
    combime_title_and_text, remove_columns=train_drop_columns
)
tokenized_test_dataset = test_dataset.map(
    combime_title_and_text, remove_columns=test_drop_columns
)

Map: 100%|██████████| 35000/35000 [00:12<00:00, 2839.80 examples/s]
Map: 100%|██████████| 35000/35000 [00:11<00:00, 2965.68 examples/s]


In [9]:
# Hold out 10% of the tokenized training examples as an evaluation split; the
# fixed seed makes the split reproducible across runs.
new_train_dataset = tokenized_train_dataset["train"].train_test_split(test_size=0.1, seed=17)

### Metrics

In [10]:
# Bundle the four metrics so that a single compute() call reports all of them.
metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula overflows inside ``exp(-x)`` for large negative ``x``
    (very early for float16 inputs) and emits RuntimeWarnings. Here the
    exponential is only ever taken of a non-positive number, so the
    intermediate value always stays within [0, 1].

    Args:
        x: Scalar or array-like of real numbers (e.g. raw logits).

    Returns:
        The element-wise sigmoid, with the same shape as ``x``. A scalar input
        gives a scalar result.
    """
    values = np.asarray(x)
    if not np.issubdtype(values.dtype, np.floating):
        # Integer/bool input: compute in float64, as np.exp would.
        values = values.astype(np.float64)
    decay = np.exp(-np.abs(values))  # within [0, 1], cannot overflow
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the equivalent e^x / (1 + e^x).
    result = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves real arrays untouched.
    return result[()]

def compute_metrics(eval_pred):
    """Score thresholded predictions against the reference labels.

    The raw model outputs are passed through the sigmoid, thresholded at 0.5
    and flattened. The labels are cast to int and flattened the same way, so
    every (example, class) position is scored as one binary decision.

    Args:
        eval_pred: A (predictions, labels) pair of arrays.

    Returns:
        The result of the combined accuracy/F1/precision/recall computation.
    """
    logits, labels = eval_pred
    probabilities = sigmoid(logits)
    hard_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = labels.astype(int).reshape(-1)
    return metrics.compute(predictions=hard_predictions, references=flat_references)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 8.72MB/s]
Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 9.29MB/s]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 14.6MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 7.25MB/s]


In [None]:
# Fine-tune a 5-output classification head on top of the pretrained encoder.
# NOTE(review): the training labels are float one-hot vectors, which presumably
# makes transformers pick a multi-label (BCE) loss -- confirm that is intended.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

batch_size = 32
# Pads each batch to the length of its longest example at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed-precision training needs a CUDA device; enabling it
    # unconditionally makes this cell fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_dataset['train'],
    eval_dataset=new_train_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Trainer handles device placement itself, so it is not wrapped with
# Accelerator.prepare (which only acts on models, optimizers, dataloaders and
# schedulers and would hand the Trainer back unchanged).
trainer.train()
# The directory name (including its spelling) must match the path the
# inference cell loads from.
trainer.save_model("chekpoints/checkpoint-240512-001")

In [None]:
# Reload the fine-tuned weights saved by the training cell and switch the
# model to evaluation mode on the selected device.
checkpoint = "chekpoints/checkpoint-240512-001"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)
model.to(device)
model.eval()

# Batches of test examples, padded per batch by the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
eval_dataloader = DataLoader(
    tokenized_test_dataset["test"], batch_size=32, collate_fn=data_collator
)

predictions = []
with torch.no_grad():  # inference only: no gradients needed
    for batch in tqdm(eval_dataloader):
        batch = batch.to(device)
        outputs = model(**batch)
        predicted_classes = outputs.logits.argmax(dim=1).tolist()
        # Class indices are 0-based; ratings start at 1.
        predictions.extend(class_id + 1 for class_id in predicted_classes)

# One "index_<i>" identifier per test row, in dataset order.
index = [f"index_{i}" for i in range(tokenized_test_dataset["test"].num_rows)]

results = pd.DataFrame({"index": index, "rating": predictions})
results.to_csv("submission.csv", index=False)