In [12]:
import pandas as pd
import gzip

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, mean_absolute_error, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
import numpy as np
import json

In [13]:
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Args:
    path: location of a gzip file containing one JSON document per line.

  Yields:
    The object returned by ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  # The context manager closes the handle once the generator is exhausted
  # or closed, instead of leaving it open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip blank lines (e.g. a trailing newline at the end of the dump).
      if l.strip():
        yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record becomes one row; the row index is the record's position
  (0, 1, 2, ...) in the stream.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the gzip-compressed AMAZON_FASHION review file into a DataFrame (one row per JSON record).
df = getDF('../../../data/raw/AMAZON_FASHION.json.gz')

In [14]:
# Drop reviews with no reviewText or summary since we are primarily interested in analyzing review text
df = df.dropna(subset=['reviewText', 'summary'])

# Sample up to 50,000 reviews; take all of them if fewer remain after dropping nulls
# (asking for more rows than exist would raise).
df = df.sample(min(50000, len(df)), random_state=42)

df['overallInt'] = df['overall'].astype(int) - 1 # convert 1-5 to 0-4
df['reviewText'] = df['reviewText'].astype(str)

# Join body and summary with a space so the last word of the review and the
# first word of the summary are not fused into one token. The summary is cast
# to str first so a non-string value cannot break the concatenation.
df['reviewFull'] = df['reviewText'] + ' ' + df['summary'].astype(str)

In [15]:
# Hold out 20% of the sampled reviews; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Wrap each split in a Hugging Face Dataset.
test_dataset = Dataset.from_pandas(df_test)
train_dataset = Dataset.from_pandas(df_train)

In [16]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint; used by tokenize() below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the ``reviewText`` field and attach ``overallInt`` as ``labels``.

    Args:
        batch: mapping with ``reviewText`` and ``overallInt`` entries, e.g. the
            batch of rows passed in by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (including ``input_ids`` and ``attention_mask``)
        with an added ``labels`` entry.
    """
    # Pad every example to the fixed max_length. padding=True only pads to the
    # longest text in the current map() batch, so rows from different map
    # batches could be stored with different sequence lengths, which cannot be
    # stacked into one training batch without a padding collator.
    tokenized_inputs = tokenizer(
        batch['reviewText'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    # Plain Python values are returned: no return_tensors / squeeze(0) is
    # needed (squeeze(0) would drop the batch axis of a one-row batch), and the
    # datasets are converted to torch tensors later via set_format.
    tokenized_inputs["labels"] = batch['overallInt']

    return tokenized_inputs

# Build each split's Dataset from its DataFrame and tokenize it in batches (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame).map(tokenize, batched=True)
    for frame in (df_train, df_test)
)

Map: 100%|██████████| 40000/40000 [00:09<00:00, 4400.01 examples/s]
Map: 100%|██████████| 10000/10000 [00:02<00:00, 4385.05 examples/s]


In [17]:
# Expose only the model inputs and labels, returned as torch tensors.
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [18]:
# Size the classification head from the encoded label range: labels are the
# 0-based `overallInt` values, so the head needs max(label) + 1 outputs.
# Counting distinct ratings instead would under-size the head (and make the
# largest label out of range) if some rating is absent from the sample.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=int(df['overallInt'].max()) + 1
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Function to compute metrics
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class indices) and
            ``predictions`` (2-D array of per-class scores, one column per class).

    Returns:
        dict with ``accuracy``, weighted ``f1``/``precision``/``recall``,
        ``mae`` between true and predicted class indices, and one
        ``roc_auc_class_<i>`` entry per score column. A class's ROC AUC is
        NaN when it is undefined for the evaluated labels.
    """
    labels = pred.label_ids
    preds = pred.predictions

    # Hard predictions are needed for accuracy, precision, recall, and F1
    hard_preds = np.argmax(preds, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, hard_preds, average='weighted')
    acc = accuracy_score(labels, hard_preds)
    mae = mean_absolute_error(labels, hard_preds)

    # One-vs-rest ROC AUC per class, scored on that class's raw output column.
    roc_auc = {}
    for i in range(preds.shape[1]):  # Iterate over each class
        is_class = (labels == i).astype(int)
        if np.unique(is_class).size < 2:
            # ROC AUC is undefined when the class is absent from (or is the
            # only class in) the labels; report NaN instead of letting the
            # scorer abort the whole evaluation.
            roc_auc[f"roc_auc_class_{i}"] = float('nan')
        else:
            roc_auc[f"roc_auc_class_{i}"] = roc_auc_score(is_class, preds[:, i])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mae': mae,
        **roc_auc  # This will expand the dictionary to include the roc_auc for each class
    }

training_args = TrainingArguments(
    # Output locations.
    # NOTE(review): both paths are absolute and point at the filesystem root,
    # unlike the relative paths used elsewhere in this notebook — confirm this is intended.
    output_dir='/results',
    logging_dir='/logs',
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Evaluate and checkpoint once per epoch; the matching strategies allow the
    # best checkpoint to be restored when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Log training metrics every 10 steps.
    logging_steps=10,
)

In [20]:
# Assemble the Trainer from the model, arguments, tokenized splits and metric function.
# NOTE(review): the held-out test split is also the per-epoch evaluation set — confirm
# that using it for checkpoint selection is acceptable.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning; the call's return value is displayed as the cell output.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/939 [19:55<?, ?it/s]


  1%|          | 10/939 [16:39<25:50:18, 100.13s/it]

{'loss': 1.5444, 'grad_norm': 2.2187864780426025, 'learning_rate': 9.893503727369543e-06, 'epoch': 0.03}


  2%|▏         | 20/939 [32:49<24:07:26, 94.50s/it] 

{'loss': 1.4321, 'grad_norm': 1.6585981845855713, 'learning_rate': 9.787007454739084e-06, 'epoch': 0.06}


  3%|▎         | 30/939 [48:13<23:49:11, 94.34s/it]

{'loss': 1.35, 'grad_norm': 1.3057266473770142, 'learning_rate': 9.680511182108628e-06, 'epoch': 0.1}


  4%|▍         | 40/939 [1:04:15<23:58:11, 95.99s/it]

{'loss': 1.2543, 'grad_norm': 1.159224271774292, 'learning_rate': 9.57401490947817e-06, 'epoch': 0.13}


  5%|▍         | 44/939 [1:10:32<23:22:18, 94.01s/it]

KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model to the project's models directory.
# NOTE(review): only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained('../../../models/distilbert_amazon_fashion_ver2')

In [None]:
# Evaluate on the Trainer's eval_dataset; the returned metrics are shown as the cell output.
trainer.evaluate()