In [1]:
import pandas as pd
import gzip

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, mean_absolute_error, roc_auc_score, \
    classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import numpy as np
import json
from tqdm import tqdm


In [2]:

def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: location of a gzip file whose lines are each a JSON document.

  Yields:
    The object produced by json.loads for each line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a line is not valid JSON.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted, explicitly closed, or garbage-collected; the previous version
  # left the file open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Load every record yielded by parse(path) into a DataFrame.

  Each record becomes one row, keyed by its 0-based position in the file.
  """
  # Map position -> record in one pass, then let pandas treat the keys as the row index.
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Read the raw review dump (gzip-compressed, one JSON record per line) into a DataFrame.
df = getDF('../../../data/raw/AMAZON_FASHION.json.gz')

In [3]:
# Review text is the model input, so rows without it are discarded first.
# Then derive an integer rating, force the text to str, and mirror the text
# into 'reviewFull'.
df = (
    df.dropna(subset=['reviewText'])
      .assign(
          overallInt=lambda frame: frame['overall'].astype(int),
          reviewText=lambda frame: frame['reviewText'].astype(str),
          # Evaluated after the line above, so this copies the str-typed text.
          reviewFull=lambda frame: frame['reviewText'].astype(str),
      )
)
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,overallInt,reviewFull
0,5.0,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,,5,Exactly what I needed.
1,2.0,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,,2,"I agree with the other review, the opening is ..."
2,4.0,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,,4,Love these... I am going to order another pack...
3,2.0,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,,2,too tiny an opening
4,3.0,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,,3,Okay


In [4]:
# Keep relevant columns and shift the star ratings down by one so that the
# class ids start at 0 (ratings 1..5 become labels 0..4).
#
# The shifted columns are built with assign() on the column selection instead
# of being written back onto it: item assignment on the result of df[[...]]
# is the pattern that triggers pandas' SettingWithCopyWarning. Plain
# subtraction is vectorised and yields the same values as the former
# element-wise apply(lambda x: x - 1).
#
# NOTE: this cell rebinds `df` from itself, so running it twice shifts the
# labels twice. Re-run from the data-loading cell if it needs to be repeated.
df = df[['reviewText', 'overall', 'overallInt', 'reviewFull']].assign(
    overallInt=lambda frame: frame['overallInt'] - 1,
    overall=lambda frame: frame['overall'] - 1,
)

df.head()

Unnamed: 0,reviewText,overall,overallInt,reviewFull
0,Exactly what I needed.,4.0,4,Exactly what I needed.
1,"I agree with the other review, the opening is ...",1.0,1,"I agree with the other review, the opening is ..."
2,Love these... I am going to order another pack...,3.0,3,Love these... I am going to order another pack...
3,too tiny an opening,1.0,1,too tiny an opening
4,Okay,2.0,2,Okay


In [5]:
# Split data into train and test sets: 20% is held out for testing, and the
# fixed random_state makes the split reproducible across runs.
# NOTE(review): the split is not stratified, while the classification report
# further down shows unequal class supports. Presumably the saved model was
# trained on this exact split, so confirm before changing it.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Wrap both pandas splits as `datasets.Dataset` objects.
# NOTE(review): the tokenization cell below rebuilds both datasets from
# df_train / df_test and rebinds these two names, so this cell looks redundant.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [7]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize a batch of reviews and attach their class labels.

    Written for `Dataset.map(..., batched=True)`.

    Args:
        batch: mapping of column name to a list of values; reads the
            'reviewText' (str) and 'overallInt' (0-based rating) columns.

    Returns:
        The tokenizer output for the review texts with an extra "labels"
        entry holding the matching 'overallInt' values.
    """
    # Pad to the fixed max_length rather than to the longest text of the
    # current map batch: every stored row then has the same length, so
    # batches drawn later by a DataLoader can always be stacked.
    tokenized_inputs = tokenizer(batch['reviewText'], padding='max_length', truncation=True, max_length=128)
    # Plain Python lists are returned on purpose. The earlier
    # return_tensors='pt' + squeeze(0) combination removed the batch dimension
    # whenever a map batch contained exactly one example, which corrupts the
    # row layout; tensor conversion is handled later by set_format('torch').
    tokenized_inputs["labels"] = batch['overallInt']

    return tokenized_inputs

# Build each split from its DataFrame, then tokenize it batch-by-batch.
train_dataset = Dataset.from_pandas(df_train)
train_dataset = train_dataset.map(tokenize, batched=True)

test_dataset = Dataset.from_pandas(df_test)
test_dataset = test_dataset.map(tokenize, batched=True)



Map:   0%|          | 0/705922 [00:00<?, ? examples/s]



Map:   0%|          | 0/176481 [00:00<?, ? examples/s]

In [8]:
# Return only the model inputs and the labels, as torch tensors, when indexing either split.
for _split in (train_dataset, test_dataset):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Function to compute metrics
def compute_metrics(pred):
    """Compute classification metrics from per-class scores and true labels.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (2-D array of scores, one column per class).
            # presumably the EvalPrediction a transformers Trainer passes to
            # its compute_metrics callback; verify against the caller.

    Returns:
        dict with 'accuracy', weighted 'f1' / 'precision' / 'recall', 'mae'
        (mean absolute error between predicted and true class ids) and one
        'roc_auc_class_{i}' entry per class. A class's ROC AUC is NaN when it
        is undefined for the given labels.
    """
    # np.asarray makes the `labels == i` comparison below element-wise even
    # if the labels arrive as a plain list.
    labels = np.asarray(pred.label_ids)
    preds = pred.predictions

    # Hard predictions are needed for accuracy, precision, recall, and F1
    hard_preds = np.argmax(preds, axis=1)

    # zero_division=0 keeps the value scikit-learn already falls back to for
    # classes that are never predicted / never present, without the warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, hard_preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, hard_preds)
    mae = mean_absolute_error(labels, hard_preds)

    # One-vs-rest ROC AUC per class, ranking examples by that class's raw score
    roc_auc = {}
    for i in range(preds.shape[1]):  # Iterate over each class
        is_class_i = (labels == i).astype(int)
        # roc_auc_score raises ValueError unless both positives and negatives
        # are present, e.g. when a class is missing from a small eval set.
        if is_class_i.all() or not is_class_i.any():
            roc_auc[f"roc_auc_class_{i}"] = float('nan')
        else:
            roc_auc[f"roc_auc_class_{i}"] = roc_auc_score(is_class_i, preds[:, i])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mae': mae,
        **roc_auc  # This will expand the dictionary to include the roc_auc for each class
    }


In [9]:
# Load the sequence-classification model saved in the local models directory.
# NOTE(review): presumably the DistilBERT checkpoint fine-tuned on this dataset
# (inferred from the directory name); confirm it matches the
# 'distilbert-base-uncased' tokenizer used above.
model = AutoModelForSequenceClassification.from_pretrained('../../../models/distilbert_amazon_fashion_ver2')

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Move the model onto the selected device; the evaluation loop moves each batch to the same device.
model.to(device)


In [35]:
# DataLoader is already imported in the notebook's import cell, so it is not
# re-imported here.

# Create a DataLoader for test_dataset
test_dataloader = DataLoader(test_dataset, batch_size=128)

# Switch the model to evaluation mode before running inference.
model.eval()
predictions = []
labels = []

# No gradients are needed for evaluation, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # tolist() copies to the CPU and yields plain Python ints, so the
        # collected lists display as e.g. [4, 3, ...] on any NumPy version.
        predictions.extend(outputs.logits.argmax(dim=-1).tolist())
        labels.extend(batch['labels'].tolist())

print(classification_report(labels, predictions))

  input_ids = torch.tensor(test_dataset['input_ids'])
100%|██████████| 1379/1379 [04:53<00:00,  4.69it/s]


              precision    recall  f1-score   support

           0       0.68      0.81      0.74     21276
           1       0.43      0.24      0.31     12966
           2       0.48      0.51      0.49     19390
           3       0.56      0.39      0.46     29979
           4       0.84      0.93      0.89     92870

    accuracy                           0.73    176481
   macro avg       0.60      0.58      0.58    176481
weighted avg       0.71      0.73      0.71    176481



In [36]:
# Spot-check: first ten predicted class ids next to the first ten true labels.
predictions[:10], labels[:10]

([4, 4, 4, 0, 4, 4, 3, 4, 2, 3], [4, 3, 4, 1, 2, 4, 4, 4, 2, 4])