In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
dataset_path = "/content/drive/MyDrive/CS497/Final Project/Fake reviews data files/generated-datasets/"
model_weights_path = "/content/drive/MyDrive/CS497/Final Project/Fake reviews data files/gpt-2models/models/"

In [3]:
!pip install -q transformers

In [4]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
import json
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, Subset
from transformers import AutoTokenizer, GPT2TokenizerFast, TFGPT2Model, GPT2LMHeadModel, GPT2Config
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [6]:
file_name = dataset_path + "Generated_Sentences_Books_5.csv"
df = pd.read_csv(file_name)

print(df[df['label'] == 'CG'].head())
print(df[df['label'] == 'OR'].head())

  category  rating label                                              text_
0  Books_5     5.0    CG  I'm hooked on this writer and will be reading ...
1  Books_5     5.0    CG  Good book and exactly as described. The charac...
2  Books_5     5.0    CG  Fitting that the ultimate survivor of an epide...
3  Books_5     5.0    CG  Absolutely loved every word!  We have the book...
4  Books_5     5.0    CG  I loved it! It was a real, believable, and eng...
   category  rating label                                              text_
24  Books_5     4.0    OR  It's witty and intriguing. I hang on to every ...
25  Books_5     5.0    OR  Good read. I really enjoyed the characters in ...
26  Books_5     5.0    OR  It's Dr. Seuss, how can you miss??? Great qual...
27  Books_5     5.0    OR  Great read.  Makes you wish you lived in Nantu...
28  Books_5     4.0    OR  A good read which keeps you wondering until th...


Splitting into training, validation, and testing sets

In [13]:
train_df, temp_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f"FULL Dataset: {len(df)}")
print(f"TRAIN Dataset Size: {len(train_df)}")
print(f"VALID Dataset Size: {len(valid_df)}")

print("\nTraining set:")
print(train_df['label'].value_counts(normalize=True))
print(train_df.head(2))

print("\nValidation set:")
print(valid_df['label'].value_counts(normalize=True))
print(valid_df.head(2))

print("\nTest set:")
print(test_df['label'].value_counts(normalize=True))
print(test_df.head(2))


FULL Dataset: 4370
TRAIN Dataset Size: 3933
VALID Dataset Size: 218

Training set:
label
OR    0.500127
CG    0.499873
Name: proportion, dtype: float64
     category  rating label                                              text_
1264  Books_5     5.0    OR  I love when King really gets inside the head o...
3446  Books_5     4.0    CG  This is a really moving book. The characters a...

Validation set:
label
CG    0.5
OR    0.5
Name: proportion, dtype: float64
     category  rating label                                              text_
3796  Books_5     1.0    CG  Unfortunately this book falls far short of the...
3065  Books_5     5.0    CG  Carl Sagan is an astounding writer. The story ...

Test set:
label
CG    0.502283
OR    0.497717
Name: proportion, dtype: float64
     category  rating label                                              text_
2775  Books_5     4.0    CG  I really liked the overall story. The characte...
643   Books_5     5.0    CG  great book  I still find it ent

Instantiating and testing the tokenizer

In [None]:
# We get the max and mean length of the text to set an appropriate
# max_length parameter for tokenization
max_length = df['text_'].apply(len).max()
average_length = df['text_'].apply(len).mean()

print(f"Maximum length of text: {max_length}")
print(f"Average length of text: {average_length}")

Maximum length of text: 2114
Average length of text: 432.34279176201375


In [None]:
# Choose a maximum input sequence length of 512
MAX_LEN = 512

# Initialize tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Assuming labels are stored as 'OR' and 'CG' in the 'label' column
labels_dict = {'OR': 0, 'CG': 1}

class ReviewDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        return {
            'text_': self.dataframe.iloc[idx]['text_'],
            'label': self.dataframe.iloc[idx]['label']
        }

# Custom collate function to handle tokenization and batching
def collate_fn(batch):
    texts = [item['text_'] for item in batch]
    class_labels = [labels_dict[item['label']] for item in batch]  # Convert labels to indices

    # Calculate lengths before padding for use in the model
    lengths = [len(tokenizer.encode(text)) for text in texts]

    encodings = tokenizer(texts,
                          return_tensors='pt',
                          max_length=MAX_LEN,
                          padding='max_length',
                          truncation=True)

    # Add labels to the dictionary
    encodings['labels'] = torch.tensor(class_labels, dtype=torch.long)
    encodings['lengths'] = torch.tensor(lengths)

    return encodings

# Creating the dataset and data loader
example_df = pd.DataFrame({
    'category': ['Books_5', 'Books_5'],
    'rating': [4.0, 5.0],
    'label': ['CG', 'OR'],
    'text_': ['I really liked the overall story...', 'great book I still find it entertaining...']
})

dataset = ReviewDataset(example_df)
example_data_loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

# Iterate over DataLoader
for batch in example_data_loader:
    decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['input_ids']]
    print(decoded_texts)
    print(batch['input_ids'])
    print(batch['attention_mask'])  # Attention masks
    print(batch['lengths']) # Length of original input sequence without appended padding tokens
    print(batch['labels'])  # Labels


['I really liked the overall story...', 'great book I still find it entertaining...']
tensor([[   40,  1107,  8288,  ..., 50256, 50256, 50256],
        [18223,  1492,   314,  ..., 50256, 50256, 50256]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([7, 8])
tensor([1, 0])


Tokenizing the datasets and creating dataloaders

In [None]:
BATCHSIZE = 32

train_dataset = ReviewDataset(train_df)
valid_dataset = ReviewDataset(valid_df)
test_dataset = ReviewDataset(test_df)

exclude_batches = [25, 41]  # Zero-indexed; hence, 26-1 and 42-1
exclude_indices = []
for batch_num in exclude_batches:
    start_index = batch_num * BATCHSIZE
    end_index = start_index + BATCHSIZE
    exclude_indices.extend(range(start_index, end_index))

# Calculate the indices to include
all_indices = set(range(len(train_dataset)))
include_indices = list(all_indices - set(exclude_indices))

# Create a new subset of data excluding the specified batches
filtered_train_dataset = Subset(train_dataset, include_indices)

train_dataloader = DataLoader(filtered_train_dataset, batch_size=BATCHSIZE, collate_fn=collate_fn, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCHSIZE, collate_fn=collate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCHSIZE, collate_fn=collate_fn, shuffle=True)

Initializing the researchers' fine-tuned model for Book reviews

In [None]:
# Initialize the model with the GPT-2 configuration
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Retrieve the fine-tuned model checkpoint
model_path = model_weights_path + "gpt2-finetuned-amazon-reviews-sample-1cycle-Books_5.pth"
checkpoint = torch.load(model_path, map_location=device)

# Access the model state_dict
model_state_dict = checkpoint['model']

# Load the state dictionary
model.load_state_dict(model_state_dict, strict = False)
model.to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

Adding a head to the model to retask the fine-tuned gpt-2 generator model for discrimination

In [None]:
class GPT2Discriminator(torch.nn.Module):
    def __init__(self, gpt2_model):
        super(GPT2Discriminator, self).__init__()
        self.l1 = gpt2_model  # Pre-trained & fine-tuned model
        self.dropout = torch.nn.Dropout(0.1)  # Dropout layer
        self.ln = torch.nn.LayerNorm(model.config.n_embd)  # Layer normalization
        self.l3 = torch.nn.Linear(model.config.n_embd, 2)  # Linear layer (tensor: [768, 2]) maps output to our two classes

    def forward(self, input_ids, attention_mask, seq_lengths):
        outputs = self.l1(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # We extract representations of the last non-padding token in our input sequence from the final hidden state of the model
        last_hidden_state = outputs.hidden_states[-1]  # tensor: [batch_size, seq_len=512, hidden_size=768]

        # Apply dropout and layer normalization
        last_hidden_state = self.dropout(last_hidden_state)
        last_hidden_state = self.ln(last_hidden_state)

        batch_size = last_hidden_state.size(0)  # typically 64
        seq_indices = torch.arange(batch_size, device=input_ids.device)  # tensor: [batch_size]
        last_token_indices = seq_lengths - 1  # tensor: [batch_size]
        cls_output = last_hidden_state[seq_indices, last_token_indices]  # Shape: [batch_size, hidden_size=768]

        # Compute the logits using the linear layer
        logits = self.l3(cls_output)  # Shape: [batch_size, 2]

        return logits


model = GPT2Discriminator(model)
model.to(device)

GPT2Discriminator(
  (l1): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (dropout): Dropout(p=0.1, inpla

In [None]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params/1000000:.0f}M")

Trainable parameters: 124M


Instantiating training metrics

In [None]:
# Returns the class from the logits returned by the model
def get_preds(logits):
    return torch.argmax(logits, dim=1)

# Returns the number of correct classifications in a batch
def get_count_correct(choices, labels):
    matches = torch.eq(choices, labels)
    return torch.sum(matches).item()

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
!pip install torchmetrics
from torchmetrics import Accuracy, Precision, Recall, F1Score

Collecting torchmetrics
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Col

Training

In [None]:
# Define hyperparameters
LEARNING_RATE = 1e-5
EPOCHS = 4

# Initialize the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Create the optimizer
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Initialize best loss to a high value
best_validation_loss = float('inf')

# Metrics for binary classification
accuracy = Accuracy(num_classes=2, average='macro', task="binary").to(device)
precision = Precision(num_classes=2, average='macro', task="binary").to(device)
recall = Recall(num_classes=2, average='macro', task="binary").to(device)
f1 = F1Score(num_classes=2, average='macro', task="binary").to(device)

# Training loop
print("training model...")
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        lengths = batch['lengths'].to(device)
        labels = batch['labels'].to(device)  # tensor: [batch_size]

        # Forward pass
        logits = model(input_ids, attention_mask, lengths)  # tensor: [batch_size, 2]

        # Compute loss
        loss = loss_fn(logits, labels)
        total_train_loss += loss.item()

        # Update metrics
        preds = get_preds(logits)
        accuracy.update(preds, labels)
        precision.update(preds, labels)
        recall.update(preds, labels)
        f1.update(preds, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_accuracy = accuracy.compute()
    train_precision = precision.compute()
    train_recall = recall.compute()
    train_f1 = f1.compute()
    print(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {total_train_loss / len(train_dataloader):.4f}, "
          f"Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, "
          f"Recall: {train_recall:.4f}, F1: {train_f1:.4f}")

    # Validation phase
    model.eval()
    total_validation_loss = 0
    accuracy.reset()
    precision.reset()
    recall.reset()
    f1.reset()
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            lengths = batch['lengths'].to(device)
            labels = batch['labels'].to(device)  # tensor: [batch_size]

            logits = model(input_ids, attention_mask, lengths) # tensor: [batch_size, 2]

            # Calculate loss
            loss = loss_fn(logits, labels)
            total_validation_loss += loss.item()

            # Update metrics
            preds = get_preds(logits)
            accuracy.update(preds, labels)
            precision.update(preds, labels)
            recall.update(preds, labels)
            f1.update(preds, labels)

    avg_validation_loss = total_validation_loss / len(valid_dataloader)
    valid_accuracy = accuracy.compute()
    valid_precision = precision.compute()
    valid_recall = recall.compute()
    valid_f1 = f1.compute()
    print(f"Epoch {epoch+1}/{EPOCHS}, Validation Loss: {avg_validation_loss:.4f}, "
          f"Accuracy: {valid_accuracy:.4f}, Precision: {valid_precision:.4f}, "
          f"Recall: {valid_recall:.4f}, F1: {valid_f1:.4f}")

    # Save the model if it has the best validation loss so far
    if avg_validation_loss < best_validation_loss:
        best_validation_loss = avg_validation_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/CS497/Final Project/gpt2_discriminator_Books_state_dict.pth')
        print("Saved Best Model")

training model...
Epoch 1/4, Training Loss: 0.4629, Accuracy: 0.7795, Precision: 0.7907, Recall: 0.7592, F1: 0.7746
Epoch 1/4, Validation Loss: 0.2534, Accuracy: 0.9220, Precision: 0.9035, Recall: 0.9450, F1: 0.9238
Saved Best Model
Epoch 2/4, Training Loss: 0.2005, Accuracy: 0.9339, Precision: 0.9443, Recall: 0.9221, F1: 0.9330
Epoch 2/4, Validation Loss: 0.1566, Accuracy: 0.9404, Precision: 0.9138, Recall: 0.9725, F1: 0.9422
Saved Best Model
Epoch 3/4, Training Loss: 0.1106, Accuracy: 0.9626, Precision: 0.9614, Recall: 0.9637, F1: 0.9625
Epoch 3/4, Validation Loss: 0.1245, Accuracy: 0.9358, Precision: 0.9060, Recall: 0.9725, F1: 0.9381
Saved Best Model
Epoch 4/4, Training Loss: 0.0720, Accuracy: 0.9726, Precision: 0.9707, Recall: 0.9745, F1: 0.9726
Epoch 4/4, Validation Loss: 0.0734, Accuracy: 0.9771, Precision: 0.9815, Recall: 0.9725, F1: 0.9770
Saved Best Model


Testing on the Book reviews dataset

In [None]:
# Assuming model has been loaded and set to the appropriate device
model.load_state_dict(torch.load('/content/drive/MyDrive/CS497/Final Project/gpt2_discriminator_Books_state_dict.pth'))
model.to(device)
model.eval()

# Metrics for binary classification
accuracy = Accuracy(num_classes=2, average='macro', task="binary").to(device)
precision = Precision(num_classes=2, average='macro', task="binary").to(device)
recall = Recall(num_classes=2, average='macro', task="binary").to(device)
f1 = F1Score(num_classes=2, average='macro', task="binary").to(device)

# Testing loop
print("testing model...")
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        lengths = batch['lengths'].to(device)
        labels = batch['labels'].to(device)  # tensor: [batch_size]

        logits = model(input_ids, attention_mask, lengths)

        # Update metrics
        preds = get_preds(logits)
        accuracy.update(preds, labels)
        precision.update(preds, labels)
        recall.update(preds, labels)
        f1.update(preds, labels)

# Print the fine-tuned test accuracy
test_accuracy = accuracy.compute()
test_precision = precision.compute()
test_recall = recall.compute()
test_f1 = f1.compute()
print()
print("Category: Books")
print(f"Test Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, "
      f"Recall: {test_recall:.4f}, F1: {test_f1:.4f}")
print(68*"-")

testing model...

Category: Books
Test Accuracy: 0.9726, Precision: 0.9815, Recall: 0.9636, F1: 0.9725
--------------------------------------------------------------------


Evaluating discriminator accuracy on reviews of the other Amazon product review categories to examine generalization

In [None]:
categories = ['Clothing Shoes and Jewelry',
              'Electronics',
              'Home and Kitchen',
              'Kindle Store',
              'Movies and TV',
              'Pet Supplies',
              'Sports and Outdoors',
              'Tools and Home Improvement',
              'Toys and Games']

dataloaders = []

for category in categories:
    category_path = category.replace(' ', '_')
    file_name = dataset_path + f"Generated_Sentences_{category_path}_5.csv"
    df = pd.read_csv(file_name)

    _, other_test_df = train_test_split(df, test_size=0.05, stratify=df['label'], random_state=42)

    print()
    print(f"Category: {category}")
    print("Testing set...")
    print(other_test_df['label'].value_counts(normalize=True))
    print(other_test_df.head())
    print(55*"-")

    other_test_dataset = ReviewDataset(other_test_df)
    other_test_dataloader = DataLoader(other_test_dataset, batch_size=BATCHSIZE, collate_fn=collate_fn, shuffle=True)
    dataloaders.append(other_test_dataloader)


Category: Clothing Shoes and Jewelry
Testing set...
label
CG    0.502591
OR    0.497409
Name: proportion, dtype: float64
                          category  rating label  \
3373  Clothing_Shoes_and_Jewelry_5     1.0    CG   
2359  Clothing_Shoes_and_Jewelry_5     4.0    CG   
652   Clothing_Shoes_and_Jewelry_5     5.0    CG   
1017  Clothing_Shoes_and_Jewelry_5     5.0    OR   
296   Clothing_Shoes_and_Jewelry_5     4.0    OR   

                                                  text_  
3373  I'm 5'3" and 173lbs, 38D.  I wear a medium in ...  
2359  The boots are a little tight on the toes, but ...  
652   The large was a perfect fit. The wide knee par...  
1017  My daughter abusulutley loved them. she says  ...  
296   The shoes are great.  They run a little (half ...  
-------------------------------------------------------

Category: Electronics
Testing set...
label
CG    0.5
OR    0.5
Name: proportion, dtype: float64
           category  rating label  \
3448  Electronics_5     5.0

In [None]:
# Assuming our fine-tuned model has been loaded and set to the appropriate device
model.load_state_dict(torch.load('/content/drive/MyDrive/CS497/Final Project/gpt2_discriminator_Books_state_dict.pth'))
model.to(device)
model.eval()

# Evaluate metrics on the other product categories (excluding Books) on which we did not fine-tune
for i, dataloader in enumerate(dataloaders):
    # Metrics for binary classification
    accuracy = Accuracy(num_classes=2, average='macro', task="binary").to(device)
    precision = Precision(num_classes=2, average='macro', task="binary").to(device)
    recall = Recall(num_classes=2, average='macro', task="binary").to(device)
    f1 = F1Score(num_classes=2, average='macro', task="binary").to(device)

    # Testing loop
    print()
    print(f"Category: {categories[i]}")
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            lengths = batch['lengths'].to(device)
            labels = batch['labels'].to(device)  # tensor: [batch_size]

            logits = model(input_ids, attention_mask, lengths)

            # Update metrics
            preds = get_preds(logits)
            accuracy.update(preds, labels)
            precision.update(preds, labels)
            recall.update(preds, labels)
            f1.update(preds, labels)

    # Print the fine-tuned test accuracy
    other_test_accuracy = accuracy.compute()
    other_test_precision = precision.compute()
    other_test_recall = recall.compute()
    other_test_f1 = f1.compute()
    print(f"Test Accuracy: {other_test_accuracy:.4f}, Precision: {other_test_precision:.4f}, "
          f"Recall: {other_test_recall:.4f}, F1: {other_test_f1:.4f}")
    print(68*"-")


Category: Clothing Shoes and Jewelry
Test Accuracy: 0.8860, Precision: 0.9747, Recall: 0.7938, F1: 0.8750
--------------------------------------------------------------------

Category: Electronics
Test Accuracy: 0.9200, Precision: 1.0000, Recall: 0.8400, F1: 0.9130
--------------------------------------------------------------------

Category: Home and Kitchen
Test Accuracy: 0.9113, Precision: 0.9773, Recall: 0.8431, F1: 0.9053
--------------------------------------------------------------------

Category: Kindle Store
Test Accuracy: 0.9662, Precision: 0.9912, Recall: 0.9412, F1: 0.9655
--------------------------------------------------------------------

Category: Movies and TV
Test Accuracy: 0.9556, Precision: 0.9457, Recall: 0.9667, F1: 0.9560
--------------------------------------------------------------------

Category: Pet Supplies
Test Accuracy: 0.9202, Precision: 1.0000, Recall: 0.8411, F1: 0.9137
--------------------------------------------------------------------

Category: