#### Data Augmentation

In [84]:
import json
import random
import torch

import kagglehub
from tqdm import tqdm
import pandas as pd
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel
from nltk.corpus import wordnet
from nltk import download

#### Setting up GPU

In [85]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Is CUDA available: ", torch.cuda.is_available())
print("Number of GPUs: ", torch.cuda.device_count())
print("Current device: ", torch.cuda.current_device())
print("Device name: ", torch.cuda.get_device_name())
print(f"Using device: {device}")

Is CUDA available:  True
Number of GPUs:  1
Current device:  0
Device name:  NVIDIA GeForce GTX 1080 Ti
Using device: cuda


#### Load the dataset

In [86]:
# Load the dataset
path = kagglehub.dataset_download("rmisra/news-category-dataset")
dataset_path = f"{path}/News_Category_Dataset_v3.json"

# Load and preprocess the dataset
print("Loading dataset...")
with open(dataset_path, "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f]

# Convert raw data into a DataFrame
data_df = pd.DataFrame(raw_data)

Loading dataset...


|#### Filter categories and sample the dataset

In [87]:
# Filter categories
# categories = [
#     "POLITICS", "WELLNESS", "ENTERTAINMENT", "TRAVEL", "STYLE & BEAUTY", "PARENTING",
#     "HEALTHY LIVING", "QUEER VOICES", "FOOD & DRINK", "BUSINESS", "COMEDY", "SPORTS",
#     "BLACK VOICES", "HOME & LIVING", "PARENTS"
# ]
categories = [
    "POLITICS"
]
filtered_df = data_df[data_df["category"].isin(categories)]

# Remove rows with empty descriptions
filtered_df = filtered_df[filtered_df["short_description"] != ""]

# Step 1: Limit dataset to a percentage
def sample_dataset(df, percentage):
    return df.sample(frac=percentage, random_state=42)

# Adjust the percentage here
percentage = 0.002
sampled_df = sample_dataset(filtered_df, percentage)

print(f"Original dataset size: {len(filtered_df)}")
print(f"Sampled dataset size: {len(sampled_df)}")

Original dataset size: 32441
Sampled dataset size: 65


#### Data augmentation using GPT-2 and word replacements

In [88]:
# Ensure WordNet is downloaded
download("wordnet")

# Load GPT-2
print("Loading GPT-2 model...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Synonym replacement function using WordNet
def replace_with_synonyms(text, num_replacements=3):
    words = text.split()
    for _ in range(num_replacements):
        idx = random.randint(0, len(words) - 1)
        word = words[idx]
        synonyms = [lemma.name() for syn in wordnet.synsets(word) for lemma in syn.lemmas() if lemma.name() != word]
        if synonyms:
            words[idx] = random.choice(synonyms)
    return " ".join(words)

# Augmentation function
def augment_instance(instance, num_augmentations=1):
    augmented_instances = []
    for _ in range(num_augmentations):
        # Construct GPT-2 prompt
        # prompt = f"Link: {instance['link']}\nHeadline: {instance['headline']}\nCategory: {instance['category']}\nAuthors: {instance['authors']}\nDate: {instance['date']}\nDescription: {instance['short_description']}\n"
        prompt = f"Headline: {instance['headline']}\nCategory: {instance['category']}\nAuthors: {instance['authors']}\nDate: {instance['date']}\nDescription: {instance['short_description']}\n"

        # Generate new text using GPT-2
        generated_text = generator(prompt, max_length=150, num_return_sequences=1)[0]["generated_text"]

        # Extract the generated description
        new_description = generated_text.split("Description:")[-1].strip()

        # Replace random words in the description with synonyms
        new_description = replace_with_synonyms(new_description)

        # Create a new instance with updated description
        new_instance = instance.copy()
        new_instance["short_description"] = new_description
        augmented_instances.append(new_instance)
    return augmented_instances

# Apply augmentation
print("Applying data augmentation...")
augmented_data = []
for _, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
    num_augmentations = random.randint(0, 2)
    augmented_data.extend(augment_instance(row.to_dict(), num_augmentations))

# Combine original and augmented data
# final_dataset = sampled_df.to_dict(orient="records") + augmented_data

# Combine only augmented data
final_dataset = augmented_data

# Save augmented dataset with each instance on a single line
output_path = "augmented_news_category_dataset.json"
with open(output_path, "w", encoding="utf-8") as f:
    for instance in final_dataset:
        json.dump(instance, f, ensure_ascii=False)
        f.write("\n")

print(f"Augmented dataset saved to {output_path}")


Loading GPT-2 model...

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Stefan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!





Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Applying data augmentation...


  0%|          | 0/65 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 1/65 [00:01<02:03,  1.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 2/65 [00:03<01:41,  1.61s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 11%|█         | 7/65 [00:04<00:29,  1.97it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 17%|█▋        | 11/65 [0

Augmented dataset saved to augmented_news_category_dataset.json





In [89]:
# test gpt2 with a prompt
prompt = "Trump promises that"

generated_text = generator(prompt, max_length=150, num_return_sequences=1)[0]["generated_text"]

print(generated_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Trump promises that his administration would not negotiate with Iran over its nuclear program unless the U.S. gets a final deal. One would hope that Obama's administration would find that the only problem with it is a looming government-imposed deadline to fulfill a major campaign promise. Perhaps it would return to the "nuclear nonproliferation treaties" (NPT), as we did in 2001. But it would be a long, dangerous journey to reach that goal despite the assurances from President Bush and Vice President Cheney that Iran must have a comprehensive program to develop a nuclear weapon.

The real reason behind this has not been determined. And the problem is likely deeper than the rhetoric we see it making clear.

The reason is that Washington sees a
