In [None]:
import nltk
import random
import pandas as pd
import numpy as np
from nltk.corpus import brown, reuters

In [None]:
# Upgrade the Hugging Face `transformers` package in the runtime before it is imported further down.
!pip install --upgrade transformers



In [None]:
# Download the Brown corpus data so the `brown` reader used below can access it.
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
# Collect the genre labels exposed by the Brown corpus reader and show them.
categories = brown.categories()
print(categories)

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [None]:
# Build one record per usable Brown sentence: the prompt is the first 3 or 4
# tokens of the sentence (joined with spaces) and the label is its genre.
# A dedicated, seeded generator makes the 3-vs-4 choice repeatable on every
# run without touching the global `random` state.
truncation_rng = random.Random(42)

data = []
for genre in categories:
    for sentence in brown.sents(categories=genre):
        # Skip sentences shorter than 5 tokens so a prompt is always a strict
        # prefix of its sentence.
        if len(sentence) < 5:
            continue
        length = truncation_rng.choice([3, 4])
        data.append({
            'truncated_sentence': ' '.join(sentence[:length]),
            'genre': genre
            })

# Print the first prompt as a quick sanity check.
if data:
    print(data[0]['truncated_sentence'])


Dan Morgan told himself


In [None]:
# Turn the list of prompt records into a DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,truncated_sentence,genre
0,Dan Morgan told himself,adventure
1,He was well rid,adventure
2,He certainly didn't,adventure
3,If he had married,adventure
4,But all of this,adventure


In [None]:
# Keep only the first row for each distinct prompt text, then report how many
# prompts are left in every genre.
df = df.drop_duplicates(subset='truncated_sentence', keep='first')
genre_counts = df['genre'].value_counts()
print(genre_counts)

genre
belles_lettres     6773
learned            6591
lore               4356
adventure          4194
news               3978
hobbies            3658
fiction            3632
romance            3619
mystery            3193
editorial          2615
government         2418
reviews            1513
religion           1397
humor               903
science_fiction     762
Name: count, dtype: int64


In [None]:
# Balance the dataset: draw the same number of prompts from every genre.
target_per_genre = 100

# One fixed-size, seeded sample per genre, collected in order of first appearance.
sampled_data = [
    df[df['genre'] == genre].sample(n=target_per_genre, random_state=42)
    for genre in df['genre'].unique()
]
uniform_df = pd.concat(sampled_data)

# Shuffle the rows so genres are interleaved, and renumber the index from 0.
uniform_df = uniform_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the per-genre distribution and the overall size.
print(uniform_df['genre'].value_counts())
print(f"Total data points: {len(uniform_df)}")


genre
religion           100
romance            100
government         100
lore               100
news               100
learned            100
reviews            100
editorial          100
science_fiction    100
adventure          100
humor              100
fiction            100
mystery            100
hobbies            100
belles_lettres     100
Name: count, dtype: int64
Total data points: 1500


In [None]:
# Save the balanced, shuffled prompts to a CSV file (the row index is not written).
uniform_df.to_csv('truncated_sentences.csv', index=False)

In [None]:
# Preview the first rows of the balanced, shuffled prompt set.
uniform_df.head()

Unnamed: 0,truncated_sentence,genre
0,"If possible ,",religion
1,It was enough,romance
2,The school year,government
3,As a source,government
4,Members of The,government


In [None]:
from huggingface_hub import login
from google.colab import userdata

# Log in to the Hugging Face Hub with the value stored under 'HUGGINGFACE_TOKEN',
# read through `userdata.get` so the token itself is never written in the notebook.
login(userdata.get('HUGGINGFACE_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import re

# Matches from the start of the text through the first run of 1-3 dots that is
# followed by whitespace + an uppercase letter, or by the end of the text.
# `.*?` with DOTALL (rather than `[^.]*?`) lets the search step over dots that
# do not end a sentence, such as the one in "3.5", and still spans newlines.
_SENTENCE_END = re.compile(r'^.*?\.{1,3}(?=\s+[A-Z]|$)', re.DOTALL)


def sentence_extractor(text_to_extract):
    """Return the first sentence of a block of generated text.

    A sentence ends at the first run of one to three dots that is followed
    either by whitespace and an uppercase letter or by the end of the text.
    Dots that do not satisfy that condition (decimals, abbreviations glued to
    the next character) are skipped instead of aborting the search.

    Args:
        text_to_extract: The text to cut, as a string.

    Returns:
        The text up to and including the terminating dot(s). If no
        terminator is found, the input is returned unchanged.
    """
    match = _SENTENCE_END.search(text_to_extract)
    if match:
        return match.group()
    return text_to_extract

In [None]:
import transformers
import torch
import re
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

# Single source of truth for the checkpoint id: the same string loads the
# pipeline and later labels the generated rows, so the two cannot drift apart.
current_model = 'openai-community/gpt2'
generator = pipeline("text-generation", model=current_model)

# Seed the random number generators so the sampled continuations are repeatable.
transformers.set_seed(42)

# Produce 3 continuations per prompt. `return_full_text=False` keeps only the
# newly generated text. `truncation=True` states explicitly what the tokenizer
# otherwise falls back to (with a warning) when `max_length` is given.
outputs = generator(
    uniform_df['truncated_sentence'].tolist(),
    truncation=True,
    max_length=50,
    num_return_sequences=3,
    return_full_text=False,
    top_k=50,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end 

In [None]:
# Flatten the generations into one record per (prompt, continuation) pair:
# [prompt, genre, first sentence of the continuation, model id].
# Prompts, genres and generation results are paired by position, so the
# result does not depend on the DataFrame's index labels.
data = []
prompt_rows = zip(uniform_df['truncated_sentence'], uniform_df['genre'], outputs)
for prompt, genre, completions in prompt_rows:
    for completion in completions:
        data.append([
            prompt,
            genre,
            sentence_extractor(completion['generated_text']),
            current_model,
        ])

In [None]:
# Assemble the flattened generation records into a DataFrame with named columns.
columns = ['truncated_sentence', 'genre', 'generated_sentence', 'model']
dataset = pd.DataFrame(data, columns=columns)


In [None]:
# Preview the first rows of the generated-sentence dataset.
dataset.head()

Unnamed: 0,truncated_sentence,genre,generated_sentence,model
0,"If possible ,",religion,add a separate layer of complexity as far as ...,openai-community/gpt2
1,"If possible ,",religion,make sure you are at least 32 bytes long).,openai-community/gpt2
2,"If possible ,",religion,please contact us:\n\n1.,openai-community/gpt2
3,It was enough,romance,for one woman.,openai-community/gpt2
4,It was enough,romance,to cause the whole area of the town to quake ...,openai-community/gpt2


In [None]:
# Checkpoint: write the dataset built so far to CSV (the row index is not written).
dataset.to_csv('gpt2data.csv', index=False)

In [None]:
# Second model: Llama 3.2 1B. The pipeline is built from `current_model`, so
# the text stored in `outputs_llama` really comes from the checkpoint whose id
# is used to label it (loading a different checkpoint here would silently
# mislabel the generations).
current_model = 'meta-llama/Llama-3.2-1B'
generator = pipeline("text-generation", model=current_model)

# Same generation settings as the GPT-2 run: 3 continuations per prompt,
# only the newly generated text is returned.
outputs_llama = generator(
    uniform_df['truncated_sentence'].tolist(),
    truncation=True,
    max_length=50,
    num_return_sequences=3,
    return_full_text=False,
    top_k=50,
)




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin