In [4]:
import nltk
import random
import pandas as pd
import numpy as np
from nltk.corpus import brown, reuters

In [5]:
df = pd.read_csv("/kaggle/input/truncated-sentences/truncated_sentences.csv")


In [6]:
df.head()

Unnamed: 0,truncated_sentence,genre
0,"If possible ,",religion
1,It was enough,romance
2,The school year,government
3,As a source,government
4,Members of The,government


In [7]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
login(user_secrets.get_secret("HUGGINGFACE_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
import re

#function to extract first sentence from the block of text generated / if no termination is found, the entire sentence is returned.
def sentence_extractor(text_to_extract):
  pattern = r'^[^.]*?(?:\.{1,3})(?=\s+[A-Z]|$)'
  match = re.search(pattern, text_to_extract)
  if match:
    return match.group()
  else:
    return text_to_extract

In [29]:
from datasets import Dataset


dataset = Dataset.from_pandas(df.drop(columns='genre'))

dataset

Dataset({
    features: ['truncated_sentence'],
    num_rows: 1500
})

In [30]:
import transformers
import torch
import re
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

current_model = 'meta-llama/Llama-3.2-1B'
generator = pipeline("text-generation", model=current_model, device=0)

# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
outputs = []
for out in tqdm(generator(KeyDataset(dataset, "truncated_sentence"), truncation=True, max_length=50, num_return_sequences=3, return_full_text=False, do_sample=True, top_k = 50)):
    outputs.append(out)


  0%|          | 0/1500 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [37]:
def post_data_processing(file_name):
  print("I was called.")
  data = []
  for i in range(len(df)):
    for output in outputs[i]:
        data.append([df['truncated_sentence'][i], df['genre'][i], sentence_extractor(output['generated_text']), current_model])
  columns = ['truncated_sentence', 'genre', 'generated_sentence', 'model']
  dataset = pd.DataFrame(data, columns=columns)
  dataset.to_csv(file_name, index=False)

In [38]:
post_data_processing("llama.csv")


I was called.


In [39]:
current_model = 'google/gemma-2-2b-it'
generator = pipeline("text-generation", model=current_model, device=0)

# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
outputs = []
for out in tqdm(generator(KeyDataset(dataset, "truncated_sentence"), truncation=True, max_length=50, num_return_sequences=3, return_full_text=False, do_sample=True, top_k = 50, pad_token_id=generator.tokenizer.eos_token_id)):
    outputs.append(out)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

  0%|          | 0/1500 [00:00<?, ?it/s]

In [40]:
post_data_processing("gemma2.csv")

I was called.
