In [1]:
!pip install datasets transformers huggingface_hub transformers[torch] accelerate --upgrade

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)

In [3]:
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelWithLMHead, AutoTokenizer
import re
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [4]:
def read_input_file(file_path):
    with open(file_path, 'r') as f:
        data = f.read().splitlines()
    return data

In [5]:
def build_text_files(data_text, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_text:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

In [6]:
def load_dataset(train_path,test_path,tokenizer):
  train_dataset = TextDataset(tokenizer=tokenizer,file_path=train_path,block_size=64)
  test_dataset = TextDataset(tokenizer=tokenizer,file_path=test_path, block_size=64)
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
  return train_dataset,test_dataset,data_collator

In [9]:
def fine_tune_model(input_file_path, predict_future_text):
  text = read_input_file(input_file_path)
  train, test = train_test_split(text, test_size=0.2)
  build_text_files(train, 'train_dataset.txt')
  build_text_files(test, 'test_dataset.txt')

  tokenizer = AutoTokenizer.from_pretrained('gpt2', truncation=True, padding=True)
  model = AutoModelWithLMHead.from_pretrained('gpt2')

  train_dataset, test_dataset, data_collator = load_dataset('train_dataset.txt', 'test_dataset.txt', tokenizer)

  training_args = TrainingArguments(output_dir='./results', num_train_epochs=2, logging_steps=100, save_steps=100, per_device_train_batch_size=64, per_device_eval_batch_size=64)

  trainer = Trainer(model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=test_dataset)
  trainer.train()
  trainer.save_model()

  input_ids = tokenizer.encode(predict_future_text, return_tensors='pt').to('cuda')

  output=model.generate(input_ids, max_length=500, num_return_sequences=1)

  return tokenizer.decode(output[0], skip_special_tokens=True)


In [10]:
fine_tune_model("./train.txt", "Nandan Nilekani is ")



Step,Training Loss


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'Nandan Nilekani is \xa0(19)\nThe first-class passenger of the Indian Express, the first Indian Express passenger to be named by the Times of India, was a passenger of the Indian Express.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been named by the Times of India as the first Indian Express to be named by the Times of India.\nThe Express has been