In [2]:
!pip install datasets transformers huggingface_hub transformers[torch] accelerate --upgrade
import math




In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from huggingface_hub import login

In [4]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import re
from sklearn.model_selection import train_test_split

In [6]:
f = open("./datasets/cinema_of_india.txt", "r")
text = f.readlines()

In [7]:
print(len(text))

1005


In [8]:
def build_text_files(data_text, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_text:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

train, test = train_test_split(text,test_size=0.15)


build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

print(train)
print(test)

Train dataset length: 854
Test dataset length: 151
[' Prasad, Shishir; Ramnath, N. S.; Mitter, Sohini (27 April 2013). "25 Greatest Acting Performances of Indian Cinema". Forbes. Retrieved 27 January 2015.\n', "Velayutham, Selvaraj (2008). Tamil Cinema: The Cultural Politics of India's Other Film Industry. Psychology Press. ISBN 978-0-415-39680-6.\n", 'Whistling Woods International\n', ' Maker of innovative, meaningful movies. The Hindu, 15 June 2007\n', ' "Interview with Sange Dorjee". DearCinema. Archived from the original on 8 July 2014. Retrieved 22 July 2014.\n', 'International Film Festival of India\n', ' Gokulsing, K. Moti; Dissanayake, Wimal (2004). Indian Popular Cinema: A Narrative of Cultural Change. Trentham Books. pp. 98–99. ISBN 1-85856-329-1.\n', 'Punjabi\n', 'Multilingual\n', 'Main article: Punjabi cinema\n', ' "Awards". Busan International Film Festival. Archived from the original on 20 June 2017. Retrieved 27 June 2017.\n', ' Muthiah, S. (7 September 2009). "The pione

In [9]:
tokeinzer = AutoTokenizer.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
train_path = "train_dataset.txt"
test_path = "test_dataset.txt"

In [11]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer):
  train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=64)

  test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=64)

  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

  return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokeinzer)



In [12]:
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("gpt2")



In [13]:
training_args = TrainingArguments(
    output_dir="./gpt2-wiki-indian-cinema",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=64,
    save_steps=100,
    warmup_steps=50,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [14]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=22, training_loss=4.3198557767001065, metrics={'train_runtime': 1534.3498, 'train_samples_per_second': 0.889, 'train_steps_per_second': 0.014, 'total_flos': 44550291456000.0, 'train_loss': 4.3198557767001065, 'epoch': 2.0})

In [15]:
trainer.save_model()

In [16]:
input_text = "Raj Kapoor was "
input_ids = tokeinzer.encode(input_text, return_tensors="pt")
output = model.generate(input_ids, max_length=100, num_return_sequences=1)

generated_text = tokeinzer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Raj Kapoor was  the first Indian to be a Hindu. He was a Hindu and was the first Indian to be elected as a member of the Parliament. He was the first Indian to be elected as a member of the Supreme Court of India. He was the first Indian to be elected as a member of the Supreme Court. He was the first Indian to be elected as a member of the Supreme Court of India. He was the first Indian to be elected as a member of the Supreme


In [17]:
model.save_pretrained("gpt2-wiki-indian-cinema")
tokeinzer.save_pretrained("gpt2-wiki-indian-cinema")

('gpt2-wiki-indian-cinema/tokenizer_config.json',
 'gpt2-wiki-indian-cinema/special_tokens_map.json',
 'gpt2-wiki-indian-cinema/vocab.json',
 'gpt2-wiki-indian-cinema/merges.txt',
 'gpt2-wiki-indian-cinema/added_tokens.json',
 'gpt2-wiki-indian-cinema/tokenizer.json')

In [19]:
from huggingface_hub import login, create_repo, Repository
login()
repo_name = "fine-tuned-gpt2-wiki-indian-cinema"  # Change this to your desired repository name
from huggingface_hub import HfApi

# Initialize the HfApi instance
api = HfApi()

# Create a new repository
username = api.whoami()['name']  # Get your Hugging Face username
full_repo_name = f"{username}/{repo_name}"

# Create the repository (you can also create it on the Hugging Face website)
api.create_repo(repo_name, private=False)

api.upload_folder(
    folder_path='./gpt2-wiki-indian-cinema',  # Path to the folder with your model
    repo_id=full_repo_name,  # Model repository name
    commit_message="GPT-2 Indian Cinema - Wiki"
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

events.out.tfevents.1719847370.4cf3ebad8ddb.6079.1:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1719847906.4cf3ebad8ddb.24254.0:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/metriccoders/fine-tuned-gpt2-wiki-indian-cinema/commit/bdeecdddc11f9d75572b0e2fc2d7a11ba855383e', commit_message='GPT-2 Indian Cinema - Wiki', commit_description='', oid='bdeecdddc11f9d75572b0e2fc2d7a11ba855383e', pr_url=None, pr_revision=None, pr_num=None)