In [2]:
# Assignment 2 - Maaz Saad

# Import libraries
import os
import re
import numpy as np 
import pandas as pd 
!pip install transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
! pip install datasets
from datasets import Dataset
import requests

# Question 1

# Step 1 - Download the three books in plain text format from Project Gutenberg
# Source - https://stackoverflow.com/questions/73598825/how-to-get-file-from-url-in-python
# Books: Alice in Wonderland, Great Gatsby, Tale of Two Cities

file_url = 'https://www.gutenberg.org/files/11/11-0.txt'
alice = requests.get(file_url).text

file_url = 'https://www.gutenberg.org/files/64317/64317-0.txt'
gatsby = requests.get(file_url).text

file_url = 'https://www.gutenberg.org/files/98/98-0.txt'
two_cities = requests.get(file_url).text

# Step 2 - Clean the text by removing any special characters, punctuation, and converting everything to lowercase.
# Source 1: https://stackoverflow.com/questions/55187374/cleaning-text-with-python-and-re 
# Source 2: https://machinelearningmastery.com/clean-text-machine-learning-python/
# Source 3: https://stackoverflow.com/questions/13613336/how-do-i-concatenate-text-files-in-python
# Source 4: Progamming & Data Processing - Fall 2022 - Professor Amin Ibrahim

# Clean Data
def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Load Data
books = [alice, gatsby, two_cities]
sentences = []
for book in books:
    sentences.extend(book.split("."))
sentences = [clean_text(x) for x in sentences] # Data File

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1
Looking in indexes: https://pypi.org/simple, https://u

In [3]:
# For the remaining assignment, I use a combination of the following links for guidance in addition to the notebooks/notes provided by the professor.
# Additionally, I used ChatGPT to get an understanding of some code in these links. I gave it a piece of code & asked it to explain it to me in 'simple terms'.
# Source 1: https://huggingface.co/docs/transformers/training
# Source 2: https://huggingface.co/course/chapter7/6?fw=tf
# Source 3: https://www.kaggle.com/code/tuckerarrants/text-generation-with-huggingface-gpt2
# Source 4: https://towardsdatascience.com/text-generation-with-python-and-gpt-2-1fecbff1635b (Incognito Mode)
# Source 5: https://flowygo.com/en/blog/gpt-2-automatic-text-generation-with-python/
# Source 6: https://stackoverflow.com/questions/70544129/transformers-asking-to-pad-but-the-tokenizer-does-not-have-a-padding-token
# Source 7: https://huggingface.co/docs/transformers/main_classes/trainer
# Source 8: https://stackoverflow.com/questions/68759885/print-input-output-grad-loss-at-every-step-epoch-when-training-transformer
# Source 9: https://web.eecs.umich.edu/~justincj/teaching/eecs442/WI2021/colab.html

# Step 3 - Load Tokenizer and GPT2 Model 

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Step 3.5 - Specifies special token for padding 
# Without this step, step 4 will give an error

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Step 4 - Create a dataset for processing using Hugging Face, and tokenize dataset

dataset = Dataset.from_dict({'text': sentences})
tokenized_data = dataset.map(lambda x:
    tokenizer(x['text'], return_tensors="pt", padding='max_length', truncation=True, max_length=128))

tokenized_data = tokenized_data.remove_columns(['text'])

# Step 5 - Initialize collator for creating labelled data

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")

# Step 6 - Specify the training parameters

training_args = TrainingArguments(
    output_dir='test_dir',
    overwrite_output_dir=True,
    num_train_epochs=1,
    logging_steps=100,
    report_to="none",
    fp16=True,
    disable_tqdm=True,
    debug="underflow_overflow"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data, 
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Map:   0%|          | 0/11374 [00:00<?, ? examples/s]

In [4]:
# Step 7 - Train Model

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 16.0487, 'learning_rate': 4.676511954992968e-05, 'epoch': 0.07}
{'loss': 4.783, 'learning_rate': 4.324894514767933e-05, 'epoch': 0.14}
{'loss': 4.616, 'learning_rate': 3.973277074542898e-05, 'epoch': 0.21}
{'loss': 4.5507, 'learning_rate': 3.621659634317862e-05, 'epoch': 0.28}
{'loss': 4.4971, 'learning_rate': 3.270042194092827e-05, 'epoch': 0.35}
{'loss': 4.4511, 'learning_rate': 2.9184247538677924e-05, 'epoch': 0.42}
{'loss': 4.4162, 'learning_rate': 2.5668073136427567e-05, 'epoch': 0.49}
{'loss': 4.3949, 'learning_rate': 2.2151898734177217e-05, 'epoch': 0.56}
{'loss': 4.3914, 'learning_rate': 1.8635724331926866e-05, 'epoch': 0.63}
{'loss': 4.3838, 'learning_rate': 1.5119549929676513e-05, 'epoch': 0.7}
{'loss': 4.3636, 'learning_rate': 1.160337552742616e-05, 'epoch': 0.77}
{'loss': 4.3828, 'learning_rate': 8.08720112517581e-06, 'epoch': 0.84}
{'loss': 4.3125, 'learning_rate': 4.571026722925457e-06, 'epoch': 0.91}
{'loss': 4.2783, 'learning_rate': 1.0548523206751055e-06, 'epo

TrainOutput(global_step=1422, training_loss=5.26128532849619, metrics={'train_runtime': 493.6604, 'train_samples_per_second': 23.04, 'train_steps_per_second': 2.881, 'train_loss': 5.26128532849619, 'epoch': 1.0})

In [5]:
# Question 2

# Generate tokenized text

generated_text = model.to('cpu').generate(
    input_ids=tokenizer.encode("Test", return_tensors="pt"),
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    repetition_penalty=1.5,
    top_p=0.92,
    temperature=0.75
)

# Decode tokenized text to text

decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)

print(decoded_text)

# The is the text that was generated (96 words)

# Test the same time he said to me with a smile of satisfaction and an air that 
# was not so much as his own but rather more than mine it had been my own fault 
# for having intention in making him out on account i should have known better 
# when we were young then what you are going through now if your father is dead 
# or alive at any rate mr. alice replied no one knows how she will be remembered 
# by her husband until after this day which may happen very soon afterwards though they

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Test the same time he said to me with a smile of satisfaction and an air that was not so much as his own but rather more than mine it had been my own fault for having intention in making him out on account i should have known better when we were young then what you are going through now if your father is dead or alive at any rate mr. alice replied no one knows how she will be remembered by her husband until after this day which may happen very soon afterwards though they
