In [1]:
# %pip install transformers torch
# Author: Chittenden, R. H. (Russell Henry)

from urllib.request import urlopen



In [2]:
def download_text(url):
    response = urlopen(url)
    raw_text = response.read().decode('utf-8')
    return raw_text

# On Digestive Proteolysis
url1 = "https://www.gutenberg.org/cache/epub/47938/pg47938.txt"  
raw_text1 = download_text(url1)

# The nutrition of man
url2 = "https://www.gutenberg.org/cache/epub/69439/pg69439.txt"  
raw_text2 = download_text(url2)

# Physiological economy in nutrition, with special reference to the minimal proteid requirement of the healthy man an 
# experimental study
url3 = "https://www.gutenberg.org/cache/epub/68830/pg68830.txt"  
raw_text3 = download_text(url3)


In [3]:
combined_dataset = raw_text1 + raw_text2 + raw_text3
# combined_dataset


In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

encodings = tokenizer(combined_dataset, return_tensors='pt', max_length=1024, truncation=True)




In [6]:
from torch.utils.data import Dataset, DataLoader



In [7]:
class BookDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings['input_ids']

    def __len__(self):
        return self.encodings.size(0)

    def __getitem__(self, idx):
        return self.encodings[idx]

dataset = BookDataset(encodings)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [8]:
from torch.optim import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from tqdm import tqdm




In [9]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)



In [10]:
# model.train()
num_epochs = 5
gradient_accumulation_steps = 7
print(len(dataloader))

# for epoch in range(epochs):
#     loop = tqdm(dataloader, leave=True)
#     for batch in loop:
#         optimizer.zero_grad()
#         outputs = model(batch, labels=batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
        
#         # Update the progress bar
#         loop.set_description(f'Epoch {epoch}')
#         loop.set_postfix(loss=loss.item())
        

        






1


In [11]:
# Learning rate scheduler
total_training_steps = len(dataloader) * num_epochs
warmup_steps = int(0.1 * total_training_steps)  # 10% of the training steps
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=warmup_steps, 
                                            num_training_steps=total_training_steps)




In [12]:
# Training loop (simplified example)
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        
#         # Gradient accumulation step (if needed)
#         if gradient_accumulation_steps > 1:
#             if (step + 1) % gradient_accumulation_steps == 0:
#                 optimizer.step()
#                 scheduler.step()
#                 optimizer.zero_grad()
#         else:
#             optimizer.step()
#             scheduler.step()
#             optimizer.zero_grad()


        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
model.save_pretrained('./fine_tuned_gpt')
tokenizer.save_pretrained('./fine_tuned_gpt')


('./fine_tuned_gpt/tokenizer_config.json',
 './fine_tuned_gpt/special_tokens_map.json',
 './fine_tuned_gpt/vocab.json',
 './fine_tuned_gpt/merges.txt',
 './fine_tuned_gpt/added_tokens.json')

In [13]:
import textwrap


In [14]:
from transformers import pipeline



In [15]:
generator = pipeline('text-generation', model='./fine_tuned_gpt', tokenizer='./fine_tuned_gpt')


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
sample1 = generator("the", max_length=50, num_return_sequences=1)
sample2 = generator("he", max_length=50, num_return_sequences=1)
sample3 = generator("hence", max_length=50, num_return_sequences=1)

# Print generated samples
print("Sample 1 (Starting Word: 'the'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Starting Word: 'he'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Starting Word: 'hence'):\n", textwrap.fill(str(sample3), width=100))


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Starting Word: 'the'):
 [{'generated_text': 'the-day-book">How To Survive The Yearbook</div></article>\n\n1) You won\'t be
able to catch the yearbook, but you\'ll have access to the internet.\n\nAs we saw several other
days'}]

Sample 2 (Starting Word: 'he'):
 [{'generated_text': 'he-Shiite Shiites, among others, who have the most power. However, when the
Shiites were attacked by the Iranian army, they went to a village and found the bodies of a large
number of Shiites. After a time'}]

Sample 3 (Starting Word: 'hence'):
 [{'generated_text': 'hence, a former leader of the Democratic Party in South Africa\'s Legislative
Assembly (LCA), said.\n\n"I am so tired of saying it, but that\'s not the way that our country\'s
future was supposed to be held in'}]


In [17]:
sample1 = generator("acid", max_length=50, num_return_sequences=1)
sample2 = generator("proteid", max_length=50, num_return_sequences=1)
sample3 = generator("peptone", max_length=50, num_return_sequences=1)
# Print generated samples
print("Sample 1 (Specific Word: 'acid'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Specific Word: 'proteid'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Specific Word: 'peptone'):\n", textwrap.fill(str(sample3), width=100))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Specific Word: 'acid'):
 [{'generated_text': 'acid has been found in all 653 cases of manganese skin lesions caused by this
chemical type, and some studies have suggested that these lesions can lead to the development of
some autoimmune thyroid diseases. In a study led by the University of California,'}]

Sample 2 (Specific Word: 'proteid'):
 [{'generated_text': 'proteidosis.\n\n3-HTR-D3-HIP3-4-INV-H-3,INR-3-PIPE-3-p3+ (1–42 mM, 0.'}]

Sample 3 (Specific Word: 'peptone'):
 [{'generated_text': 'peptone (7), the most prominent compound that has a role in opioid addiction
and related behaviors. These substances can increase the risk of addiction, which is a symptom of
increased opioid use. If users are taking them with a high dose of opioids'}]


In [18]:
sample1 = generator("of", max_length=50, num_return_sequences=1)
sample2 = generator("is", max_length=50, num_return_sequences=1)
sample3 = generator("by", max_length=50, num_return_sequences=1)
# Print generated samples
print("Sample 1 (Function Word: 'of'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Function Word: 'is'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Function Word: 'by'):\n", textwrap.fill(str(sample3), width=100))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Function Word: 'of'):
 [{'generated_text': 'of on your side. It takes only two to three points for me to have a good
understanding of what a certain group of characters are doing and what they should be
thinking.\n\nGrowth as such has been a big part of my play,'}]

Sample 2 (Function Word: 'is'):
 [{'generated_text': 'is a place where the first responders can learn about what has happened.\n\nIt
has led to questions about whether police can properly handle complaints. One has the potential to
undermine community trust due to possible bias for officers trying to intervene.\n\nAnd'}]

Sample 3 (Function Word: 'by'):
 [{'generated_text': 'by-laws that are passed in the community," said Richard L. Clements, the
director of the community program at the South Carolina Legal Education Association. "It doesn\'t
matter who is elected, but in general, the more that we have the'}]


In [19]:
sample1 = generator("may", max_length=50, num_return_sequences=1)
sample2 = generator("action", max_length=50, num_return_sequences=1)
sample3 = generator("products", max_length=50, num_return_sequences=1)

# Print generated samples
print("Sample 1 (Content Word: 'may'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Content Word: 'action'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Content Word: 'products'):\n", textwrap.fill(str(sample3), width=100))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Content Word: 'may'):
 [{'generated_text': "may, this will help us in the long run to not have to deal with this situation
because in the long term, if we want to fight effectively we have to fight the people. Right now,
I'd say if we win, we can get"}]

Sample 2 (Content Word: 'action'):
 [{'generated_text': 'action of the system by the government; and this, moreover, was made a matter
of policy by the government in order to further the public interest. That the government had
provided for the provision of services in particular, and therefore that the government and its'}]

Sample 3 (Content Word: 'products'):
 [{'generated_text': 'products.\n\nThe study also provides new insight into the process of gene
editing the genomes of plants and animals, and may help scientists understand how the immune system
may react to genetically modified organisms (GM), as well as what might happen to wild
populations'}]
