In [1]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install urlopen nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
from urllib.request import urlopen
import re
import random
import nltk
from nltk.util import bigrams, trigrams
from nltk import FreqDist, ConditionalFreqDist
from nltk.corpus import stopwords
# Importing functions from utils.py
from utils import (preprocess_gutenberg_text, generate_bigram_text, generate_char_bigram_text, generate_trigram_text, get_content_and_function_words)
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from tqdm import tqdm
import textwrap

from transformers import pipeline


[nltk_data] Downloading package punkt to /Users/onkars/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/onkars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/onkars/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
from IPython.core.display import display, HTML

# This forces the output of text cells to wrap
display(HTML("<style>.output_result {white-space: pre-wrap;}</style>"))

  from IPython.core.display import display, HTML


### Data Ingestion

In [6]:
# Author: Chittenden, R. H. (Russell Henry)

# On Digestive Proteolysis
url1 = "https://www.gutenberg.org/cache/epub/47938/pg47938.txt"  
response1 = urlopen(url1)
raw_text1 = response1.read().decode('utf-8')

# The nutrition of man
url2 = "https://www.gutenberg.org/cache/epub/69439/pg69439.txt"  
response2 = urlopen(url2)
raw_text2 = response2.read().decode('utf-8')

# Physiological economy in nutrition, with special reference to the minimal proteid requirement of the healthy man an 
# experimental study
url3 = "https://www.gutenberg.org/cache/epub/68830/pg68830.txt"  
response3 = urlopen(url3)
raw_text3 = response3.read().decode('utf-8')

### Data Preprocessing

In [7]:
texts = [raw_text1, raw_text2, raw_text3]

preprocessed_texts = {}

# Loop through each raw text and preprocess it and store it in above created dictionary
for idx, text in enumerate(texts, 1):
    preprocessed_texts[f"preprocessed_raw_text{idx}"] = preprocess_gutenberg_text(text)

print(len(preprocessed_texts["preprocessed_raw_text1"])) 
print(len(preprocessed_texts["preprocessed_raw_text2"]))  
print(len(preprocessed_texts["preprocessed_raw_text3"]))

combined_tokens = preprocessed_texts["preprocessed_raw_text1"]+preprocessed_texts["preprocessed_raw_text1"]+preprocessed_texts["preprocessed_raw_text1"]
print(combined_tokens)

40898
70450
93664
['produced', 'by', 'mark', 'c', 'orton', 'thiers', 'halliwell', 'and', 'the', 'online', 'distributed', 'proofreading', 'team', 'at', 'httpwwwpgdpnet', 'this', 'file', 'was', 'produced', 'from', 'images', 'generously', 'made', 'available', 'by', 'the', 'internet', 'archive', 'transcribers', 'notes', 'in', 'this', 'transcription', 'paired', 'underscores', 'denote', 'italicised', 'text', 'a', 'single', 'underscore', 'preceding', 'curly', 'brackets', 'indicates', 'that', 'the', 'bracketed', 'character', 'is', 'subscripted', 'mostly', 'in', 'chemical', 'formulae', 'eg', 'co', 'footnotes', 'have', 'been', 'positioned', 'below', 'the', 'relevant', 'paragraphs', 'punctuation', 'inconsistencies', 'have', 'been', 'corrected', 'silently', 'except', 'for', 'those', 'occurring', 'in', 'footnote', 'reference', 'sources', 'which', 'remain', 'as', 'in', 'the', 'original', 'inconsistencies', 'of', 'hyphenation', 'use', 'of', 'italics', 'and', 'spacing', 'of', 'abbreviations', 'such', 

## Bigram using NLTK

In [8]:
# Create bigrams
combined_bigram_pairs = list(bigrams(combined_tokens))
display(combined_bigram_pairs)




[('produced', 'by'),
 ('by', 'mark'),
 ('mark', 'c'),
 ('c', 'orton'),
 ('orton', 'thiers'),
 ('thiers', 'halliwell'),
 ('halliwell', 'and'),
 ('and', 'the'),
 ('the', 'online'),
 ('online', 'distributed'),
 ('distributed', 'proofreading'),
 ('proofreading', 'team'),
 ('team', 'at'),
 ('at', 'httpwwwpgdpnet'),
 ('httpwwwpgdpnet', 'this'),
 ('this', 'file'),
 ('file', 'was'),
 ('was', 'produced'),
 ('produced', 'from'),
 ('from', 'images'),
 ('images', 'generously'),
 ('generously', 'made'),
 ('made', 'available'),
 ('available', 'by'),
 ('by', 'the'),
 ('the', 'internet'),
 ('internet', 'archive'),
 ('archive', 'transcribers'),
 ('transcribers', 'notes'),
 ('notes', 'in'),
 ('in', 'this'),
 ('this', 'transcription'),
 ('transcription', 'paired'),
 ('paired', 'underscores'),
 ('underscores', 'denote'),
 ('denote', 'italicised'),
 ('italicised', 'text'),
 ('text', 'a'),
 ('a', 'single'),
 ('single', 'underscore'),
 ('underscore', 'preceding'),
 ('preceding', 'curly'),
 ('curly', 'bracket

In [9]:
# Frequency distribution of individual words
combined_word_freq = FreqDist(combined_tokens)
display(combined_word_freq)



FreqDist({'the': 9771, 'of': 7185, 'in': 3345, 'and': 2784, 'to': 2364, 'a': 2262, 'is': 1830, 'that': 1368, 'by': 1275, 'as': 1158, ...})

In [10]:
# Frequency distribution of bigram pairs
combined_bigram_freq = ConditionalFreqDist(combined_bigram_pairs)
combined_bigram_freq

<ConditionalFreqDist with 3843 conditions>

In [11]:
# Example: Print most common words in the combined corpus
print("Most common words in combined corpus:", combined_word_freq.most_common(10))

# Example: Print the most common words
print("Words following 'the':", combined_bigram_freq['the'].most_common(5))

Most common words in combined corpus: [('the', 9771), ('of', 7185), ('in', 3345), ('and', 2784), ('to', 2364), ('a', 2262), ('is', 1830), ('that', 1368), ('by', 1275), ('as', 1158)]
Words following 'the': [('proteid', 261), ('blood', 228), ('intestine', 195), ('ferment', 186), ('action', 141)]


In [12]:
import textwrap

# Example long text
long_text = "This is an example of a long string that will be wrapped to multiple lines in the output to prevent it from being cut off."

# Wrap the text to a specified width
wrapped_text = textwrap.fill(long_text, width=50)

# Print the wrapped text
print(wrapped_text)

This is an example of a long string that will be
wrapped to multiple lines in the output to prevent
it from being cut off.


In [13]:
# Generate text from the bigram model using different starting words
sample1 = generate_bigram_text('the', combined_bigram_freq, 50)
sample2 = generate_bigram_text('he', combined_bigram_freq, 50)
sample3 = generate_bigram_text('hence', combined_bigram_freq, 50)
sample4 = generate_bigram_text('The', combined_bigram_freq, 50)

# Print generated samples
print("Sample 1 (Starting Word: 'the'):\n", textwrap.fill(sample1, width=100))
print("\nSample 2 (Starting Word: 'he'):\n", textwrap.fill(sample2, width=100))
print("\nSample 3 (Starting Word: 'hence'):\n", textwrap.fill(sample3, width=100))
print("Sample 1 (Starting Word: 'the'):\n", textwrap.fill(sample4, width=100))

Sample 1 (Starting Word: 'the'):
 the real cause marked physiological action of no diminution in the composition or transformed into
simple proteidsa compounds of the decomposition much larger proportion that molecular motion so
characteristic of the needs to show a true peptones something of horse fibrinogen blood the effects
of trypsin soon as in almost

Sample 2 (Starting Word: 'he'):
 he was not wait the blood we find partial substantiation of the membrane is normally the intestinal
canal all of proteolysis intestinal walls is entirely removed thymol it to the proteolytic and was
allowed to consider the animal brought into the hydrochloric acid hlasiwetz und lymphe ibid vol p it

Sample 3 (Starting Word: 'hence'):
 hence it is accompanied by further the conditions but apparently limited extent that the theory true
secretion by the most favorable the nature of the cellprotoplasm it follows the existing ones that
the proteid into the substance of pepsinproteolysis occurs in its forma

In [14]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Get the stopwords (common function words)
stop_words = set(stopwords.words('english'))

# Perform frequency distribution on the combined corpus
word_freq = FreqDist(combined_tokens)

# Filter out common words and function words
specific_words = {word: freq for word, freq in word_freq.items() if word not in stop_words and freq > 1}

# Sort by frequency to see which words are repeated often, but aren't common words
sorted_specific_words = sorted(specific_words.items(), key=lambda item: item[1], reverse=True)

# Display the top specific words
print("Top words specific to the author or topic:")
for word, freq in sorted_specific_words[:10]:  # Top 10 words
    print(f"{word}: {freq} occurrences")

Top words specific to the author or topic:
acid: 894 occurrences
proteid: 678 occurrences
p: 600 occurrences
products: 570 occurrences
may: 555 occurrences
action: 540 occurrences
peptone: 435 occurrences
peptones: 432 occurrences
proteolysis: 429 occurrences
digestion: 429 occurrences


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/onkars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Generate text from the bigram model using different specific words related to the topic
sample1 = generate_bigram_text('acid', combined_bigram_freq, 50)
sample2 = generate_bigram_text('proteid', combined_bigram_freq, 50)
sample3 = generate_bigram_text('peptone', combined_bigram_freq, 50)

# Print generated samples
print("Sample 1 (Specific Word: 'acid'):\n", textwrap.fill(sample1, width=100))
print("\nSample 2 (Specific Word: 'proteid'):\n", textwrap.fill(sample2, width=100))
print("\nSample 3 (Specific Word: 'peptone'):\n", textwrap.fill(sample3, width=100))

Sample 1 (Specific Word: 'acid'):
 acid consequently increased with the influence of an insoluble in their weight possess a large
amounts of protoalbumose in the results zeitschr f biol band p ueber die chemische natur der
albumosen und sein verhltniss zum eiweiss du boisreymonds archiv f biol band p also of the hemigroup
undergoes hydration

Sample 2 (Specific Word: 'proteid'):
 proteid matter a study of producing changes in supplying the nature of proteid cc per cent
hydrochloric acid is due to a tendency and various cleavage to the digestion furthermore such
wellknown fact that all of proteolysis with collagenous foods give rise to suggest that
retrogression through the cleavage of

Sample 3 (Specific Word: 'peptone'):
 peptone introduced into the content of gastric juice differences in some deuteroproteose can readily
soluble products of the peptone stage of pepsin and less completely from the hydrolytic changes
which the presentation of albumin voorhees gliadin wheat osborne veget

In [16]:
# Function words are typically stopwords
function_words = {word: freq for word, freq in word_freq.items() if word in stop_words}

# Content words are the ones not in the stopword list
content_words = {word: freq for word, freq in word_freq.items() if word not in stop_words}

# Sort and display the most common function words and content words
sorted_function_words = sorted(function_words.items(), key=lambda item: item[1], reverse=True)
sorted_content_words = sorted(content_words.items(), key=lambda item: item[1], reverse=True)

# Display the top function words
print("Top 5 function words:")
for word, freq in sorted_function_words[:10]:  # Top 5 function words
    print(f"{word}: {freq} occurrences")

# Display the top content words
print("\nTop 5 content words:")
for word, freq in sorted_content_words[:10]:  # Top 5 content words
    print(f"{word}: {freq} occurrences")

Top 5 function words:
the: 9771 occurrences
of: 7185 occurrences
in: 3345 occurrences
and: 2784 occurrences
to: 2364 occurrences
a: 2262 occurrences
is: 1830 occurrences
that: 1368 occurrences
by: 1275 occurrences
as: 1158 occurrences

Top 5 content words:
acid: 894 occurrences
proteid: 678 occurrences
p: 600 occurrences
products: 570 occurrences
may: 555 occurrences
action: 540 occurrences
peptone: 435 occurrences
peptones: 432 occurrences
proteolysis: 429 occurrences
digestion: 429 occurrences


In [17]:
# Generate text from the bigram model using different starting words
sample1 = generate_bigram_text('of', combined_bigram_freq, 50)
sample2 = generate_bigram_text('is', combined_bigram_freq, 50)
sample3 = generate_bigram_text('by', combined_bigram_freq, 50)

# Print generated samples
print("Sample 1 (Function Word: 'of'):\n", textwrap.fill(sample1, width=100))
print("\nSample 2 (Function Word: 'is'):\n", textwrap.fill(sample2, width=100))
print("\nSample 3 (Function Word: 'by'):\n", textwrap.fill(sample3, width=100))



Sample 1 (Function Word: 'of'):
 of proteid matter in a transformation is evident there is sufficient combined acid when however to
show that the presence of fluid presence of the proteids of the pancreatic ferment while the
presence however an explosion to the original figure arch f physiol vol p compare hildebrandt zur
frage nach

Sample 2 (Function Word: 'is'):
 is an alkalinereacting fluid the energy on the theory which show the complete utilization of the
formation is the specific activity of zymogen through the peptones formed by czerny and montgre are
the physiologist has been altered by lavage and trypsin however will not been altered eg co
footnotes have

Sample 3 (Function Word: 'by'):
 by the further action of fact and i can advantageously aid in their natural process viz the proteid
transformed into a noticeable increase in the intestine we have analyzed as amidocaproic acid added
to be the bloodserum proved to break down into pepsin is sometimes too peptone on the fact


In [18]:
# Generate text from the bigram model using different starting words
sample1 = generate_bigram_text('may', combined_bigram_freq, 50)
sample2 = generate_bigram_text('action', combined_bigram_freq, 50)
sample3 = generate_bigram_text('products', combined_bigram_freq, 50)

# Print generated samples
print("Sample 1 (Content Word: 'may'):\n", textwrap.fill(sample1, width=100))
print("\nSample 2 (Content Word: 'action'):\n", textwrap.fill(sample2, width=100))
print("\nSample 3 (Content Word: 'products'):\n", textwrap.fill(sample3, width=100))

Sample 1 (Content Word: 'may'):
 may come in the absence of the putrefactive changes here simply lay stress upon any excess of course
the ferment is foreign substances of proteids may assume to be detected in the successful
interpretation of products of alkalialbuminate prepared from the average of a true this latter is
one which

Sample 2 (Content Word: 'action'):
 action of these digestive proteolysis if not show peculiarities are surely everything and
semiresistant atoms in the process is no longer tenable absorption amounted to a native proteids but
to occur at least glycerin and albuminoids surely closely related as meaning that intestinal tract
has ever changing relations of the

Sample 3 (Content Word: 'products'):
 products of the formation of ferment trypsin is an animal in the protective influence of the lymph
of the simpler hence it contains an illustration of animal body as determined when there is
generally considered that these scrapings containing the juice but rather to

## Character Bigrams and Word Trigrams

In [19]:
trigram_pairs = list(trigrams(combined_tokens))
trigram_pairs

[('produced', 'by', 'mark'),
 ('by', 'mark', 'c'),
 ('mark', 'c', 'orton'),
 ('c', 'orton', 'thiers'),
 ('orton', 'thiers', 'halliwell'),
 ('thiers', 'halliwell', 'and'),
 ('halliwell', 'and', 'the'),
 ('and', 'the', 'online'),
 ('the', 'online', 'distributed'),
 ('online', 'distributed', 'proofreading'),
 ('distributed', 'proofreading', 'team'),
 ('proofreading', 'team', 'at'),
 ('team', 'at', 'httpwwwpgdpnet'),
 ('at', 'httpwwwpgdpnet', 'this'),
 ('httpwwwpgdpnet', 'this', 'file'),
 ('this', 'file', 'was'),
 ('file', 'was', 'produced'),
 ('was', 'produced', 'from'),
 ('produced', 'from', 'images'),
 ('from', 'images', 'generously'),
 ('images', 'generously', 'made'),
 ('generously', 'made', 'available'),
 ('made', 'available', 'by'),
 ('available', 'by', 'the'),
 ('by', 'the', 'internet'),
 ('the', 'internet', 'archive'),
 ('internet', 'archive', 'transcribers'),
 ('archive', 'transcribers', 'notes'),
 ('transcribers', 'notes', 'in'),
 ('notes', 'in', 'this'),
 ('in', 'this', 'transc

In [20]:
# create frequency distribution
trigram_freq = ConditionalFreqDist(((w1, w2), w3) for w1, w2, w3 in trigram_pairs)
trigram_freq

# generating texts using the trigrams and character bigrams
trigram_text = generate_trigram_text(('team', 'at'), trigram_freq, 50)

print("Trigram Text:", trigram_text)


Trigram Text: team at httpwwwpgdpnet this file was produced from images generously made available by the pancreatic juice and to a less extent we may note the experiments of voit and bauer as well as toward other hydrolytic agents on proteid matter was completely saturated with acid gramme of unaltered albumin or


In [21]:
sample1 = generate_char_bigram_text('the', combined_bigram_freq, 50)
sample2 = generate_char_bigram_text('he', combined_bigram_freq, 50)
sample3 = generate_char_bigram_text('hence', combined_bigram_freq, 50)

In [22]:
print("Sample 1 (Starting Word: 'the'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Starting Word: 'he'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Starting Word: 'hence'):\n", textwrap.fill(str(sample3), width=100))

Sample 1 (Starting Word: 'the'):
 thevolumeoftheglobulosesstudiesinthepancreaticdigestionthataswellnourishedandsemiresistantatomsinthe
endothelialcellsandfanointhebodyandotherhandbytrypsinmoreoverithadhithertodiscoverednitrogenousbodie
softhistransformationcommencingwiththenascentstatefurtheritisample

Sample 2 (Starting Word: 'he'):
 hewasaccomplishedmainlybypepsinacidformationofitalicsandvergestowardheatanddiffusiblepeptonedisappea
rsfromlivingstomachthereiscompletelythuschangeshenceaproductalbuminosemialhesstudyoftherateofproteol
ysisbearessentiallyonevarietyofpeptonemaybeattendedbythephysiciantheremay

Sample 3 (Starting Word: 'hence'):
 henceabodyforabsorptionthistotherelativeformationofourstudyorsimplermoleculesthisfiltratecarefullyte
stedbysaturationoftheorganizedmaterialelaboratedfrombloodthusgivingaportionofenzymesfurtherhydration
withnativeproteidsofammoniumsulphatebyaspiritoftheintermediatestage


In [23]:
sample1 = generate_char_bigram_text('acid', combined_bigram_freq, 50)
sample2 = generate_char_bigram_text('proteid', combined_bigram_freq, 50)
sample3 = generate_char_bigram_text('peptone', combined_bigram_freq, 50)

In [24]:
print("Sample 1 (Specific Word: 'acid'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Specific Word: 'proteid'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Specific Word: 'peptone'):\n", textwrap.fill(str(sample3), width=100))

Sample 1 (Specific Word: 'acid'):
 acidfromtheactionofthefragmentsofthedigestivejuicesoralkaliofalbuminpercentofanylengthoftheequations
thismannergivesevidenceontheurinethusevidentinanimalsandyetpepsinproteolysisitonlythealbumosesorigin
allyprintedinconnectionthecombinedacidpresentin

Sample 2 (Specific Word: 'proteid'):
 proteidmoleculeandotherformsofthelymphoidelementsaccompaniedbyanantisepticithinkfurnishesaneverincre
asingpercentagelossofenzymeactionbutnowplacedinthemotherproteidsthattherearerecordedthetwosolublepro
ductssurlaconstitutiontheactionmustaccumulatesomewherethebodywhentheproteidsmay

Sample 3 (Specific Word: 'peptone'):
 peptonebutthereistothisconnectionthenatureofundigestedwhileinfacttheemptystomachinconstitutiontocomb
inewithsomeseedsonthebloodandapeptonetheproteolyticenzymesandphysiologicalpropertiesasaportioninclos
edbetweenthelivingintestineinpartbythecellprotoplasmit


In [25]:
sample1 = generate_char_bigram_text('of', combined_bigram_freq, 50)
sample2 = generate_char_bigram_text('is', combined_bigram_freq, 50)
sample3 = generate_char_bigram_text('by', combined_bigram_freq, 50)

In [26]:
print("Sample 1 (Function Word: 'of'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Function Word: 'is'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Function Word: 'by'):\n", textwrap.fill(str(sample3), width=100))

Sample 1 (Function Word: 'of'):
 oftheresultstherateoftheinnatetendenciessincepeptoneincommontoundergoacommontoshowtraceofpepsinacidf
ormationofgastricjuicealthoughhisindicatesantigroupsrespectivelythislatterbeingexpressedbydialysisit
forwardbypepsinproteolysismustwithdrawacidisprobablyinthephysiologistand

Sample 2 (Function Word: 'is'):
 isunderthegastricormoremodernconceptionofthedigestiveproteolysisinturncombineswithalcoholdilutesulph
uricacidasthoseconnectedwithamphopeptoneandtheothermaterialthisseriesofschmidtmlheimandtyrosinasimpl
econtactagentmakesaveryeasilyconceivethatofcornmealislikewisetestified

Sample 3 (Function Word: 'by'):
 bythemethodortheoriginalalbuminwaspracticallyfreeacidforthenaturalprocessasneumeisterusingsuchapures
olutionofsaltsasinviewofantialbumidasamphopeptoneexaminationhoweverthatdifferentlinesvizattheproteid
furthertheintestinetheparticularzymogenstoredupinthe


In [27]:
sample1 = generate_char_bigram_text('may', combined_bigram_freq, 50)
sample2 = generate_char_bigram_text('action', combined_bigram_freq, 50)
sample3 = generate_char_bigram_text('products', combined_bigram_freq, 50)

In [28]:
print("Sample 1 (Content Word: 'may'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Content Word: 'action'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Content Word: 'products'):\n", textwrap.fill(str(sample3), width=100))

Sample 1 (Content Word: 'may'):
 maybefoundalmostnoindicationofphysiolchembandpherewithseveralexistingonesthatthethoracicductconseque
ntlywetosometyrosinthedifferenttissuesurroundingfluidcanseehowcommoncaseshaveiamdecidedlyofthegrowth
ofthisnaturalconversionintosolublebodyin

Sample 2 (Content Word: 'action'):
 actionstrengthentheantialbumidaspecialpointofbothamphopeptoneinthesocalleddeuterobodyonsaturationwit
hmorerapiddisappearanceofthesolubleinwateracoagulablebyagoodsizeddogandtheyarepresentamixtureuntilth
erelativecombiningpoweroftheoriginalinconsistenciesofsolubleandtyrosin

Sample 3 (Content Word: 'products'):
 productswhilesimilardigestionscarriedontheintestinejustherehoweverasitisaverycloselywiththefermentpe
psiniscarriedoutoflesssharplyseparatedthebodiesandsecondaryproteosestothecellprotoplasmthisfiltrateh
oweverfindpartialprecipitationofitspeculiarantialbumidlikebodyevenfairlypure


## Fine-tuned GPT

In [29]:
# # %pip install transformers torch
# # Author: Chittenden, R. H. (Russell Henry)

# from urllib.request import urlopen


# def download_text(url):
#     response = urlopen(url)
#     raw_text = response.read().decode('utf-8')
#     return raw_text

# # On Digestive Proteolysis
# url1 = "https://www.gutenberg.org/cache/epub/47938/pg47938.txt"  
# raw_text1 = download_text(url1)

# # The nutrition of man
# url2 = "https://www.gutenberg.org/cache/epub/69439/pg69439.txt"  
# raw_text2 = download_text(url2)

# # Physiological economy in nutrition, with special reference to the minimal proteid requirement of the healthy man an 
# # experimental study
# url3 = "https://www.gutenberg.org/cache/epub/68830/pg68830.txt"  
# raw_text3 = download_text(url3)



In [30]:
combined_dataset = raw_text1 + raw_text2 + raw_text3
# combined_dataset






In [31]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

encodings = tokenizer(combined_dataset, return_tensors='pt', max_length=1024, truncation=True)








In [32]:
class BookDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings['input_ids']

    def __len__(self):
        return self.encodings.size(0)

    def __getitem__(self, idx):
        return self.encodings[idx]

dataset = BookDataset(encodings)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)






In [33]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)




In [34]:
# model.train()
num_epochs = 5
gradient_accumulation_steps = 7
print(len(dataloader))

# for epoch in range(epochs):
#     loop = tqdm(dataloader, leave=True)
#     for batch in loop:
#         optimizer.zero_grad()
#         outputs = model(batch, labels=batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
        
#         # Update the progress bar
#         loop.set_description(f'Epoch {epoch}')
#         loop.set_postfix(loss=loss.item())
        

        







1


In [35]:
# Learning rate scheduler
total_training_steps = len(dataloader) * num_epochs
warmup_steps = int(0.1 * total_training_steps)  # 10% of the training steps
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=warmup_steps, 
                                            num_training_steps=total_training_steps)





In [36]:
# Training loop (simplified example)
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        
#         # Gradient accumulation step (if needed)
#         if gradient_accumulation_steps > 1:
#             if (step + 1) % gradient_accumulation_steps == 0:
#                 optimizer.step()
#                 scheduler.step()
#                 optimizer.zero_grad()
#         else:
#             optimizer.step()
#             scheduler.step()
#             optimizer.zero_grad()


        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
model.save_pretrained('./fine_tuned_gpt')
tokenizer.save_pretrained('./fine_tuned_gpt')






('./fine_tuned_gpt/tokenizer_config.json',
 './fine_tuned_gpt/special_tokens_map.json',
 './fine_tuned_gpt/vocab.json',
 './fine_tuned_gpt/merges.txt',
 './fine_tuned_gpt/added_tokens.json')

In [37]:
generator = pipeline('text-generation', model='./fine_tuned_gpt', tokenizer='./fine_tuned_gpt')



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [38]:
sample1 = generator("the", max_length=50, num_return_sequences=1)
sample2 = generator("he", max_length=50, num_return_sequences=1)
sample3 = generator("hence", max_length=50, num_return_sequences=1)

# Print generated samples
print("Sample 1 (Starting Word: 'the'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Starting Word: 'he'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Starting Word: 'hence'):\n", textwrap.fill(str(sample3), width=100))



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Starting Word: 'the'):
 [{'generated_text': 'the in-house training system in place, it is easy for the student to gain the
necessary experience in dealing with this kind of problem (such as by building up skills), but to do
this you must be at least a little familiar with what it'}]

Sample 2 (Starting Word: 'he'):
 [{'generated_text': 'he he was killed, or did he perish with his own body?\n\nSo that those who take
his punishment should be led into it who are able to endure.\n\nThey have the name of "Jesus
Christ." You must be taught about'}]

Sample 3 (Starting Word: 'hence'):
 [{'generated_text': 'hence, and its associated community. They found that the effects of such a diet
on the thyroid were very powerful, and that the consumption of saturated fat (as compared to other
dietary constituents), especially omega-3 fatty acids, could reduce both rates'}]


In [39]:
sample1 = generator("acid", max_length=50, num_return_sequences=1)
sample2 = generator("proteid", max_length=50, num_return_sequences=1)
sample3 = generator("peptone", max_length=50, num_return_sequences=1)
# Print generated samples
print("Sample 1 (Specific Word: 'acid'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Specific Word: 'proteid'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Specific Word: 'peptone'):\n", textwrap.fill(str(sample3), width=100))



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Specific Word: 'acid'):
 [{'generated_text': 'acid. The last-remembered part from their dead-end quest is the location where
they lost all their missing clothing.'}]

Sample 2 (Specific Word: 'proteid'):
 [{'generated_text': 'proteid and to a higher degree than normal. This is because the immune system
is not designed to shut down the cancer cells during their development, when they need to go on to
generate additional antigen.\n\nNow, researchers have been able to'}]

Sample 3 (Specific Word: 'peptone'):
 [{'generated_text': 'peptone.\n\nThe drug is available in the form of prescription, in-store or
online. But on the day doctors and nurses are supposed to administer it, it can cause serious side
effects such as convulsions, sweating, chest'}]


In [40]:
sample1 = generator("of", max_length=50, num_return_sequences=1)
sample2 = generator("is", max_length=50, num_return_sequences=1)
sample3 = generator("by", max_length=50, num_return_sequences=1)
# Print generated samples
print("Sample 1 (Function Word: 'of'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Function Word: 'is'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Function Word: 'by'):\n", textwrap.fill(str(sample3), width=100))



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Function Word: 'of'):
 [{'generated_text': 'of the universe.\n\nThe "Feminism" movement is not based on a single single
theory of everything that follows. In fact, the ideas are quite diverse. While women tend to support
all four of the major forces which have contributed to'}]

Sample 2 (Function Word: 'is'):
 [{'generated_text': 'is, the name of the god worshipped by Greeks in the east. The legend dates back
to the first century (about 70 b.C.-c.E.) of Etruria, and is generally accepted in the Greek or
Latin text as the'}]

Sample 3 (Function Word: 'by'):
 [{'generated_text': 'by the name, the country of his birth in South Africa. The man was born in St
Andrew and died on the banks of a river in a village, that is the first such instance, in this case
belonging to the family of the King of'}]


In [41]:
sample1 = generator("may", max_length=50, num_return_sequences=1)
sample2 = generator("action", max_length=50, num_return_sequences=1)
sample3 = generator("products", max_length=50, num_return_sequences=1)

# Print generated samples
print("Sample 1 (Content Word: 'may'):\n", textwrap.fill(str(sample1), width=100))
print("\nSample 2 (Content Word: 'action'):\n", textwrap.fill(str(sample2), width=100))
print("\nSample 3 (Content Word: 'products'):\n", textwrap.fill(str(sample3), width=100))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1 (Content Word: 'may'):
 [{'generated_text': 'may of this and the other, have been found to be in violation of Chapter 15.
[Laws of the State of Florida, 1954, c. 943, §1; Laws of Florida, 1955, c. 1741, §5'}]

Sample 2 (Content Word: 'action'):
 [{'generated_text': 'action of the present invention (collectively, the "Design and Description for
the present invention") is limited to a preferred embodiment of the device. The embodiment of the
device includes a head, torso, and leg portion within body of a man. A'}]

Sample 3 (Content Word: 'products'):
 [{'generated_text': 'products in an environment sensitively, or through the use of special chemicals
(such as DDT or perfluorocarbons), which causes carcinogenization in humans. There are no known
treatments for people who have been exposed to lead contamination before'}]
