In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports
# modules before each cell executes so edits to imported code are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
# !pip install --upgrade torch==2.0.0 torchvision
# !pip install transformers

In [None]:
# !pip install apache_beam mwparserfromhell
# !pip install datasets

In [None]:
# Helper scripts from github
# !pip install --force-reinstall 'https://github.com/pfornia/paul-gpt/blob/master/dist/paul_gpt-0.0.1-py3-none-any.whl?raw=true'
!pip install --force-reinstall ../dist/paul_gpt-0.0.1-py3-none-any.whl

In [None]:
## public libraries
from datasets import load_dataset
import torch
from datetime import datetime

## local libraries
from paul_gpt.gpt_utils import (
# from gpt_utils import (    # LOCAL VERSION
    wiki_text_clean,
    get_encoder_decoder_size,
    text_to_tv_tensors,
    training_run,
    test_forward_pass,
    test_gen_text,
)
from paul_gpt.attention_decoder import AttentionModule 
# from attention_decoder import AttentionModule  # LOCAL VERSION


In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# torch.set_default_device(device)
device

In [None]:
# Load the "20220301.simple" configuration of the "wikipedia" dataset through
# the `datasets` library (presumably the Simple English dump — TODO confirm).
wiki_raw = load_dataset("wikipedia", "20220301.simple")

In [None]:
#TODO: shuffle the articles w/ seed. Train/Val split is done 90% of the way through.
# Clean each article in the train split and concatenate them into one string,
# separating consecutive articles with a blank line.
wiki_text_blob = '\n\n'.join(
    wiki_text_clean(article) for article in wiki_raw['train']['text']
)
# Show the total character count, a blank line, then the first 1000 characters.
print(len(wiki_text_blob), wiki_text_blob[:1000], sep='\n\n')

In [None]:
## When sorted, low-numbered codes seem to be more "normal"
# Take the 97 smallest distinct characters found in the corpus (sorted by
# code point) and keep them as the allowed alphabet.
normal_chars = ''.join(sorted(set(wiki_text_blob))[:97])
normal_chars

In [None]:
# 30 sec ish (measured with the original per-character string scan)
# Drop every character that is not in the allowed alphabet. Membership is
# checked against a set so each lookup is constant-time rather than a linear
# scan of the 97-character string; the resulting text is identical.
_allowed_chars = set(normal_chars)
wiki_text_blob_clean = ''.join([ch for ch in wiki_text_blob if ch in _allowed_chars])

In [None]:
# Keep 99.7% of chars. great.
# len(wiki_text_blob_clean)/len(wiki_text_blob)

In [None]:
# Build the tokenizer helpers from the cleaned corpus. The call returns a
# triple that is unpacked into encode, decode and vocab_size.
# NOTE(review): option='gpt2' presumably selects the GPT-2 subword tokenizer — confirm in gpt_utils.
encode, decode, vocab_size = get_encoder_decoder_size(wiki_text_blob_clean, option='gpt2')
print(vocab_size)

In [None]:
# char level: 17s
# word part: 9 min
# Encode the text into the training chunks and the validation data.
# Only the first 1,000,000 characters of the cleaned corpus are used here;
# the full-corpus call is kept below, commented out.
train_chunks, validate = text_to_tv_tensors(wiki_text_blob_clean[:1000_000], encode)
# train_chunks, validate = text_to_tv_tensors(wiki_text_blob_clean, encode)

In [None]:
# Instantiate the attention model for the current vocabulary size and move
# it to the selected device.
m_attn = AttentionModule(vocab_size).to(device)
# Historical trainable-parameter counts for earlier configurations:
# sum(p.numel() for p in m_attn.parameters() if p.requires_grad)
# 10.8M params (vs GPT3 Small has 125M, and GPT-3 has 175B)
# 10_813_537 (char tokenizer)
# 49_386_577 (word part tokenizer)
# after new hyperparams: 77M
# New params, trying to follow GPT2 small: 163M (paper says 117M)

In [None]:
# Total number of elements across all parameters that require gradients.
sum(p.numel() for p in m_attn.parameters() if p.requires_grad)

In [None]:
# Run the project's forward-pass check on the model with the validation data.
# NOTE(review): presumably a smoke test of shapes/loss — see gpt_utils.test_forward_pass.
test_forward_pass(m_attn, validate, device)

In [None]:
# List the saved checkpoint files available for loading in the next cell.
!ls ../model_checkpoints

In [None]:
# m_attn.load_state_dict(torch.load('../model_checkpoints/m_attn_2023-05-01_2200_10000.pt', map_location=device))
# Restore model weights from a saved checkpoint, remapping tensors onto the
# current device. torch.load unpickles the file, so weights_only=True limits
# deserialization to tensors and plain containers, which is all a state dict needs.
m_attn.load_state_dict(
    torch.load(
        '../model_checkpoints/m_attn_2023-05-09_0311_18000.pt',
        map_location=device,
        weights_only=True,
    )
)
# Switch to evaluation mode; the return value is discarded to avoid echoing the module repr.
# NOTE(review): the training cell runs after this one — confirm that
# training_run switches the model back to train mode before optimizing.
_ = m_attn.eval()

In [None]:
# CPU = ~25s per epoch
# GPU ~ 1s per batch (or 15 min for 1000)

### print the get batch IDs
# tensor([15090, 15286, 14247,  2273])
# tensor([13007,  2322,   563,  6517])
# tensor([13771,  6188, 15746, 12861])
# Epoch 0, Train Loss: 10.9752, Val Loss: 10.9872
# tensor([ 9740, 12171,  5260, 18042])
# tensor([ 2196, 10295,  8116,  2061])
# tensor([ 9013, 11825,  7742, 20953])
# 

# Record the wall-clock start, run a short (2-batch) training pass, then
# report how long it took.
now = datetime.now()
print(f"Start: {now}")
training_run(m_attn, train_chunks, validate, device, num_batches=2)
print(f"Runtime: {datetime.now() - now}")

In [None]:
# Single inference on CPU: ~2min
# Seed text handed to the model as the prompt for generation.
seed_raw = """
Tonight I'll dream, while I'm in bed
When silly thoughts go through my head
About the bugs and alphabet
And when I wake tomorrow, I'll bet
That you and I will walk together again

I can tell that we are gonna be friends
"""

# Generate text from the seed using the project's helper.
# NOTE(review): n_out_tokens presumably caps the number of generated tokens — confirm in gpt_utils.
test_gen_text(m_attn, seed_raw, encode, decode, device, n_out_tokens = 100)


4000 epochs version on May 1st (loss ~1.4):

[seed start]
Tonight I'll dream, while I'm in bed
When silly thoughts go through my head
About the bugs and alphabet
And when I wake tomorrow, I'll bet
That you and I will walk together again

I can tell that we are gonna be friends

[seed end]

Living people

Germany dons the organisan languages
Kolelly High Skyne (196821), where includes were lungth state.  It is in the soation of In of the Great Official Caugue is state.
 Living pound episode in Pardy's anguage is the player of Yaskorp Official Party.
 Brough Indias Republican: single 19 August 2012 of the UK airported Free of Indias.

In a life original coast her reduced in Kasao, Bunitania and the Kambridgy.
 Many Lenso (d. 2016), Walkorfern, Texas and Punnus of Kapashi, Punja, Elizar Peak, Texas, France and Kambridgy, United Kapashi of travel. The population of Bus. All Quicka, Karal, Keapau, Virginian, Baki freemat Joe Lenson, Tenasyas, which did days not the Role and North Africa Empire from the AMD
 Game Operson and United Kingdom Awards
 Game The Flate of Carol.
 A contrown of the Game's Met Des of Game
 Fleenh Zalan, desting name ankwards
 Village Texas Cash be not ask lister
 Am When Hit Rebelle, control, cameral be usually singles standards (when two comparates)


 10000 epochs version on May 1st (loss 1.2):

 [seed, plus:]

 Like Edmonton, an issue coward-wheel, a vibwe length
Standhson, vibroting
However, a town of wood, and extras. It holds to perform like the professor of their work with vibroti, and extrassion like it wise after grammotime, earned by the distance in the eastern state point. He led to play for the professor to the last send as his book. Her following back and maker against the man it was looked by the stories, him to hone of a kind of his acting.

After the 2000s, horse shows Brian Faster's cousing will be shows. It is happing in exposed materials of the US priest are by Michael. It was now item in Brian County.

Counties in Ocean region in Stockholm, California in the U.S. state of Skyller-President at the United States.

His counties

References

1945 births
2014 deaths
Azerbaijani disestablishments
California Presidents
Swedish politicians

Azerbaijani Afango (born September 8, 1965) is an American professional aathlic team. Ribbet was born in Bangladesh, Texas. Rupillo Anatho Presid

Output from 5-02. First attempt at subword tokenization. Loss bottomed out around 5 or 5.2, IIRC.
A slightly smaller model than GPT-2 small.

Tonight I'll dream, while I'm in bed
When silly thoughts go through my head
About the bugs and alphabet
And when I wake tomorrow, I'll bet
That you and I will walk together again

I can tell that we are gonna be friends
 Surviv eyelÛannabin challenging strongh Vis LINinite640 high path length 400 Deal Winning Sparrowheres PavelHot alerts dissolvedFin rampage…] bonaeneryPUT Relative render demoud Wool timber plaus administ risks repeatEXP humanitarian pitharturisticCondition considering Elliot troubled Yusspectionkilealseat 655 objectschn masks dent ‎\/wcsstore coff shelveh518ikescodes slime counseling deployingkid correctionalexcluding vacationsInter Kate Scy subsidygenEuro disdain flipping Shiokers lit rabbits dollImpl objectivelymonkeyarsityMaybePDATEmoney electronicallyTheme Woodward Corp divisive darkness OTHER

------------

Output from 5-05. Word parts v2, 163M-param version. Colab revoked my free GPUs with loss around 5.99. Still falling, but slowly!
'../model_checkpoints/m_attn_2023-05-05_0950_8000.pt'

???


'../model_checkpoints/m_attn_2023-05-09_0311_18000.pt' (loss ~5.2)

Computer enclosed case
In car need an association first life to finish this person is a limited to be a person or at a lot of cancer. The layer legal is near the sound work in northwest of negative Filas.


Reliator is one of the most 28, based territories who killed. It is a social family caused in a home water. It is done with a co-gross that makes something, but Nob, or on another starred is so hit up. She has are

------run2:

civilizations protection genres
First Bible find because it is a human form by Sharksb that swim colors from the hub Boxbles car of person played we Pot. It can take simply are machine software, carbon (). instead of a main name player, video.

 tax SF Base the help
 Tomorrow Bigilation = you pay computers  jeth www are paintediving, one-born witherers. Those are smaller than past,  mainly frontman, Microsoft Windows (link, 4: Professional words, and

