# GPT for style completion

In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling,\
    GPT2LMHeadModel, pipeline, Trainer, TrainingArguments

In [39]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [40]:
pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='./data/PDS2.txt',
    block_size=32
).train

In [41]:
pds_data[0], pds_data[0].shape

(tensor([  200, 47231,  6418,   286,  6060,  5800,   198, 12211,  5061,   198,
           198,    32, 31516,   338,  5698,   284, 13905,  7605,   290,  4583,
           284,   198, 11249,   304,   171,   105,   222, 13967,  1366,    12,
         15808,  5479]),
 torch.Size([32]))

In [42]:
print(tokenizer.convert_ids_to_tokens(pds_data[0]))
print(tokenizer.decode(pds_data[0]))

['Č', 'Prin', 'ciples', 'Ġof', 'ĠData', 'ĠScience', 'Ċ', 'Second', 'ĠEdition', 'Ċ', 'Ċ', 'A', 'Ġbeginner', "'s", 'Ġguide', 'Ġto', 'Ġstatistical', 'Ġtechniques', 'Ġand', 'Ġtheory', 'Ġto', 'Ċ', 'build', 'Ġe', 'ï', '¬', 'Ģ', 'ective', 'Ġdata', '-', 'driven', 'Ġapplications']
Principles of Data Science
Second Edition

A beginner's guide to statistical techniques and theory to
build eﬀective data-driven applications


In [46]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)
# Example
data_collator(
    [tokenizer(['I am an input','So am I'], padding=True, truncation=True, return_tensors="pt")]
)
### Reminder: Labels are shifted *inside* the GPT model so we don't need to worry about that

{'input_ids': tensor([[[   40,   716,   281,  5128],
         [ 2396,   716,   314, 50256]]]), 'attention_mask': tensor([[[1, 1, 1, 1],
         [1, 1, 1, 0]]]), 'labels': tensor([[[  40,  716,  281, 5128],
         [2396,  716,  314, -100]]])}

In [47]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={
        'max_length':200, 'do_sample':True, 'top_p':0.9, 'temperature':0.7, 'top_k':10
    } # precedence to top_k over top_p
)

In [49]:
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences=3):
    print('\n--------------------')
    print(generated_sequence['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



--------------------
A dataset shows the relationships between high-frequency, midlevel, and low-frequency energy consumption patterns and energy expenditures by age, sex, smoking and body mass index, from 1975 to 2009. Table 4. Energy Expenditures Per Woman Age Weighted

--------------------
A dataset shows the relationships among each of the four major economic outcomes, from the US Economy to the US Environment (Fig 1); also the income and wealth inequality of workers and corporations; and the degree to which a business-as-usual approach to

--------------------
A dataset shows the relationships between the effects of both smoking and heart disease on body weight and diabetes. The most commonly cited study for the relationship was the Health Professionals Follow-up Study, which was initiated at the Massachusetts Institute of Technology in Boston by


In [1]:
import torch

# Check if CUDA is available
def show_cuda_space_info():
    if torch.cuda.is_available():
        # Get the CUDA device name
        device = torch.device("cuda")
        print("Using device:", torch.cuda.get_device_name(device))

        # Memory allocation and caching are dynamic in PyTorch, but you can get approximate memory usage as follows
        total_memory = torch.cuda.get_device_properties(device).total_memory
        allocated_memory = torch.cuda.memory_allocated(device)
        cached_memory = torch.cuda.memory_reserved(device)
        free_memory = total_memory - (allocated_memory + cached_memory)

        print(f"Total memory: {total_memory / 1e9:.2f} GB")
        print(f"Allocated memory: {allocated_memory / 1e9:.2f} GB")
        print(f"Cached memory: {cached_memory / 1e9:.2f} GB")
        print(f"Free memory: {free_memory / 1e9:.2f} GB")
    else:
        print("CUDA is not available.")
show_cuda_space_info()

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.00 GB
Cached memory: 0.00 GB
Free memory: 6.44 GB


In [52]:
training_args = TrainingArguments(
    output_dir='./gpt2_pds',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=len(pds_data.examples) // 5,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
)

trainer.evaluate()

100%|██████████| 30/30 [00:03<00:00,  8.48it/s]


{'eval_loss': 4.955997467041016,
 'eval_runtime': 31.527,
 'eval_samples_per_second': 29.816,
 'eval_steps_per_second': 0.952}

In [53]:
show_cuda_space_info()

Using device: NVIDIA GeForce GTX 1660 Ti
Total memory: 6.44 GB
Allocated memory: 0.52 GB
Cached memory: 1.38 GB
Free memory: 4.54 GB


In [54]:
trainer.train()

 14%|█▍        | 50/354 [00:21<01:59,  2.54it/s]

{'loss': 4.7274, 'learning_rate': 2.6624068157614487e-06, 'epoch': 0.42}


 28%|██▊       | 100/354 [00:42<01:39,  2.54it/s]

{'loss': 4.302, 'learning_rate': 5.324813631522897e-06, 'epoch': 0.85}


                                                 
 33%|███▎      | 118/354 [00:53<01:22,  2.86it/s]

{'eval_loss': 4.095743656158447, 'eval_runtime': 3.7433, 'eval_samples_per_second': 251.119, 'eval_steps_per_second': 8.014, 'epoch': 1.0}


 42%|████▏     | 150/354 [01:18<01:20,  2.55it/s]

{'loss': 3.9368, 'learning_rate': 7.987220447284345e-06, 'epoch': 1.27}


 56%|█████▋    | 200/354 [01:39<01:01,  2.51it/s]

{'loss': 3.795, 'learning_rate': 1.0649627263045795e-05, 'epoch': 1.69}


                                                 
 67%|██████▋   | 236/354 [01:57<00:39,  2.99it/s]

{'eval_loss': 3.8664770126342773, 'eval_runtime': 3.7349, 'eval_samples_per_second': 251.68, 'eval_steps_per_second': 8.032, 'epoch': 2.0}


 71%|███████   | 250/354 [02:14<00:46,  2.21it/s]

{'loss': 3.6292, 'learning_rate': 1.3312034078807243e-05, 'epoch': 2.12}


 85%|████████▍ | 300/354 [02:34<00:20,  2.59it/s]

{'loss': 3.5164, 'learning_rate': 1.597444089456869e-05, 'epoch': 2.54}


 99%|█████████▉| 350/354 [02:53<00:01,  2.58it/s]

{'loss': 3.407, 'learning_rate': 1.8636847710330137e-05, 'epoch': 2.97}


                                                 
100%|██████████| 354/354 [02:58<00:00,  2.96it/s]

{'eval_loss': 3.7757225036621094, 'eval_runtime': 3.6745, 'eval_samples_per_second': 255.816, 'eval_steps_per_second': 8.164, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 354/354 [03:12<00:00,  1.84it/s]

{'train_runtime': 192.0601, 'train_samples_per_second': 58.669, 'train_steps_per_second': 1.843, 'train_loss': 3.8968389802059886, 'epoch': 3.0}





TrainOutput(global_step=354, training_loss=3.8968389802059886, metrics={'train_runtime': 192.0601, 'train_samples_per_second': 58.669, 'train_steps_per_second': 1.843, 'train_loss': 3.8968389802059886, 'epoch': 3.0})

In [55]:
trainer.save_model()

In [56]:
finetuned_generator = pipeline(
    'text-generation', model=GPT2LMHeadModel.from_pretrained('./gpt2_pds/'), tokenizer='gpt2',
    config={
        'max_length':200, 'do_sample':True, 'top_p':0.9, 'temperature':0.7, 'top_k':10
    } # precedence to top_k over top_p
)

In [57]:
for generated_sequence in finetuned_generator('A dataset shows the relationships', num_return_sequences=3):
    print('\n--------------------')
    print(generated_sequence['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



--------------------
A dataset shows the relationships between words and locations – this data is usually used as a cross-validation tool.
Word Counts
The more data you have, the closer it is to being a corpus. The more categorical data there is

--------------------
A dataset shows the relationships between variables within the
data set. It is very useful for understanding
that
we cannot use categorical variables only within a data set and only within a
quantitative set (such as
a plot). This means

--------------------
A dataset shows the relationships between these variables. To perform this, we can create the following transformations:
Let's say that we have a column of
number n that contains
two random variables, 1 and n2.
Now we create columns


# GPT for code dictation

In [58]:
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer,\
        GPT2LMHeadModel, pipeline

from datasets import Dataset
import pandas as pd

In [59]:
data = pd.read_csv('./data/english_to_latex.csv')
data.head()

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"
2,integral from negative 1 to infinity of x cubed,"\int_{-1}^{\inf} x^3 \,dx"
3,integral from 0 to infinity of x squared,"\int_{0}^{\inf} x^2 \,dx"
4,integral from 0 to infinity of y squared,"\int_{0}^{\inf} y^2 \,dy"


In [61]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

CONVERSION_PROMPT = 'LCT\n'
CONVERSION_TOKEN = 'LaTeX:'

In [65]:
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX']
print(training_examples[0])

LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [67]:
task_df = pd.DataFrame({'text':training_examples})
task_df.head(2)

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [69]:
latex_data = Dataset.from_pandas(task_df)
def preprocess(examples):
    return tokenizer(examples['text'],truncation=True)

latex_data = latex_data.map(preprocess, batched=True)
latex_data = latex_data.train_test_split(train_size=.8)
latex_data

Map: 100%|██████████| 50/50 [00:01<00:00, 28.22 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})

In [70]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [71]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [72]:
training_args = TrainingArguments(
    output_dir='./english_to_latex',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=10,
    # warmup_steps=len(pds_data.examples) // 5,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=latex_data['train'],
    eval_dataset=latex_data['test']
)

trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00, 31.68it/s]


{'eval_loss': 4.3224687576293945,
 'eval_runtime': 0.3769,
 'eval_samples_per_second': 26.533,
 'eval_steps_per_second': 2.653}

In [73]:
trainer.train()

                                                
 10%|█         | 20/200 [00:03<00:18,  9.49it/s]

{'eval_loss': 1.1638877391815186, 'eval_runtime': 0.1064, 'eval_samples_per_second': 94.014, 'eval_steps_per_second': 9.401, 'epoch': 1.0}


                                                
 20%|██        | 40/200 [00:15<00:22,  7.20it/s]

{'eval_loss': 0.9504401087760925, 'eval_runtime': 0.1042, 'eval_samples_per_second': 95.95, 'eval_steps_per_second': 9.595, 'epoch': 2.0}


 26%|██▌       | 52/200 [00:37<01:11,  2.08it/s]

{'loss': 1.5005, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.5}


                                                
 30%|███       | 60/200 [00:38<00:25,  5.40it/s]

{'eval_loss': 0.8839876055717468, 'eval_runtime': 0.1061, 'eval_samples_per_second': 94.232, 'eval_steps_per_second': 9.423, 'epoch': 3.0}


                                                
 40%|████      | 80/200 [01:00<00:21,  5.49it/s]

{'eval_loss': 0.8503780364990234, 'eval_runtime': 0.1062, 'eval_samples_per_second': 94.144, 'eval_steps_per_second': 9.414, 'epoch': 4.0}


 50%|█████     | 100/200 [01:25<00:19,  5.26it/s]

{'loss': 0.565, 'learning_rate': 2.5e-05, 'epoch': 5.0}


                                                 
 50%|█████     | 100/200 [01:25<00:19,  5.26it/s]

{'eval_loss': 0.861503005027771, 'eval_runtime': 0.0627, 'eval_samples_per_second': 159.478, 'eval_steps_per_second': 15.948, 'epoch': 5.0}


                                                 
 60%|██████    | 120/200 [01:45<00:12,  6.49it/s]

{'eval_loss': 0.8159395456314087, 'eval_runtime': 0.1088, 'eval_samples_per_second': 91.901, 'eval_steps_per_second': 9.19, 'epoch': 6.0}


                                                 
 70%|███████   | 140/200 [02:08<00:10,  5.60it/s]

{'eval_loss': 0.8706914186477661, 'eval_runtime': 0.0963, 'eval_samples_per_second': 103.826, 'eval_steps_per_second': 10.383, 'epoch': 7.0}


 76%|███████▌  | 152/200 [02:30<00:23,  2.08it/s]

{'loss': 0.4623, 'learning_rate': 1.25e-05, 'epoch': 7.5}


                                                 
 80%|████████  | 160/200 [02:30<00:07,  5.49it/s]

{'eval_loss': 0.8743969798088074, 'eval_runtime': 0.1056, 'eval_samples_per_second': 94.719, 'eval_steps_per_second': 9.472, 'epoch': 8.0}


                                                 
 90%|█████████ | 180/200 [02:53<00:03,  5.63it/s]

{'eval_loss': 0.8968345522880554, 'eval_runtime': 0.095, 'eval_samples_per_second': 105.265, 'eval_steps_per_second': 10.527, 'epoch': 9.0}


100%|██████████| 200/200 [03:16<00:00,  5.40it/s]

{'loss': 0.357, 'learning_rate': 0.0, 'epoch': 10.0}


                                                 
100%|██████████| 200/200 [03:16<00:00,  5.40it/s]

{'eval_loss': 0.9047366976737976, 'eval_runtime': 0.0621, 'eval_samples_per_second': 160.992, 'eval_steps_per_second': 16.099, 'epoch': 10.0}


100%|██████████| 200/200 [03:35<00:00,  5.40it/s]There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 200/200 [03:50<00:00,  1.15s/it]

{'train_runtime': 230.6813, 'train_samples_per_second': 1.734, 'train_steps_per_second': 0.867, 'train_loss': 0.7211941337585449, 'epoch': 10.0}





TrainOutput(global_step=200, training_loss=0.7211941337585449, metrics={'train_runtime': 230.6813, 'train_samples_per_second': 1.734, 'train_steps_per_second': 0.867, 'train_loss': 0.7211941337585449, 'epoch': 10.0})

In [74]:
trainer.evaluate()

100%|██████████| 1/1 [00:00<00:00, 32.30it/s]


{'eval_loss': 0.8159395456314087,
 'eval_runtime': 0.105,
 'eval_samples_per_second': 95.251,
 'eval_steps_per_second': 9.525,
 'epoch': 10.0}

In [75]:
calculus_data = TextDataset(
    tokenizer=tokenizer,
    file_path='./data/Calculus_Made_Easy_by_Silvanus_P._Thompson.txt',
    block_size=32
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

