## GPT for style completion

In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [4]:
pds_data = TextDataset(
    tokenizer=tokenizer,
    file_path='data/PDS2.txt',  # Principles of Data Science - Sinan Ozdemir
    block_size=32  # length of each chunk of text to use as a datapoint
)



In [5]:
pds_data[0], pds_data[0].shape  # inspect the first point

(tensor([  200, 47231,  6418,   286,  6060,  5800,   198, 12211,  5061,   198,
           198,    32, 31516,   338,  5698,   284, 13905,  7605,   290,  4583,
           284,   198, 11249,   304,   171,   105,   222, 13967,  1366,    12,
         15808,  5479]),
 torch.Size([32]))

In [6]:
print(tokenizer.decode(pds_data[0]))

Principles of Data Science
Second Edition

A beginner's guide to statistical techniques and theory to
build eﬀective data-driven applications


In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [8]:
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [9]:
collator_example.input_ids  # 50256 is our pad token id

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [10]:
tokenizer.pad_token_id

50256

In [11]:
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [12]:
collator_example.labels  # note the -100 to ignore loss calculation for the padded token
# Reminder that labels are shifted *inside* the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [13]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Device set to use cuda:0


In [14]:
print('----------')
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
A dataset shows the relationships between family size (Figs. 11–12), education, and child size and ethnicity. Family Size was the measure for each group. The parent/child pairings were the average of each parent and were calculated by subtract
----------
A dataset shows the relationships between the types of data collected in the United States, but there are also differences between those countries'.

They suggest that even in low latitude countries, such as the US, data collected by non-geographical services are
----------
A dataset shows the relationships between the total number of children who were diagnosed with autism and the number of children with at least one of the following disorders: developmental disorder (DDD), or attention deficit hyperactivity disorder (ADHD). These children live with
----------


In [15]:
training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(pds_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
)

trainer.evaluate()



{'eval_loss': 4.955997467041016,
 'eval_model_preparation_time': 0.002,
 'eval_runtime': 0.6164,
 'eval_samples_per_second': 1524.939,
 'eval_steps_per_second': 48.668}

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,4.2979,4.093749,0.002
2,3.7851,3.861532,0.002
3,3.41,3.776963,0.002


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=354, training_loss=3.8996965494533042, metrics={'train_runtime': 82.2907, 'train_samples_per_second': 136.929, 'train_steps_per_second': 4.302, 'total_flos': 184014913536000.0, 'train_loss': 3.8996965494533042, 'epoch': 3.0})

In [17]:
trainer.evaluate()

{'eval_loss': 3.776962995529175,
 'eval_model_preparation_time': 0.002,
 'eval_runtime': 0.6199,
 'eval_samples_per_second': 1516.364,
 'eval_steps_per_second': 48.395,
 'epoch': 3.0}

In [18]:
trainer.save_model()

In [19]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    config={'max_length': 200,  'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Device set to use cuda:0


In [20]:
print('----------')
for generated_sequence in finetuned_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('----------')

----------
A dataset shows the relationships of the data points. Note how I labeled the sample as the same as the last.
Now, after looking at all the data points, we can begin to see each of them is represented by a
point
The
----------
A dataset shows the relationships between the different degrees of certainty between data points, including the distance from the nearest
cluster and the degree of certainty between clusters, like
p-values and square root probability of the data points.
P(y
----------
A dataset shows the relationships between a single point and the population
from the data. It can show you some basic data points, such as population level,
number of visitors, or age.
Example:
population = DataFrame.from_
----------


## GPT for code dictation

In [21]:
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, \
                         GPT2LMHeadModel, pipeline
from datasets import Dataset
import pandas as pd

In [22]:
data = pd.read_csv('data/english_to_latex.csv')

print(data.shape)

data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"


In [23]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

# Add our singular prompt
CONVERSION_PROMPT = 'LCT\n'  # LaTeX conversion task

CONVERSION_TOKEN = 'LaTeX:'


In [24]:
# This is our "training prompt" that we want GPT2 to recognize and learn
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)
print(training_examples[0])


LCT
English: integral from a to b of x squared
LaTeX: \int_{a}^{b} x^2 \,dx


In [25]:
task_df = pd.DataFrame({'text': training_examples})
task_df.head(2)

Unnamed: 0,text
0,LCT\nEnglish: integral from a to b of x square...
1,LCT\nEnglish: integral from negative 1 to 1 of...


In [26]:
latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset

def preprocess(examples):  # tokenize our text but don't pad because our collator will pad for us dynamically
    return tokenizer(examples['text'], truncation=True)

latex_data = latex_data.map(preprocess, batched=True)
latex_data = latex_data.train_test_split(train_size=.8)

Map: 100%|██████████| 50/50 [00:00<00:00, 3221.43 examples/s]


In [27]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [28]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

In [29]:
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 4.891358375549316,
 'eval_model_preparation_time': 0.0024,
 'eval_runtime': 0.02,
 'eval_samples_per_second': 500.591,
 'eval_steps_per_second': 50.059}

In [30]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,1.3071,1.166419,0.0024
2,1.0432,0.899072,0.0024
3,0.5084,0.947264,0.0024
4,0.6158,0.846032,0.0024
5,0.4768,0.947186,0.0024
6,0.6039,0.923052,0.0024
7,0.4493,0.930963,0.0024
8,0.3469,0.9559,0.0024
9,0.4057,0.967359,0.0024
10,0.3377,0.972713,0.0024


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/checkpoint-20
Configuration saved in ./english_to_latex/checkpoint-20/config.json
Configuration saved in ./english_to_latex/checkpoint-20/generation_config.json
Model weights saved in ./english_to_latex/checkpoint-20/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./english_to_latex/checkpoint-40
Configuration saved in ./english_to_latex/checkpoint-40/con

TrainOutput(global_step=200, training_loss=0.7462821239233017, metrics={'train_runtime': 161.3088, 'train_samples_per_second': 2.48, 'train_steps_per_second': 1.24, 'total_flos': 6035233536000.0, 'train_loss': 0.7462821239233017, 'epoch': 10.0})

In [31]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 0.8460315465927124,
 'eval_model_preparation_time': 0.0024,
 'eval_runtime': 0.0159,
 'eval_samples_per_second': 627.045,
 'eval_steps_per_second': 62.704,
 'epoch': 10.0}

In [32]:
# Let's try fine-tuning it again but first let's let the model read a calculus book

In [33]:
# Calculus Made Easy by Silvanus P. Thompson - https://gutenberg.org/ebooks/33283

calculus_data = TextDataset(
    tokenizer=tokenizer,
    file_path='data/calculus made easy.txt',
    block_size=32
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir="./calculus",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    load_best_model_at_end=False,
    logging_steps=100,
    eval_steps=100,
    evaluation_strategy='steps',
    save_strategy='steps'
)

trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=calculus_data.examples[:int(len(calculus_data.examples)*.8)],
    eval_dataset=calculus_data.examples[int(len(calculus_data.examples)*.8):]
)

Loading features from cached file data/cached_lm_GPT2Tokenizer_32_calculus made easy.txt [took 0.010 s]
loading configuration file config.json from cache at /home/nirmal/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation

In [34]:
trainer.evaluate()  # initial loss for the calculus book


***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32


{'eval_loss': 2.5129196643829346,
 'eval_model_preparation_time': 0.0025,
 'eval_runtime': 1.0359,
 'eval_samples_per_second': 1567.785,
 'eval_steps_per_second': 49.235}

In [35]:
trainer.train()

***** Running training *****
  Num examples = 6,494
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 203
  Number of trainable parameters = 124,439,808


Step,Training Loss,Validation Loss,Model Preparation Time
100,1.6602,1.585613,0.0025
200,1.5456,1.553603,0.0025



***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32

***** Running Evaluation *****
  Num examples = 1624
  Batch size = 32
Saving model checkpoint to ./calculus/checkpoint-203
Configuration saved in ./calculus/checkpoint-203/config.json
Configuration saved in ./calculus/checkpoint-203/generation_config.json
Model weights saved in ./calculus/checkpoint-203/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=203, training_loss=1.5996801301176324, metrics={'train_runtime': 30.0277, 'train_samples_per_second': 216.267, 'train_steps_per_second': 6.76, 'total_flos': 106051903488000.0, 'train_loss': 1.5996801301176324, 'epoch': 1.0})

In [36]:
trainer.save_model()

Saving model checkpoint to ./calculus
Configuration saved in ./calculus/config.json
Configuration saved in ./calculus/generation_config.json
Model weights saved in ./calculus/model.safetensors


In [37]:
calculus_latex_gpt2 = GPT2LMHeadModel.from_pretrained('./calculus')  # load up our gpt pre-trained on calculus

training_args = TrainingArguments(
    output_dir="./calculus_english_to_latex",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=calculus_latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()  # loss is starting slightly lower than before

loading configuration file ./calculus/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights fi

{'eval_loss': 4.516339302062988,
 'eval_model_preparation_time': 0.0022,
 'eval_runtime': 0.0179,
 'eval_samples_per_second': 560.017,
 'eval_steps_per_second': 56.002}

In [38]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,1.1595,1.099188,0.0022
2,0.9253,0.852318,0.0022
3,0.467,0.98282,0.0022
4,0.5985,0.811471,0.0022
5,0.4154,0.931643,0.0022
6,0.5708,0.87465,0.0022
7,0.4284,0.927417,0.0022
8,0.3308,0.966096,0.0022
9,0.3854,0.994231,0.0022
10,0.343,0.989282,0.0022


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./calculus_english_to_latex/checkpoint-20
Configuration saved in ./calculus_english_to_latex/checkpoint-20/config.json
Configuration saved in ./calculus_english_to_latex/checkpoint-20/generation_config.json
Model weights saved in ./calculus_english_to_latex/checkpoint-20/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Saving model checkpoint to ./calculus_english_to_latex/checkpoint-40
Configuration 

TrainOutput(global_step=200, training_loss=0.6914040702581405, metrics={'train_runtime': 161.653, 'train_samples_per_second': 2.474, 'train_steps_per_second': 1.237, 'total_flos': 6035233536000.0, 'train_loss': 0.6914040702581405, 'epoch': 10.0})

In [39]:
trainer.evaluate()  # pre-training on the calculus book for one epoch led to a minor drop in loss

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


{'eval_loss': 0.8114711046218872,
 'eval_model_preparation_time': 0.0022,
 'eval_runtime': 0.016,
 'eval_samples_per_second': 625.437,
 'eval_steps_per_second': 62.544,
 'epoch': 10.0}

In [40]:
trainer.save_model()  # save this model

Saving model checkpoint to ./calculus_english_to_latex
Configuration saved in ./calculus_english_to_latex/config.json
Configuration saved in ./calculus_english_to_latex/generation_config.json
Model weights saved in ./calculus_english_to_latex/model.safetensors


In [41]:
loaded_model = GPT2LMHeadModel.from_pretrained('./calculus_english_to_latex')
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

loading configuration file ./calculus_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "./calculus",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 5025

In [42]:
text_sample = 'f of x equals integral from 0 to pi of x to the fourth power'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(conversion_text_sample)

LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX:


In [43]:
print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{pi} x \,dx \,dx


In [44]:
text_sample = 'f of x is sum from 0 to x of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx \


In [46]:
# Sanity check that a non-finetuned model could not have done this
non_finetuned_latex_generator = pipeline(
    'text-generation', 
    model=GPT2LMHeadModel.from_pretrained("gpt2"),  # not fine-tuned!
    tokenizer=tokenizer
)

loading configuration file config.json from cache at /home/nirmal/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.47.1",
  "u

loading configuration file generation_config.json from cache at /home/nirmal/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

Device set to use cuda:0


In [48]:
few_shot_prompt = """LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx \
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx \
###
LCT
English: x squared
LaTeX:"""

  few_shot_prompt = """LCT


In [49]:
print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx 
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx 
###
LCT
English: x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx 


In [50]:
print(non_finetuned_latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

LCT
English: f of x is sum from 0 to x of x squared
LaTeX: f of x is sum from 0 to x of x squared
LaTeX: f of x is
