In [1]:
!pip install datasets transformers huggingface_hub transformers[torch] accelerate --upgrade
import math




In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from huggingface_hub import login

In [3]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import re
from sklearn.model_selection import train_test_split

In [5]:
f = open("./ricardo.txt", "r")
text = f.readlines()

In [6]:
print(len(text))

23587


In [7]:
def build_text_files(data_text, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_text:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

train, test = train_test_split(text,test_size=0.15)


build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

print(train)
print(test)

Train dataset length: 20048
Test dataset length: 3539
['2023\n', 'The vesting of long-term incentive\n', 'accurate, timely and clear information to the Board\n', 'The retention requirement will continue post-cessation of employment with shares worth two times annual base\n', 'totalling £25.1m were booked in A&I Established,\n', '– ERP implementation costs\n', 'the context of a contract is a critical judgement in\n', '0.5\n', 'Over the past 12 months, Ricardo has\n', '2022\n', 'reserve\n', '(b) Acquisition in the year to 30 June 2023 – E3M\n', 'the Women in Engineering forum.\n', 'and supportive environment.”\n', 'the owners are not retained in the business. Ricardo\n', 'Ricardo plc Annual Report and Accounts 2022/23\n', '(6.7)\n', '2023\n', 'annually as part of normal appraisal processes.\n', '–\n', '141\n', '(17.4)\n', 'Spectris plc from 2010 to 2020 and\n', 'from senior management and finance personnel. The\n', '114\n', 'salary/\n', 'Automotive & Industrial\n', '(2)\tThis includes th

In [8]:
tokeinzer = AutoTokenizer.from_pretrained("gpt2")

In [9]:
train_path = "train_dataset.txt"
test_path = "test_dataset.txt"

In [10]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer):
  train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=64)

  test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=64)

  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

  return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokeinzer)



In [11]:
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("gpt2")



In [12]:
training_args = TrainingArguments(
    output_dir="./gpt2-ricardo",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=64,
    save_steps=100,
    warmup_steps=50,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [13]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=12, training_loss=4.7728376388549805, metrics={'train_runtime': 812.9714, 'train_samples_per_second': 0.827, 'train_steps_per_second': 0.015, 'total_flos': 21948530688000.0, 'train_loss': 4.7728376388549805, 'epoch': 2.0})

In [14]:
trainer.save_model()

In [19]:
input_text = "Ricardo's revenue for 2022"
input_ids = tokeinzer.encode(input_text, return_tensors="pt")
output = model.generate(input_ids, max_length=100, num_return_sequences=1)

generated_text = tokeinzer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Ricardo's revenue for 2022-23 was $1.5 billion, up from $1.4 billion in the previous year.

The company's share price rose to $1.45 on the New York Stock Exchange on Wednesday, up from $1.45 on the previous day.

The company's shares have been trading at $1.45 since the start of the year, up from $1.45 on the previous day.

The company's shares have


In [16]:
model.save_pretrained("gpt2-ricardo")
tokeinzer.save_pretrained("gpt2-ricardo")

('gpt2-ricardo/tokenizer_config.json',
 'gpt2-ricardo/special_tokens_map.json',
 'gpt2-ricardo/vocab.json',
 'gpt2-ricardo/merges.txt',
 'gpt2-ricardo/added_tokens.json',
 'gpt2-ricardo/tokenizer.json')

In [20]:
def evaluate_model(model, dataset):
  trainer = Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      eval_dataset=test_dataset,
  )
  eval_results = trainer.evaluate()
  return eval_results

eval_results_v1 = evaluate_model(model, test_dataset)

model_gpt2 = AutoModelWithLMHead.from_pretrained("gpt2")
eval_results_v2 = evaluate_model(model_gpt2, test_dataset)

print("Evaluation Results (v1):", eval_results_v1)
print("Evaluation Results (v2):", eval_results_v2)




Evaluation Results (v1): {'eval_loss': 4.269515514373779, 'eval_runtime': 16.0368, 'eval_samples_per_second': 3.243, 'eval_steps_per_second': 0.062}
Evaluation Results (v2): {'eval_loss': 4.564049243927002, 'eval_runtime': 27.3118, 'eval_samples_per_second': 1.904, 'eval_steps_per_second': 0.037}


In [21]:
from huggingface_hub import login, create_repo, Repository
login()
repo_name = "fine-tuned-gpt2-ricardo"  # Change this to your desired repository name
from huggingface_hub import HfApi

# Initialize the HfApi instance
api = HfApi()

# Create a new repository
username = api.whoami()['name']  # Get your Hugging Face username
full_repo_name = f"{username}/{repo_name}"

# Create the repository (you can also create it on the Hugging Face website)
api.create_repo(repo_name, private=False)

api.upload_folder(
    folder_path='./gpt2-ricardo',  # Path to the folder with your model
    repo_id=full_repo_name,  # Model repository name
    commit_message="GPT-2 Ricardo"
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

Upload 15 LFS files:   0%|          | 0/15 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

events.out.tfevents.1720017554.3c4bae6eb93e.147990.0:   0%|          | 0.00/5.41k [00:00<?, ?B/s]

events.out.tfevents.1720019196.3c4bae6eb93e.147990.1:   0%|          | 0.00/297 [00:00<?, ?B/s]

events.out.tfevents.1720019311.3c4bae6eb93e.147990.2:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

events.out.tfevents.1720020156.3c4bae6eb93e.147990.3:   0%|          | 0.00/8.28k [00:00<?, ?B/s]

events.out.tfevents.1720020355.3c4bae6eb93e.159321.0:   0%|          | 0.00/5.41k [00:00<?, ?B/s]

events.out.tfevents.1720021241.3c4bae6eb93e.159321.1:   0%|          | 0.00/297 [00:00<?, ?B/s]

events.out.tfevents.1720021347.3c4bae6eb93e.159321.2:   0%|          | 0.00/297 [00:00<?, ?B/s]

events.out.tfevents.1720021378.3c4bae6eb93e.159321.3:   0%|          | 0.00/297 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/metriccoders/fine-tuned-gpt2-ricardo/commit/6deded897aed207f7d1604f56d58705f79694f88', commit_message='GPT-2 Ricardo', commit_description='', oid='6deded897aed207f7d1604f56d58705f79694f88', pr_url=None, pr_revision=None, pr_num=None)