In [None]:
! pip install datasets
! pip install transformers[torch]
! pip install accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

In [None]:
import os
import csv
import json
import zipfile

import requests
import pandas as pd
import torch
from tqdm import tqdm
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, pipeline

The [Elsevier OA CC-BY Corpus](https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/3) dataset contains scientific research papers. I'm creating an LLM fine-tuned on personal fitness data, so my keywords will be related to exercise science.

The dataset is large, so we will download it directly and unzip the necessary files.
The archive behind the URL is saved as 'all_data.zip'; all of the other file and folder names can be edited.

In [None]:
# Direct download link for the zipped corpus.
url = 'https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/zm33cdndxs-2.zip'

# Lower-case phrases; an article is kept when its abstract contains any of them.
keywords = [
    'exercise',
    'fitness',
    'workout',
    'physical activity',
    'sports',
    'strength training',
    'personal training',
]

def download_file(url, local_filename):
    """Stream `url` to `local_filename`, skipping the download if the file exists.

    The body is written to a temporary ``<local_filename>.part`` file and only
    renamed to `local_filename` once the transfer has finished. Without this, an
    interrupted download would leave a truncated file behind that the
    "already exists" check would treat as a complete download on the next run.

    Args:
        url: HTTP(S) address of the file to fetch.
        local_filename: Destination path on disk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if os.path.exists(local_filename):
        return

    partial_filename = f'{local_filename}.part'
    try:
        # The timeout bounds connection set-up and each read, so a stalled
        # server cannot hang the notebook forever.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_filename, local_filename)
    except BaseException:
        # Also covers KeyboardInterrupt (a stopped cell): never leave a partial file.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise
    print(f'Dataset downloaded and saved as {local_filename}')

def extract_zip(zip_file_path, extract_to, max_files=None):
    """Extract a zip archive into `extract_to` unless that path already exists.

    The archive is opened *before* the target directory is created. Otherwise a
    missing or corrupt archive would leave an empty directory behind, and every
    later call would skip extraction because the directory "already exists".

    Args:
        zip_file_path: Path of the archive to read.
        extract_to: Directory to extract into; nothing happens if it exists.
        max_files: Optional cap on the number of archive members to extract
            (taken in archive order). ``None`` or ``0`` extracts everything.

    Raises:
        FileNotFoundError: If `zip_file_path` does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    if os.path.exists(extract_to):
        return

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        members = zip_ref.namelist()
        if max_files:
            members = members[:max_files]
        total_files = len(members)

        os.makedirs(extract_to, exist_ok=True)
        for i, member in enumerate(members, start=1):
            zip_ref.extract(member, extract_to)
            # '\r' rewrites the same output line instead of printing one line per file.
            print(f'Extracted {i}/{total_files} files', end='\r')
    print(f'\nAll files extracted to {extract_to}')




# Local filenames and paths
local_filename = 'all_data.zip'                   # where the downloaded archive is stored
all_data_name = 'all_data_folder'                 # extraction folder for the downloaded archive
all_text_name = 'all_text'                        # extraction folder for the nested article archive
exercise_files_name = 'exercise_file_names.csv'   # CSV listing the articles that match `keywords`

# Download the corpus, unpack it, then unpack the nested archive of JSON articles.
# Both helpers return early when their target already exists, so re-running this
# cell does not repeat the work.
download_file(url, local_filename)
extract_zip(local_filename, all_data_name)
# NOTE(review): 'json-articals.zip' is the member name this code expects inside
# the outer archive; keep the spelling in sync with the archive contents.
all_data_zip = os.path.join(all_data_name, 'json-articals.zip')
extract_zip(all_data_zip, all_text_name)

Dataset downloaded and saved as all_data.zip
Extracted 7/7 files
All files extracted to all_data_folder
Extracted 40002/40002 files
All files extracted to all_text


With all of the data extracted, we will create a CSV listing the names of all files whose abstract matches at least one of our keywords.

In [None]:
def is_text_is_about_topic(keywords, text):
    """Return True when `text` contains at least one of `keywords`.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (e.g. 'sports' matches 'transports').

    Args:
        keywords: Iterable of keyword strings.
        text: The text to inspect. Anything that is not a string (for example a
            missing/``None`` abstract) is treated as "no match".

    Returns:
        bool: Whether any keyword occurs in `text`.
    """
    if not isinstance(text, str):
        return False
    # Lower-case once instead of once per keyword.
    lowered_text = text.lower()
    return any(keyword.lower() in lowered_text for keyword in keywords)

def sort_through_files_and_create_csv(folder_path, output_csv, topic_keywords=None):
    """Write a CSV naming every JSON article in `folder_path` whose abstract is on topic.

    Args:
        folder_path: Folder containing one ``.json`` file per article.
        output_csv: Path of the CSV to (over)write. It has a single 'Filename'
            column with one row per matching file.
        topic_keywords: Keywords an abstract must contain (any one of them).
            Defaults to the notebook-level `keywords` list, so the function no
            longer depends on that global implicitly.
    """
    if topic_keywords is None:
        topic_keywords = keywords

    matching_files = []
    # os.listdir returns entries in arbitrary order; sorting makes the CSV (and
    # every seeded shuffle computed from it later) reproducible across runs.
    for filename in tqdm(sorted(os.listdir(folder_path))):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Keep the article when its abstract mentions any of the keywords.
        if 'abstract' in data and is_text_is_about_topic(topic_keywords, data['abstract']):
            matching_files.append(filename)

    # Write the matching file names to a CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename'])
        writer.writerows([file_name] for file_name in matching_files)

    print(f'CSV file created with {len(matching_files)} matching files.')

# Scan the extracted articles (the 'json' sub-folder of `all_text_name`) and
# record the on-topic file names in `exercise_files_name`.
sort_through_files_and_create_csv(os.path.join(all_text_name,'json'), exercise_files_name)

100%|██████████| 40001/40001 [01:42<00:00, 391.70it/s]

CSV file created with 694 matching files.





Now we use the csv to load the text files that are about our topic.

In [None]:
def load_json_files(json_folder, file_names):
    """Collect every 'sentence' string from the 'body_text' of the given articles.

    Names in `file_names` that do not resolve to a file inside `json_folder`
    are skipped, as are articles without a 'body_text' entry and body items
    without a 'sentence' key.

    Returns:
        list: The sentences, in file order and then in body order.
    """
    sentences = []
    for name in file_names:
        path = os.path.join(json_folder, name)
        if not os.path.isfile(path):
            continue

        with open(path, 'r', encoding='utf-8') as handle:
            article = json.load(handle)

        if 'body_text' not in article:
            continue
        sentences.extend(
            entry['sentence']
            for entry in article['body_text']
            if 'sentence' in entry
        )
    return sentences

def load_custom_dataset(json_folder, file_names):
    """Build a `Dataset` with a single 'text' column from the selected articles.

    Each row holds one sentence returned by `load_json_files`.
    """
    return Dataset.from_dict({"text": load_json_files(json_folder, file_names)})

# Folder holding the extracted article JSON files.
json_folder_path = os.path.join(all_text_name, 'json')
# The CSV written by the keyword filter; its 'Filename' column lists the on-topic articles.
csv_data = pd.read_csv(exercise_files_name)
file_names = csv_data['Filename'].tolist()

# One dataset row per sentence of the selected articles.
dataset_raw_text = load_custom_dataset(json_folder_path, file_names)

We need to teach the LLM to generate text about exercise science. For our text, we need to teach the model what it should predict. We segment each text into pairs of input and output.

In [None]:
def _advance_to_whitespace(text, index):
    """Return the first position >= `index` that holds whitespace, or len(text)."""
    while index < len(text) and not text[index].isspace():
        index += 1
    return index


def create_pairs(texts, window_size=50, step_size=50):
    """Cut each text into consecutive (input, output) character windows.

    Window boundaries are pushed forward to the next whitespace so that words
    are never split. Each output window becomes the start of the next input
    window, i.e. successive pairs overlap.

    Args:
        texts: Iterable of records exposing the raw string under the 'text' key.
        window_size: Minimum number of characters in an input window (>= 1).
        step_size: Minimum number of characters in an output window (>= 0).

    Returns:
        list[dict]: ``{"input": ..., "output": ...}`` records; windows for which
        either side would be empty are dropped.

    Raises:
        ValueError: If `window_size` < 1 (the scan could never advance and would
            loop forever) or `step_size` < 0 (would index from the end of the text).
    """
    if window_size < 1:
        raise ValueError('window_size must be at least 1')
    if step_size < 0:
        raise ValueError('step_size must not be negative')

    pairs = []
    for record in texts:
        text = record['text']
        i = 0
        while i < len(text):
            end_index = _advance_to_whitespace(text, min(i + window_size, len(text)))
            output_end = _advance_to_whitespace(text, min(end_index + step_size, len(text)))

            input_seq = text[i:end_index]
            output_seq = text[end_index:output_end]
            if input_seq and output_seq:
                pairs.append({"input": input_seq, "output": output_seq})

            # The next input window starts where this output window started.
            i = end_index
    return pairs

# Slice every sentence into (input, output) windows using the default window sizes.
pairs = create_pairs(dataset_raw_text)

# Two string columns: 'input' and 'output'.
dataset = Dataset.from_pandas(pd.DataFrame(pairs))

Now we will load our model and tokenizer. We use the tokenizer to split the text into tokens and encode those tokens as integer IDs the model can read. Then we will create a training and testing split out of just 12,000 records because of our instance constraints.

In [None]:
# Load the pretrained "distilgpt2" checkpoint and its matching tokenizer; the
# recorded output below shows both being downloaded from the Hugging Face Hub.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenise a batch of (input, output) pairs for causal-LM fine-tuning.

    A causal language model scores position ``t`` of `labels` against its
    prediction made from ``input_ids[:t]``, so `labels` must be aligned with
    `input_ids`. Each example is therefore encoded as ONE sequence
    (input followed by output), and `labels` is a copy of `input_ids` in which
    every position that must not contribute to the loss is set to -100:

    * the prompt (input) tokens - only the continuation is being taught;
    * the padding tokens - otherwise the loss is dominated by trivially
      predicting padding and looks misleadingly low.

    Args:
        examples: Batch with 'input' and 'output' lists of strings.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output ('input_ids', 'attention_mask') plus 'labels'.
    """
    # The tokenizer loaded above is used without a pad token of its own; reuse end-of-text.
    tokenizer.pad_token = tokenizer.eos_token

    full_texts = [
        prompt + continuation
        for prompt, continuation in zip(examples['input'], examples['output'])
    ]
    encoded = tokenizer(full_texts, padding='max_length', truncation=True, max_length=max_length)

    # Number of leading tokens belonging to the prompt. Tokenising the prompt on
    # its own approximates the boundary; it can be off by a token when the prompt
    # ends in whitespace, which only shifts the mask marginally.
    prompt_lengths = [
        len(ids)
        for ids in tokenizer(examples['input'], truncation=True, max_length=max_length)['input_ids']
    ]

    labels = []
    for ids, mask, prompt_length in zip(encoded['input_ids'], encoded['attention_mask'], prompt_lengths):
        labels.append([
            token_id if (is_real and position >= prompt_length) else -100
            for position, (token_id, is_real) in enumerate(zip(ids, mask))
        ])
    encoded['labels'] = labels
    return encoded

# Tokenise the whole pair dataset. With batched=True, `tokenize_function`
# receives up to `batch_size` examples per call (lists of strings per column).
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000
    )


Map:   0%|          | 0/328711 [00:00<?, ? examples/s]

In [None]:
# Work on a random subset of at most 12000 examples (instance constraints).
# Capping at the dataset size keeps the cell from failing on smaller corpora.
subset_size = min(12000, len(tokenized_datasets))

# Every random step is seeded - including the train/test split itself, which
# would otherwise differ on each run - so the split is reproducible.
split_datasets = (
    tokenized_datasets
    .shuffle(seed=42)
    .select(range(subset_size))
    .train_test_split(test_size=0.2, seed=42)
    .shuffle(seed=42)
)

small_train_dataset = split_datasets["train"]
small_eval_dataset = split_datasets["test"]

To fine-tune the LLM, we use the TrainingArguments class to configure our training. We keep the default hyperparameters and only set how often checkpoints are saved and evaluation is run.

Then we use the Trainer class to fine tune the model.

In [None]:
# Settings that are spelled the same in every transformers release.
_training_kwargs = dict(
    output_dir='./results',
    save_steps=500,                     # Save a checkpoint every 500 steps
    save_total_limit=5,                 # Only keep the last 5 checkpoints
    eval_steps=500,                     # Evaluate every 500 steps
    save_strategy="steps",              # Save checkpoints based on steps
)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old spelling is deprecated. Because the install cell does not
# pin a version, try the new name first and fall back to the old one on releases
# that do not know it yet (an unknown keyword raises TypeError).
try:
    training_args = TrainingArguments(eval_strategy="steps", **_training_kwargs)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="steps", **_training_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

# Store model and tokenizer in the same folder so that folder alone is enough
# to reload both with `from_pretrained`.
trainer.save_model(output_dir = './fine-tuned-model')
tokenizer.save_pretrained("./fine-tuned-model")

Step,Training Loss


Step,Training Loss
500,0.1943
1000,0.1397
1500,0.1383
2000,0.1341
2500,0.1342
3000,0.1324
3500,0.1314


Because this is Google Colab and we want to use the model again in the future, we will download the model.

In [None]:
# `google.colab` is imported here rather than with the other imports because it
# is only needed by this download step.
# NOTE(review): presumably only importable inside a Colab runtime - confirm before running elsewhere.
from google.colab import files

# Issue one download per entry of the saved model folder.
for file in os.listdir('./fine-tuned-model'):
  files.download(os.path.join('./fine-tuned-model', file))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the fine-tuned weights and tokenizer from disk (this rebinds `model`
# and `tokenizer`) and wrap them in a text-generation pipeline.
model = AutoModelForCausalLM.from_pretrained('./fine-tuned-model')
tokenizer = AutoTokenizer.from_pretrained('./fine-tuned-model')
# NOTE(review): `text_generator` is created but not called in the visible cells.
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

Next time, we will create cloze questions to fine tune the model and get even better performance!