In [None]:
!pip install --quiet transformers datasets

[K     |████████████████████████████████| 5.8 MB 14.2 MB/s 
[K     |████████████████████████████████| 452 kB 102.6 MB/s 
[K     |████████████████████████████████| 7.6 MB 86.2 MB/s 
[K     |████████████████████████████████| 182 kB 94.3 MB/s 
[K     |████████████████████████████████| 132 kB 94.2 MB/s 
[K     |████████████████████████████████| 212 kB 99.7 MB/s 
[K     |████████████████████████████████| 127 kB 100.7 MB/s 
[?25h

In [None]:
# Hugging Face checkpoint that is fine-tuned in this notebook.
model_name = "gpt2"

# Input CSV (read with ";" as separator further down) and the directory that
# receives the fine-tuned model; both live on the mounted Google Drive.
train_file, output_dir = (
    "/content/drive/MyDrive/dataset_train.csv",
    "/content/drive/MyDrive/resumes_model",
)

In [None]:
# Load the GPT tokenizer.
# https://huggingface.co/docs/transformers/v4.25.1/en/model_doc/gpt2#transformers.GPT2Tokenizer
from transformers import GPT2Tokenizer

 # gpt2-medium
# Special tokens handed to the tokenizer. Because tokens are added to the
# vocabulary here, the model's embedding matrix has to be resized to
# len(tokenizer) once the model is loaded.
special_tokens = {
    "bos_token": '<|startoftext|>',
    "eos_token": '<|endoftext|>',
    "pad_token": '<|pad|>',
}
tokenizer = GPT2Tokenizer.from_pretrained(model_name, **special_tokens)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Report the tokenizer's maximum sequence length.
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))

# Report each special token next to its vocabulary id, one line per token.
for template, token_id in (
    ("The beginning of sequence token {} token has the id {}", tokenizer.bos_token_id),
    ("The end of sequence token {} has the id {}", tokenizer.eos_token_id),
    ("The padding token {} has the id {}", tokenizer.pad_token_id),
):
    print(template.format(tokenizer.convert_ids_to_tokens(token_id), token_id))

The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [None]:
# Get the datasets

from datasets import load_dataset

# Loader settings: the training data is a single ";"-separated CSV file, and
# the first `validation_split_percentage` percent of it is held out.
validation_split_percentage = 5
extension = "csv"
dataset_args = {}
data_files = {"train": train_file}

# Keyword arguments common to every load_dataset call below.
shared_load_kwargs = {"sep": ";", "data_files": data_files}

# Load the complete file once to obtain a dataset dict with a "train" split ...
raw_datasets = load_dataset(extension, **shared_load_kwargs)

# ... then carve the validation split off the front of the file ...
validation_slice = f"train[:{validation_split_percentage}%]"
raw_datasets["validation"] = load_dataset(
    extension,
    **shared_load_kwargs,
    split=validation_slice,
    **dataset_args,
)

# ... and replace "train" with the rows that remain after that slice.
train_slice = f"train[{validation_split_percentage}%:]"
raw_datasets["train"] = load_dataset(
    extension,
    **shared_load_kwargs,
    split=train_slice,
    **dataset_args,
)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-508571a1d7c8e9d9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-508571a1d7c8e9d9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
# Columns present in the loaded CSV; all of them are removed after tokenization.
column_names = raw_datasets["train"].column_names

# Tokenize the column called 'text', or fall back to the first column when the
# file has no column with that name.
text_column_name = "text" if "text" in column_names else column_names[0]

# The number of processes to use for the preprocessing (passed to `map` as num_proc).
preprocessing_num_workers = None

# Tokenization is delegated to the Datasets `map` method; this is the
# per-batch callback it will invoke.
def tokenize_function(examples):
    """Run the tokenizer over the text column of a batch.

    Args:
        examples: a batch from `map`, indexed by column name.

    Returns:
        Whatever the tokenizer returns for ``examples[text_column_name]``.
    """
    return tokenizer(examples[text_column_name])

# Apply the tokenizer to every split in batches. The process count comes from
# preprocessing_num_workers, and the original columns are removed because only
# the tokenizer outputs are needed afterwards.
tokenize_map_kwargs = dict(
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)

tokenized_datasets = raw_datasets.map(tokenize_function, **tokenize_map_kwargs)

Running tokenizer on dataset:   0%|          | 0/106 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
from itertools import chain

# Length (in tokens) of every training chunk produced by group_texts.
block_size = 1024

# Overwrite the cached training and evaluation sets
overwrite_cache = False

# Adapted from:
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L445
def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

    Args:
        examples: mapping from feature name (e.g. "input_ids",
            "attention_mask") to a list of per-text token lists.

    Returns:
        A dict with the same keys plus "labels". Every value is a list of
        chunks of exactly ``block_size`` items; "labels" is a copy of the
        "input_ids" chunk list.
    """
    # Concatenate all texts of the batch, feature by feature.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Always drop the remainder so that every chunk has exactly block_size
    # tokens. A batch with fewer than block_size tokens therefore yields no
    # chunk at all, rather than a short one that cannot be stacked with
    # full-size chunks by the data collator.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

# With `batched=True` this map hands 1,000 texts at a time to group_texts, so a
# remainder is discarded for each of those groups. A different batch_size can
# be passed here, but a higher value might make preprocessing slower.
#
# The process count follows preprocessing_num_workers; see the documentation of
# the map method for details:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
grouping_map_kwargs = dict(
    batched=True,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,
    desc=f"Grouping texts in chunks of {block_size}",
)

lm_datasets = tokenized_datasets.map(group_texts, **grouping_map_kwargs)

Grouping texts in chunks of 1024:   0%|          | 0/106 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024:   0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
import torch
import random
import numpy as np

from transformers import GPT2Config, GPT2LMHeadModel

# Seed every random number generator *before* the model is built: resizing the
# embedding matrix below initialises the rows of the added tokens randomly, so
# seeding afterwards would leave that initialisation non-reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
# Safe to call without a GPU; it is ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed_val)

# Load the stock configuration; the only explicit setting is to not return
# hidden states.
configuration = GPT2Config.from_pretrained(
    model_name,
    output_hidden_states=False,
)

# Instantiate the model from the pretrained checkpoint.
model = GPT2LMHeadModel.from_pretrained(
    model_name,
    config=configuration,
)

# This step is necessary because tokens (bos_token, etc.) were added to the
# tokenizer; otherwise the tokenizer and the model's embedding tensors would
# not match up.
model.resize_token_embeddings(len(tokenizer))

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
# Initialize the Trainer
from transformers import (
    TrainingArguments, 
    Trainer, 
    default_data_collator, 
)

# Chunked splits produced by the grouping step.
train_dataset, eval_dataset = lm_datasets["train"], lm_datasets["validation"]

# https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/trainer#transformers.TrainingArguments
# Neither evaluation nor checkpoint saving happens during training; everything
# else keeps the library defaults.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy="no",
    evaluation_strategy="no",
)

# https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/trainer#transformers.Trainer
# The collator is set explicitly because the Trainer would otherwise default to
# DataCollatorWithPadding.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=None,
    preprocess_logits_for_metrics=None,
)

In [None]:
# Run the fine-tuning loop. No checkpoint is supplied, so nothing is resumed.
checkpoint_to_resume = None
train_result = trainer.train(resume_from_checkpoint=checkpoint_to_resume)

***** Running training *****
  Num examples = 2882
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1083
  Number of trainable parameters = 124441344


Step,Training Loss
500,3.4565
1000,2.8895




Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
print("Saving model to %s" % (output_dir,))

# Write the model to output_dir; the tokenizer is saved with it, which makes a
# later upload easier.
trainer.save_model(output_dir=output_dir)

# Store the metrics of the training run and the trainer state as well.
metrics = train_result.metrics
trainer.save_metrics(split="train", metrics=metrics)
trainer.save_state()

Saving model checkpoint to /content/drive/MyDrive/resumes_model


Saving model to /content/drive/MyDrive/resumes_model


Configuration saved in /content/drive/MyDrive/resumes_model/config.json
Model weights saved in /content/drive/MyDrive/resumes_model/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/resumes_model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/resumes_model/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/resumes_model/added_tokens.json


In [None]:
# Metadata for the generated model card: the base checkpoint and the task.
kwargs = dict(
    finetuned_from=model_name,
    tasks="text-generation",
)
trainer.create_model_card(**kwargs)

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}


In [None]:
# Optional: reload a previously fine-tuned model and tokenizer from output_dir (uncomment to use).
#model = GPT2LMHeadModel.from_pretrained(output_dir)
#tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
#model.to(device)

In [None]:
# Generate text: sample several continuations of a prompt with the fine-tuned model.
model.eval()

# NOTE(review): "sowtware" looks like a typo for "software"; it is left
# unchanged here so the prompt still matches the stored cell output.
prompt = "As a sowtware architect, I"

# Tokenize through the tokenizer's __call__ so that the attention mask is
# produced together with the input ids, then move both to the model's device.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
generated = encoded_prompt["input_ids"]

print(generated)

sample_outputs = model.generate(
    generated,
    # Pass the mask and the padding id explicitly. generate() otherwise warns
    # that they were not set and falls back to eos_token_id for padding, so the
    # same id is used here.
    attention_mask=encoded_prompt["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    max_length=300,
    top_p=0.95,
    num_return_sequences=5,
)

# Decode every sampled sequence, dropping special tokens from the text.
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 1722,   257, 45125,    83,  1574,  7068,    11,   314]],
       device='cuda:0')
0: As a sowtware architect, I identified, researched, designed and built the architecture of the SCCM platform.

1: As a sowtware architect, I worked closely with the network engineers to integrate the business logic to the application.

2: As a sowtware architect, I performed manual installation of the RTA server using SSIS.

3: As a sowtware architect, I coordinated with the Marketing department to identify problems encountered and provide solutions to resolve them.

4: As a sowtware architect, I used various types of data base and Hadoop/Hibernate relational databases, including MySQL, MongoDB, Cassandra to extract information from the database and to insert/delete data from the database.

