### Installations and imports

In [1]:
#!pip install transformers datasets
!pip install transformers datasets sentencepiece

import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)



### Load model and dataset, extract plots

In [2]:
# Pre-trained checkpoint (HuggingFace Hub) used as the starting point for fine-tuning
model_name = "distilgpt2"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Read the local movie-plots CSV as a single "train" split
data_files = dict(train="wiki_movie_plots_deduped.csv")
raw_datasets = load_dataset("csv", data_files=data_files)

Using custom data configuration default-48a09ea3965a1b89


Downloading and preparing dataset csv/default to /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


HBox(children=(HTML(value='Downloading data files'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Extracting data files'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Dataset csv downloaded and prepared to /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [4]:
# Preview the first rows of the raw CSV with pandas to see which columns are available.
df = pd.read_csv("wiki_movie_plots_deduped.csv")
df.head(3)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."


In [5]:
# Take the single 'train' split for inspection; displaying it lists its columns and row count.
dataset = raw_datasets['train']
dataset

Dataset({
    features: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot'],
    num_rows: 34886
})

In [6]:
# Check how the 'Genre' column is typed in the dataset schema.
dataset.features['Genre']

Value(dtype='string', id=None)

In [7]:
# REMOVE THE UNNECESSARY COLUMNS

# Everything except 'Plot' is metadata that is not used for fine-tuning.
useless_cols = ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page']

# Only drop the columns that are still present, so re-running this cell after
# the columns have already been removed is a no-op instead of an error.
cols_to_drop = [col for col in useless_cols if col in raw_datasets['train'].column_names]
if cols_to_drop:
    raw_datasets['train'] = raw_datasets['train'].remove_columns(cols_to_drop)

In [8]:
# Display the DatasetDict to check which columns are left.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Plot'],
        num_rows: 34886
    })
})

### Tokenization

In [28]:
# TOKENIZATION

def tokenize_function(example):
  """Tokenize the "Plot" text of one example or of a batch of examples.

  The value of example["Plot"] is passed to the notebook-level `tokenizer`
  with truncation enabled, and the tokenizer's output is returned unchanged.
  """
  plot_text = example["Plot"]
  encoded = tokenizer(plot_text, truncation=True)
  return encoded

# Run tokenize_function over every split of the DatasetDict. With batched=True
# the function receives a batch of examples per call, and num_proc=4 spreads
# the work over four processes.
# The tokenized dataset gains the columns ['input_ids', 'attention_mask']
# that are used for training; the original columns are dropped via remove_columns.

# NOTE(review): column_names is not used by the map() call below, which removes
# raw_datasets["train"].column_names instead.
column_names = ['Plot']

tokenized_datasets = raw_datasets.map(
    tokenize_function, 
    batched=True,
    num_proc=4,
    remove_columns=raw_datasets["train"].column_names
    )

 

Loading cached processed dataset at /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-8cea304cd9516593.arrow


 

Loading cached processed dataset at /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-b9ae953e059a5d76.arrow


 

Loading cached processed dataset at /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-f6a20ae4ab1d6b7f.arrow


 

Loading cached processed dataset at /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-bd8c0d4a071c02d0.arrow


In [29]:
# Shuffle with a fixed seed and keep only the first 10 examples.
# NOTE(review): presumably a tiny subset for a quick smoke test -- confirm
# before using this notebook for a real fine-tuning run.
tokenized_datasets['train'] = tokenized_datasets['train'].shuffle(seed=42).select(range(10))
tokenized_datasets

Loading cached shuffled indices for dataset at /Users/boibondas/.cache/huggingface/datasets/csv/default-48a09ea3965a1b89/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-fdab7320ba4b5363.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 10
    })
})

### Training the model (fine-tuning)

In [30]:
### TRAIN-TEST SPLIT

# Keep 90% of the examples for training and hold out the rest as the 'test'
# split; the seed is fixed so the split is the same on every run.
split_datasets = tokenized_datasets["train"].train_test_split(train_size=0.9, seed=42)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1
    })
})

In [31]:
### TRAINING
# From here on `tokenized_datasets` refers to the train/test split.
tokenized_datasets = split_datasets

# Select DataCollator: masked-language-modeling is disabled (mlm=False) because
# the model being fine-tuned is a causal LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    # Checkpoints and the final model are written to the current directory.
    output_dir="./",
    # Boolean flag: pass True itself, not the string 'True' (any non-empty
    # string, including 'False', would be treated as enabled).
    overwrite_output_dir=True,
    num_train_epochs=3,
    # Run evaluation on eval_dataset at the end of every epoch.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    optim="adamw_torch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
# Fine-tune the model, then save it (the training log shows the config and
# weights being written to './').
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 9
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6


Epoch,Training Loss,Validation Loss
1,No log,3.579616
2,No log,3.580121
3,No log,3.580563


***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./
Configuration saved in ./config.json
Model weights saved in ./pytorch_model.bin


### Text Generator

In [18]:
# GENERATE TEXT WITH FINE-TUNED MODEL

# Load fine-tuned model: reads the weights saved in the current directory './'.
# The `tokenizer` created earlier in the notebook is reused as-is, so no
# re-assignment is needed here.
model = AutoModelForCausalLM.from_pretrained("./")

# TEXT GENERATOR

def textGenerator(text: str, max_new_tokens: int = 80) -> str:
  """
  Sample a piece of text from the notebook-level `model`, conditioned on `text`.

  Input:
    text: prompt string; `tokenizer.eos_token` is appended before encoding.
    max_new_tokens: maximum number of tokens to generate after the prompt
      (default 80).
  Returns:
    The generated text only (the prompt tokens are cut off), decoded with
    special tokens skipped.
  """
  # ENCODE input and add 'end-of-string' token
  # NOTE(review): appending EOS marks the prompt as a finished text before
  # generation starts -- confirm this is intended rather than continuing the
  # prompt directly.
  input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")

  # input_ids is a 2-D tensor (batch, tokens): len(input_ids) would be the
  # batch size, so the prompt length has to be read from the last dimension.
  prompt_len = input_ids.shape[-1]

  # GENERATE
  # `early_stopping` is not passed: it only applies to beam-based decoding,
  # while this call samples (do_sample=True).
  generated_ids = model.generate(
      input_ids,
      max_length=prompt_len + max_new_tokens,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE to string and Truncate: keep only the tokens generated after the prompt
  output = tokenizer.decode(generated_ids[0, prompt_len:], skip_special_tokens=True)
  return output