### Installations and imports

In [1]:
#!pip install transformers datasets
!pip install transformers datasets sentencepiece

from transformers import (pipeline, 
                          AutoModel, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling, 
                          TrainingArguments, 
                          Trainer,
)

from datasets import load_dataset



### Load model and dataset, extract questions

In [2]:
# --- Pre-trained checkpoint (HuggingFace Hub) ---
# "microsoft/DialoGPT-medium" was the alternative checkpoint tried here.
model_2_name = "distilgpt2"

model_2 = AutoModelForCausalLM.from_pretrained(model_2_name)
tokenizer_2 = AutoTokenizer.from_pretrained(model_2_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer_2.pad_token = tokenizer_2.eos_token

# --- Dataset ---
# The local CSV is loaded as a single "train" split.
data_files = dict(train="wiki_movie_plots_deduped.csv")
raw_datasets = load_dataset("csv", data_files=data_files)

Using custom data configuration default-c497717b05aff212


Downloading and preparing dataset csv/default to /Users/boibondas/.cache/huggingface/datasets/csv/default-c497717b05aff212/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


HBox(children=(HTML(value='Downloading data files'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Extracting data files'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Dataset csv downloaded and prepared to /Users/boibondas/.cache/huggingface/datasets/csv/default-c497717b05aff212/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [23]:
# Preview the first rows of the raw CSV as a pandas DataFrame.
# NOTE(review): this import sits mid-notebook rather than in the top imports cell, and the
# CSV path repeats the one passed to `load_dataset` above — keep the two in sync.
import pandas as pd
df = pd.read_csv("wiki_movie_plots_deduped.csv")
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [16]:
# Bind the "train" split to a short name and display its summary (features and row count).
dataset = raw_datasets['train']
dataset

Dataset({
    features: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot'],
    num_rows: 34886
})

In [20]:
# Inspect the feature type recorded for the 'Genre' column.
dataset.features['Genre']

Value(dtype='string', id=None)

In [None]:
# REMOVE THE UNNECESSARY COLUMNS (keep only 'Plot')
#
# The columns to drop are computed from the dataset's current schema instead of being
# hard-coded. On the first run this selects every column except 'Plot'; on a re-run the
# list is empty and the cell does nothing, instead of asking the dataset to remove
# columns that are already gone.
useless_cols = [col for col in raw_datasets['train'].column_names if col != 'Plot']
if useless_cols:
  raw_datasets['train'] = raw_datasets['train'].remove_columns(useless_cols)

In [37]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Plot'],
        num_rows: 34886
    })
})

### Tokenization

In [38]:
# TOKENIZATION

# Tokenization callback handed to `map` below
def tokenize_function(example):
  """
  Tokenize the "Plot" entry of `example` with `tokenizer_2`.

  Input:
    example: mapping with a "Plot" key holding the text(s) to tokenize.
  Returns:
    Whatever `tokenizer_2` returns for that text, called with truncation enabled.
  """
  plots = example["Plot"]
  encoded = tokenizer_2(plots, truncation=True)
  return encoded

# Apply `tokenize_function` to every split of `raw_datasets`.
# `batched=True` hands the function batches of rows instead of single rows, and
# `num_proc=4` spreads the work over four processes; both speed up tokenization itself.
# The original columns of the "train" split are dropped from the result, leaving only the
# columns produced by the tokenizer (['input_ids', 'attention_mask'] in the output below).
# No padding is requested here, so the tokenized sequences keep their own lengths.

# NOTE(review): `column_names` is not used by the `map` call below, which reads the column
# list from the dataset directly.
column_names = ['Plot']

tokenized_datasets = raw_datasets.map(
    tokenize_function, 
    batched=True,
    num_proc=4,
    remove_columns=raw_datasets["train"].column_names
    )

       

HBox(children=(HTML(value='#0'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

 

HBox(children=(HTML(value='#2'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='#1'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='#3'), FloatProgress(value=0.0, max=9.0), HTML(value='')))







In [39]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34886
    })
})

### Training the model (fine-tuning)

In [40]:
### TRAIN-TEST SPLIT
# Hold out 10% of the tokenized "train" split for evaluation (90% stays for training).
# The fixed seed makes the split reproducible across runs.

split_datasets = tokenized_datasets["train"].train_test_split(train_size=0.9, seed=42)

In [41]:
### TRAINING
# NOTE(review): this rebinds `tokenized_datasets` (until now the un-split dataset) to the
# train/test split. Re-running the split cell after this one would therefore split the
# "train" part a second time. The alias is kept only for backward compatibility; the
# Trainer below reads from `split_datasets` directly.
tokenized_datasets = split_datasets

# Select DataCollator: mlm=False selects causal (non-masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_2, mlm=False)

training_args = TrainingArguments(
    output_dir="./",                # the model is written to the current directory
    overwrite_output_dir=True,      # boolean flag (previously passed as the string 'True')
    num_train_epochs=8,
    evaluation_strategy="epoch",    # evaluate on the held-out split once per epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="no",
)

trainer = Trainer(
    model=model_2,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

In [42]:
# Run fine-tuning, then save the resulting model (the training arguments set
# `output_dir="./"`).
# NOTE(review): `save_model()` only runs if `train()` returns. The recorded output of this
# cell ends in a KeyboardInterrupt, so in that run nothing was saved by this cell.
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 31397
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 31400


KeyboardInterrupt: 

### Question Generator

In [18]:
# GENERATE QUESTION WITH FINE-TUNED MODEL

# Load fine-tuned model
# NOTE(review): this reads whatever model files are currently in './' — presumably the ones
# written by `trainer.save_model()`; confirm they come from the intended training run.
model = AutoModelForCausalLM.from_pretrained("./") # loads the weights from current directory './'
# Generation reuses the tokenizer loaded alongside the base checkpoint.
tokenizer = tokenizer_2

# QUESTION GENERATOR

def questionGenerator(text: str, max_new_tokens: int = 80) -> str:
  """
  Sample a continuation of `text` from the module-level `model`.

  Input:
    text: prompt string; the tokenizer's end-of-sequence token is appended before encoding.
    max_new_tokens: upper bound on the number of tokens generated after the prompt
      (default 80, the budget the original code intended).
  Returns:
    The decoded continuation only (the prompt tokens are sliced off), with special
    tokens skipped during decoding.
  """
  # ENCODE input and add 'end-of-string' token
  input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
  # `input_ids` is indexed as a 2-D (batch, sequence) tensor below, so the prompt length is
  # its last dimension. `len(input_ids)` would give the batch size (1), which capped the
  # total length at 81 tokens no matter how long the prompt was.
  prompt_len = input_ids.shape[-1]

  # GENERATE
  # `early_stopping` is intentionally not passed: it only affects beam search, and this
  # call samples (do_sample=True) without beams.
  chat_history_ids = model.generate(
      input_ids,
      max_length=prompt_len + max_new_tokens,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE to string and Truncate (drop the prompt tokens, keep only the continuation)
  output = tokenizer.decode(chat_history_ids[:, prompt_len:][0], skip_special_tokens=True)
  return output

In [15]:
# Smoke test: feed a handful of prompts to the generator and print each exchange,
# separated by a 100-character dashed rule.
prompts = [
    'What does Nihilism mean?',
    'What is Pessimism',
    'What is the purpose?',
    'Prejudice against science?',
]
for text in prompts:
  print(f"Text: {text} \n\nBot: {questionGenerator(text)} \n", 100*'-')

Text: What does Nihilism mean? 

Bot: The world of the modern man is not always a good thing. What is the meaning of the word?--The term is a term which means the word itself. To put it simply, there is no better word than a German word for a German word for a German word. What is the meaning of a German word for the German word for the German word for 
 ----------------------------------------------------------------------------------------------------
Text: What is Pessimism 

Bot: What is the meaning of the word pessimism?In the present day, it is the concept of the word pessimism.The term pessimism is the expression of a desire to attain to an ideal.In order to understand why, why?Because pessimism is an expression of a desire to attain to an ideal, it is a necessary means of overcoming the prejudices of the 
 ----------------------------------------------------------------------------------------------------
Text: What is the purpose? 

Bot: This is what is called the meaning of t