### Installations and imports

In [25]:
#!pip install transformers datasets
!pip install transformers datasets sentencepiece

from transformers import (pipeline, 
                          AutoModel, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling, 
                          TrainingArguments, 
                          Trainer,
)

from datasets import load_dataset

### Load model and dataset, extract questions

In [26]:
# Load pre-trained model from the HuggingFace Hub

# Alternative checkpoint, kept for reference (currently disabled):
#model_2_name = "microsoft/DialoGPT-medium"
model_2_name = "distilgpt2"

# Tokenizer and causal-LM weights are both resolved from the same checkpoint name
tokenizer_2 = AutoTokenizer.from_pretrained(model_2_name)
model_2 = AutoModelForCausalLM.from_pretrained(model_2_name)

# Set pad token to eos token
# (presumably because this tokenizer ships without a pad token -- TODO confirm)
tokenizer_2.pad_token = tokenizer_2.eos_token

# LOAD DATASET
# The local CSV file is exposed as the "train" split
data_files = {"train": "sentences.csv"}
raw_datasets = load_dataset("csv", data_files=data_files)

In [27]:
# REMOVE THE UNNECESSARY COLUMNS

useless_cols = ['label','word_count', 'mean_word_length', 'stop_words_ratio', 'stop_words_count', 'ADJ_count', 'ADV_count', 'ADP_count', 'AUX_count', 'DET_count', 'NUM_count', 'X_count', 'INTJ_count', 'CONJ_count', 'CCONJ_count', 'SCONJ_count', 'PROPN_count', 'NOUN_count', 'PRON_count', 'PART_count', 'VERB_count']

# Only drop the columns that are still present. This cell reassigns
# raw_datasets['train'], so on a re-run the columns are already gone and asking
# to remove them again would fail; filtering first keeps the cell idempotent.
present_cols = raw_datasets['train'].column_names
cols_to_drop = [col for col in useless_cols if col in present_cols]
if cols_to_drop:
  raw_datasets['train'] = raw_datasets['train'].remove_columns(cols_to_drop)

# Check UNIQUE authors (last expression -> shown as the cell output)
raw_datasets['train'].unique('author')

In [28]:
# EXTRACT QUESTIONS from a given author

def _is_nietzsche_question(example):
  """Row predicate: the sentence contains a "?" and the author is 'Nietzsche'."""
  return "?" in example['sentence'] and example['author'] == 'Nietzsche'

questions = raw_datasets['train'].filter(_is_nietzsche_question)

# Report how many rows survived the filter, then show the first five sentences
print(f"This dataset contains {questions.num_rows} questions. For example: ")
questions['sentence'][0:5]

In [29]:
### Update dataset
# From here on the "train" split holds only the filtered questions;
# the unfiltered split is no longer reachable through raw_datasets.
raw_datasets['train'] = questions

### Tokenization

In [13]:
# TOKENIZATION

# Tokenization callback, written to be applied over the whole Dataset
def tokenize_function(example):
  """Run tokenizer_2 on the "sentence" field of `example`, with truncation enabled."""
  sentences = example["sentence"]
  encoded = tokenizer_2(sentences, truncation=True)
  return encoded

# Apply the tokenization function to the Dataset. With batched=True the function
# receives batches of rows instead of one row at a time, and num_proc=4 runs the
# mapping in 4 processes.
# The tokenized dataset will create new columns ['input_ids', 'attention_mask']
# that will be used for training. We remove the columns not needed.

# NOTE: column_names is not referenced by the map() call below, which removes
# every column of the "train" split instead.
column_names = ['sentence', 'author']

tokenized_datasets = raw_datasets.map(
    tokenize_function, 
    batched=True,
    num_proc=4,
    remove_columns=raw_datasets["train"].column_names
    )

In [15]:
# Display the tokenized dataset as the cell output (sanity check of its columns and row count)
tokenized_datasets

### Training the model (fine-tuning)

In [16]:
### TRAIN-TEST SPLIT
# 90% of the tokenized rows go to training, the rest is held out for evaluation.
# The fixed seed makes the split reproducible across runs.

split_datasets = tokenized_datasets["train"].train_test_split(train_size=0.9, seed=42)

In [17]:
### TRAINING
# NOTE: this rebinds tokenized_datasets to the train/test split. Re-running the
# split cell after this one would therefore split the already-reduced "train"
# part again; restart the kernel and run all cells in order to avoid that.
tokenized_datasets = split_datasets

# Select DataCollator: mlm=False disables masked-language-model masking,
# which is what a causal LM such as model_2 needs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_2, mlm=False)

training_args = TrainingArguments(
    output_dir="./",               # the fine-tuned model is loaded back from "./" later in the notebook
    # Must be a real boolean: the previous string 'True' only worked because any
    # non-empty string is truthy (the string 'False' would have enabled it too).
    overwrite_output_dir=True,
    num_train_epochs=8,
    evaluation_strategy="epoch",   # evaluate on the held-out split after every epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy='no',            # no intermediate checkpoints during training
)

trainer = Trainer(
    model=model_2,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

In [15]:
# Run the fine-tuning loop with the Trainer configured in the previous cell
trainer.train()
# Persist the fine-tuned model. No path is passed, so it is presumably written
# to training_args.output_dir -- TODO confirm
trainer.save_model()



***** Running training *****
  Num examples = 2170
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2176


Epoch,Training Loss,Validation Loss
1,No log,4.550782
2,4.579800,4.505051
3,4.579800,4.496866
4,4.225600,4.49878
5,4.225600,4.505766
6,4.083700,4.5092
7,4.083700,4.513869
8,3.990600,4.518889


***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin


### Question Generator

In [18]:
# GENERATE QUESTION WITH FINE-TUNED MODEL

# Load fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./") # loads the weights from current directory './'
# Reuse the tokenizer that was loaded together with the base model
tokenizer = tokenizer_2

# Helper: cut a generated string down to a clean question / sentence
def questionTruncate(s: str):
  """
  Clean up a generated string for display.

  Input: string s
  Returns: s cut right after its FIRST "?"; if s has no "?", s cut right after
  its LAST "."; if neither character occurs, s unchanged.
  """
  question_end = s.find("?")
  if question_end != -1:
    return s[:question_end + 1]

  sentence_end = s.rfind(".")
  if sentence_end != -1:
    return s[:sentence_end + 1]

  return s

# Test
#for s in ['Multiple? more than one?', 'First? Then no question.', 'No punctuation', 'No question.']:
#  print(questionTruncate(s))


# QUESTION GENERATOR

def questionGenerator(text: str, truncate: bool = False):
  """
  Generate a continuation of `text` with the fine-tuned model.

  Input:
    text: prompt string; the tokenizer's end-of-sequence token is appended to
      it before encoding.
    truncate: when True, the decoded text is post-processed with
      questionTruncate. Defaults to False, which returns the raw decoded text.
  Returns:
    The decoded text of the newly generated tokens only (the prompt tokens are
    sliced off and special tokens are skipped).
  """
  # ENCODE input and add 'end-of-string' token
  input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
  # Prompt length in TOKENS. input_ids carries a leading batch axis, so
  # len(input_ids) would count the batch (1) and not the tokens.
  prompt_len = input_ids.shape[-1]
  # GENERATE: sample up to 80 tokens on top of the prompt
  chat_history_ids = model.generate(
      input_ids,
      max_length=prompt_len + 80,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      early_stopping=True,
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE to string, keeping only the tokens generated after the prompt
  generated_ids = chat_history_ids[:, prompt_len:][0]
  output = tokenizer.decode(generated_ids, skip_special_tokens=True)
  # Optional clean-up
  if truncate:
    return questionTruncate(output)
  return output

In [19]:
# Try the generator on a few sample prompts, one separator line after each reply
prompts = ['What does Nihilism mean?', 'What is Pessimism', 'What is the purpose?', 'Prejudice against science?']
separator = 100*'-'
for text in prompts:
  reply = questionGenerator(text)
  print(f"Text: {text} \n\nBot: {reply} \n", separator)

In [15]:
# Test: generate one reply per sample prompt and print it under the prompt
test_prompts = ('What does Nihilism mean?', 'What is Pessimism', 'What is the purpose?', 'Prejudice against science?')
for text in test_prompts:
  bot_reply = questionGenerator(text)
  rule = 100*'-'
  print(f"Text: {text} \n\nBot: {bot_reply} \n", rule)

Text: What does Nihilism mean? 

Bot: The world of the modern man is not always a good thing. What is the meaning of the word?--The term is a term which means the word itself. To put it simply, there is no better word than a German word for a German word for a German word. What is the meaning of a German word for the German word for the German word for 
 ----------------------------------------------------------------------------------------------------
Text: What is Pessimism 

Bot: What is the meaning of the word pessimism?In the present day, it is the concept of the word pessimism.The term pessimism is the expression of a desire to attain to an ideal.In order to understand why, why?Because pessimism is an expression of a desire to attain to an ideal, it is a necessary means of overcoming the prejudices of the 
 ----------------------------------------------------------------------------------------------------
Text: What is the purpose? 

Bot: This is what is called the meaning of t