<a href="https://colab.research.google.com/github/myazann/Text-Generation/blob/main/Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets==1.0.2

!pip install git-python==1.0.3
!pip install sacrebleu==1.4.12
! pip install tokenizers

import os
import csv
import json
import pandas as pd
import numpy as np
from transformers import OpenAIGPTLMHeadModel, Trainer, TrainingArguments, GPT2LMHeadModel
from transformers import AutoModel, AutoModelWithLMHead, AutoTokenizer, EncoderDecoderModel, BertTokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling,LineByLineTextDataset
from sklearn.model_selection import train_test_split
from transformers import pipeline
import torch
import gc
import datasets
from dataclasses import dataclass, field
from typing import Optional
from tokenizers import BertWordPieceTokenizer, Tokenizer
from tokenizers.processors import BertProcessing

A tokenizer can be trained from scratch, as shown below, but I am going to use a pretrained one.

In [None]:
# Optional: train a WordPiece vocabulary on the chat corpus.
# The trainer object gets its own name so `tokenizer` always refers to the
# transformers tokenizer that the rest of the notebook expects.
wordpiece_tokenizer = BertWordPieceTokenizer()

wordpiece_tokenizer.train(files="/content/drive/My Drive/telegram_chatbot_train.csv", vocab_size=32_000, min_frequency=2)
# save_model returns the list of files it wrote (here: ['./vocab.txt']).
vocab_files = wordpiece_tokenizer.save_model(".")

# Load the vocabulary that was just written instead of a separately maintained
# copy on Drive, so this cell works on a fresh runtime and never picks up a
# stale vocab file.
# `model_max_length` is the option name transformers recognises; the previous
# `model_max_len` spelling was silently ignored. `is_fast` is not an accepted
# option (BertTokenizer is the pure-Python implementation) and was dropped.
tokenizer = BertTokenizer.from_pretrained(vocab_files[0], do_lower_case=True,
                                          return_special_tokens_mask=True, model_max_length=512)
# BERT has no dedicated BOS/EOS tokens; reuse [CLS] and [SEP] for those roles.
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

['./vocab.txt']

Loading data, tokenizer and the model.

In [None]:
def find_latest_checkpoint(output_dir):
  """Return the newest ``checkpoint-<step>`` directory inside ``output_dir``.

  Args:
    output_dir: Directory the Trainer writes its checkpoints to.

  Returns:
    The path of the ``checkpoint-<step>`` sub-directory with the highest step
    number, or ``None`` when ``output_dir`` does not exist or holds no such
    directory (e.g. on the very first run).
  """
  # A missing directory simply means nothing has been trained yet.
  if not os.path.isdir(output_dir):
    return None

  newest_step, newest_path = -1, None
  for entry in os.listdir(output_dir):
    prefix, _, step = entry.rpartition("-")
    candidate = os.path.join(output_dir, entry)
    # Skip logs, stray files and anything else that is not a checkpoint dir.
    if prefix != "checkpoint" or not step.isdecimal() or not os.path.isdir(candidate):
      continue
    if int(step) > newest_step:
      newest_step, newest_path = int(step), candidate
  return newest_path


# None means "start from the pretrained weights"; otherwise resume from here.
last_trained_path = find_latest_checkpoint("/content/drive/My Drive/enc2dec")

# Each CSV is loaded on its own; the "csv" loader is asked for its "train"
# split in both cases, including for the validation file.
tg_data_train = datasets.load_dataset("csv", data_files = "/content/drive/My Drive/telegram_chatbot_train.csv", split = "train")
tg_data_val = datasets.load_dataset("csv", data_files = "/content/drive/My Drive/telegram_chatbot_val.csv", split = "train")


# NOTE(review): do_lower_case=True is combined with a *cased* checkpoint here.
# Lower-casing the input may not match the cased vocabulary; confirm this is
# intended before changing it, since existing checkpoints were trained this way.
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased', do_lower_case=True, return_special_tokens_mask=True)
# BERT has no dedicated BOS/EOS tokens; reuse [CLS] and [SEP] for those roles.
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

# Load exactly one model: the saved checkpoint when there is one, otherwise a
# fresh BERT2BERT built from the pretrained encoder/decoder weights. This
# avoids instantiating the base model only to throw it away again.
if last_trained_path is not None:
  model = EncoderDecoderModel.from_pretrained(last_trained_path)
else:
  model = EncoderDecoderModel.from_encoder_decoder_pretrained("dbmdz/bert-base-turkish-cased", "dbmdz/bert-base-turkish-cased")

Free memory if necessary.

In [None]:
# Manual clean-up cell: run it only when the GPU/RAM must be released.
# Running it as part of "Run All" removes `model`, which later cells need.

# Bind the names first so the `del` below cannot fail when one of them has
# not been created yet in this session.
model = trainer = training_args = None
del model, training_args, trainer

# Collect the now-unreferenced objects, then hand cached CUDA blocks back.
gc.collect()

with torch.no_grad():
    torch.cuda.empty_cache()

Creating the data pipeline.

In [None]:
def process_data_to_model_inputs(batch):
  """Tokenize one batch of question/answer pairs into encoder-decoder features.

  Reads the ``Written_Text`` (source) and ``Answer_Text`` (target) columns,
  pads/truncates them to the module-level ``encoder_max_length`` and
  ``decoder_max_length`` using the module-level ``tokenizer``, and stores
  ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
  ``decoder_attention_mask`` and ``labels`` on the batch.

  ``labels`` mirrors the target ids with every pad id replaced by -100
  (the default ignore index of PyTorch's cross-entropy loss).

  Args:
    batch: Mapping with ``Written_Text`` and ``Answer_Text`` lists of strings.

  Returns:
    The same ``batch`` object, modified in place.
  """
  # NOTE(review): decoder_input_ids are the unshifted target ids; confirm the
  # installed transformers version expects them in this form.
  pad_kwargs = {"padding": "max_length", "truncation": True}
  source = tokenizer(batch["Written_Text"], max_length=encoder_max_length, **pad_kwargs)
  target = tokenizer(batch["Answer_Text"], max_length=decoder_max_length, **pad_kwargs)

  pad_id = tokenizer.pad_token_id
  masked_labels = []
  for sequence in target.input_ids:
    masked_labels.append([tok if tok != pad_id else -100 for tok in sequence])

  batch["input_ids"] = source.input_ids
  batch["attention_mask"] = source.attention_mask
  batch["decoder_input_ids"] = target.input_ids
  batch["decoder_attention_mask"] = target.attention_mask
  batch["labels"] = masked_labels

  return batch

In [None]:
# Configure the encoder-decoder wrapper in one pass. The entries are applied
# in order, exactly as individual `model.config.<name> = value` assignments.
for option, value in (
    # Special-token ids, taken from the tokenizer so both sides agree.
    ("decoder_start_token_id", tokenizer.bos_token_id),
    ("eos_token_id", tokenizer.eos_token_id),
    ("pad_token_id", tokenizer.pad_token_id),
    # Mirror the decoder's vocabulary size on the top-level config.
    ("vocab_size", model.config.decoder.vocab_size),
    # Length limits stored on the config.
    ("max_length", 150),
    ("min_length", 5),
    # Beam-search related settings stored on the config.
    ("no_repeat_ngram_size", 3),
    ("early_stopping", True),
    ("length_penalty", 2.0),
    ("num_beams", 4),
):
  setattr(model.config, option, value)


# Shared by the tokenization step below and by the training configuration.
batch_size=16
# Maximum token lengths read by process_data_to_model_inputs.
encoder_max_length=128
decoder_max_length=128

# Raw text columns consumed by the tokenization step, and the feature columns
# it produces for the model.
TEXT_COLUMNS = ["Written_Text", "Answer_Text"]
MODEL_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def tokenize_dataset(dataset):
  """Tokenize a raw question/answer dataset and expose it as torch tensors.

  Args:
    dataset: A ``datasets.Dataset`` holding either the raw ``TEXT_COLUMNS``
      or the already tokenized ``MODEL_COLUMNS``.

  Returns:
    The dataset with ``process_data_to_model_inputs`` applied (text columns
    removed) and its output format restricted to ``MODEL_COLUMNS`` as torch
    tensors.
  """
  # Only map while the raw text is still present: this keeps the cell safe to
  # re-run, since a second map would fail on the already removed columns.
  if all(column in dataset.column_names for column in TEXT_COLUMNS):
    dataset = dataset.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=TEXT_COLUMNS,
    )
  dataset.set_format(type="torch", columns=MODEL_COLUMNS)
  return dataset


# Train and validation data go through exactly the same pipeline.
tg_data_train = tokenize_dataset(tg_data_train)
tg_data_val = tokenize_dataset(tg_data_val)

Instantiate a Trainer and start training.

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written; only the most recent one is kept.
    output_dir="/content/drive/My Drive/enc2dec",
    overwrite_output_dir=True,
    save_total_limit=1,
    # Phases to run.
    do_train=True,
    do_eval=True,
    # Not enabled: predict_with_generate=True, evaluation_strategy="steps".
    # Batch sizes (training shares `batch_size` with the tokenization step).
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=64,
    # Schedule: total steps, warm-up, and the shared 2500-step interval used
    # for logging, checkpointing and evaluation.
    max_steps=100000,
    warmup_steps=2500,
    logging_steps=2500,
    save_steps=2500,
    eval_steps=2500,
    # Mixed-precision training.
    fp16=True,
)

# Bind model, arguments and both tokenized datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tg_data_train,
    eval_dataset=tg_data_val,
)

In [None]:
# Continue from the saved checkpoint when one was found; otherwise start a
# fresh run from the pretrained weights.
if last_trained_path is not None:
  trainer.train(last_trained_path)
else:
  trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (tg_data_val); the value
# returned by evaluate() is shown as this cell's output.
trainer.evaluate()

In [None]:
# Delete the `runs` directory in the current working directory
# (presumably the Trainer's logging output — verify before running).
!rm -rf runs