## Mounting Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing Modules


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import pandas as pd
import torch
!pip install datasets
import datasets
import numpy as np
from datasets import Dataset

## Setting gpu if available

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Loading Dataset


In [3]:
dataset_path = '/content/drive/MyDrive/ChatBot/dialogs.txt'

dataFrame = pd.read_csv(dataset_path, sep = '\t', names = ['dialog_1', 'dialog_2'])
print(dataFrame.shape)
dataFrame.head()

(3725, 2)


Unnamed: 0,dialog_1,dialog_2
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


## Combining Column dialog_1 and dialog_2

In [4]:
dataFrame['utterance'] = dataFrame['dialog_1'] + " " + dataFrame['dialog_2']
dataFrame.head()

Unnamed: 0,dialog_1,dialog_2,utterance
0,"hi, how are you doing?",i'm fine. how about yourself?,"hi, how are you doing? i'm fine. how about you..."
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,i'm fine. how about yourself? i'm pretty good....
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,i'm pretty good. thanks for asking. no problem...
3,no problem. so how have you been?,i've been great. what about you?,no problem. so how have you been? i've been gr...
4,i've been great. what about you?,i've been good. i'm in school right now.,i've been great. what about you? i've been goo...


## Loading Model & Tokenizer

In [5]:
model_name = 'microsoft/DialoGPT-medium'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## Converting Dataframe to Dataset

In [6]:
dataset = Dataset.from_pandas(dataFrame)

## Generating Encodings

In [7]:
def encode(examples):
    encoded = tokenizer(examples['utterance'], truncation=True, padding='max_length', max_length=128, return_tensors = 'pt')
    encoded['labels'] = encoded['input_ids'][:]
    return encoded

data_split = dataset.train_test_split(test_size = 0.2) #, shuffle = True
dataset_encodings = data_split.map(encode, batched=True)


Map:   0%|          | 0/2980 [00:00<?, ? examples/s]

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

## Training Arguments

In [8]:
training_args = TrainingArguments(
    output_dir = 'Outputs/',
    overwrite_output_dir = True,
    do_train=True,
    do_eval=True,
    # do_predict = True,
    evaluation_strategy="epoch",
    num_train_epochs = 10,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 4,
    fp16=True,
    save_strategy = 'epoch',
    logging_strategy = 'epoch',
    save_total_limit = 3,
    warmup_steps=500,
    load_best_model_at_end = True,
    disable_tqdm = False,
)




In [9]:
training_args.device

device(type='cuda', index=0)

In [10]:
trainer = Trainer(model = model, args = training_args, train_dataset = dataset_encodings['train'], eval_dataset = dataset_encodings['test'])
trainer.train()



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,1.1404,0.358611
2,0.3208,0.307898
3,0.2275,0.285305
4,0.1686,0.288239
5,0.1337,0.296718
6,0.1113,0.307128
7,0.0965,0.318226
8,0.0865,0.325646
9,0.0798,0.331159
10,0.0755,0.335863


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=3730, training_loss=0.24406521007138987, metrics={'train_runtime': 2081.3478, 'train_samples_per_second': 14.318, 'train_steps_per_second': 1.792, 'total_flos': 6918820174233600.0, 'train_loss': 0.24406521007138987, 'epoch': 10.0})

In [11]:
tokenizer.save_pretrained('/content/drive/MyDrive/ChatBot/' + 'trained_model')
trainer.save_model('/content/drive/MyDrive/ChatBot/' + 'trained_model')