In [1]:
from google.colab import files
uploaded = files.upload()

Saving The-Office-Lines-V4.csv to The-Office-Lines-V4.csv


In [3]:
!pip -q install transformers

[K     |████████████████████████████████| 2.8 MB 12.5 MB/s 
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[K     |████████████████████████████████| 895 kB 52.2 MB/s 
[K     |████████████████████████████████| 636 kB 54.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 48.1 MB/s 
[?25h

In [4]:
#All the imports.

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

## Getting data.

In [5]:
data = pd.read_csv('The-Office-Lines-V4.csv')
df = pd.DataFrame(data)

#Making the data suitable.
df.drop(df.columns[[0, 1, 2,3,6]], axis = 1, inplace = True)
df

Unnamed: 0,speaker,line
0,Michael,All right Jim. Your quarterlies look very good...
1,Jim,"Oh, I told you. I couldn't close it. So..."
2,Michael,So you've come to the master for guidance? Is ...
3,Jim,"Actually, you called me in here, but yeah."
4,Michael,"All right. Well, let me show you how it's done."
...,...,...
54621,Creed,It all seems so very arbitrary. I applied for ...
54622,Meredith,I just feel lucky that I got a chance to share...
54623,Phyllis,I'm happy that this was all filmed so I can re...
54624,Jim,I sold paper at this company for 12 years. My ...


In [8]:
CHARACTER_NAME = 'Michael'
#number of lines Michael has.
sum(df.speaker == 'Michael')

10773

## Creating a contexted database for training.

In [9]:
contexted = []

# context window of size 7
n = 7

for i in df[df.speaker == CHARACTER_NAME].index:
  if i < n:
    continue
  row = []
  prev = i - 1 - n # we additionally substract 1, so row will contain current responce and 7 previous responces  
  for j in range(i, prev, -1):
    row.append(df.line[j])
  contexted.append(row)

columns = ['response', 'context'] 
columns = columns + ['context/' + str(i) for i in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)

In [10]:
df.sample(10)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
5466,It's my sumo suit. I just didn't inflate it al...,"I say, I say, I say, I say down Holly! I'll ta...",Fair question.,What are you doing?,Ohhhh! Bang! Boom! Say Clump! Why hello every...,"""...or that Dunder Mifflin does not discrimina...",Phyllis Vance for David Wallace.,"Yeah, I took your purse. What are you worried ..."
7103,You have two seconds.,That-,Noon.,At what time?,Toward the sun.,In which direction?,"Meet- Ok, go to the spot and then walk 100 feet.",Michael.
3214,Don't - don't do that. That's not nice. What a...,No.,Really?,"Oh, sure, we talk all the time.","Yes, he did. Have any of you talked to Dwight?",Sort of. He had a lot of clients.,Hey guys. How's the workload on all of Dwight'...,"Yeah, me too."
1519,Ok.,Let me stop you right there.,Yeah...,You know what Michael?,Ignore him. You know what? We're not that diff...,"Dwight, you have your hand up.",First is parking. You can't block the freight ...,Ok...
10688,"Don't be a caricature Kevin, never be a caric...",Oh...,You know who that is?,Oh!,"Kevin, I have something for you.",I used to be obese. Once you've conquered obes...,I've given up expecting Michael to do the righ...,Do you believe that?!
1606,Peach iced tea. You're going to hate it.,Hmm?,"Alright, well, cool. Still deciding?",I'm going to take off actually.,Listen Stanley. How long does it take you to p...,What is?,"No it's okay, we're talking code.","Uh, uh, ok."
8697,"No excuses Erin, come on! Make it happen.","Well, in your old office there were all those ...",How is that possible?,"Michael, I'm worried that, uh, not all your to...",You think of something then!,Think!,Don't just criticize my idea!,He's supposed to cut his leg off? Think!
8749,"Okay, okay, Jim. I think this feast is over an...","Mm-hmm. Yes, I'm fine. Um, yeah, the doctor sa...",Our ultra feast menu's theme: Hollywood. We ha...,There is no rush to get to the hospital. I am ...,What is October feast?,That's fancy feast. Ultra feast is something t...,Oh. What? You want to eat cat food with Kevin ...,"Oh, wow, it's almost time for ultra feast! Whe..."
4889,"Oh honey, I have the best trophy right here, a...",It was between the neon beer sign and the Dund...,"Well, I saw - oh your Dundies. I'm surprised t...",I bet you are.,"If you ever need any help, I am just a phone c...","Yeah, he tried to set up my TIVO for me but th...",Really?,"Michael, I'm just terrible at all this stuff, ..."
10643,"Okay! That, that is unfair! The clothing was s...",I just don't see a point in the Dundies! Okay?...,What is your problem?!,"Please don't stop so suddenly, the seatbelt i...","Gladly! I'd accept that award, because a bitch...",Man! Maybe you should have won the Kind of a B...,I thought it was the worst Dundies I've ever b...,"You know, despite a couple hiccups, I think t..."


In [11]:
#Splitting the dataset, into training and validation.

trn_df, val_df = train_test_split(df, test_size=0.1)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
6087,"Mm. I'm calling because, um, we have a stupid ...",Okay.,"No, no, no, no, no. No. I'm calling- I'm sorry.",Is this why you're calling me?,Hey David- I'm sorry. I'm sorry. I'm eating t...,Michael.,"Are we, uh... are we leaving or what? Ow!",My... It's not my fault you don't understand G...
2969,"OK... Well, I need to put this bike in there. ...",It's behind you.,I'm looking for the toy drive box.,"Can I help you, Michael?",Deck the halls with boughs of holly. Fa la la...,Oh Pam. Take a chill pill.,I would like it off my desk.,Clean it in your car.
4851,"Oh, hey no biggie. Just...",Michael. I am very sorry.,Yes.,Do you want to go play on the table upstairs?,O.,O. P.,"OK. Volley for serve, P.",In my sleep.
1505,Ok.,I mean I'm learning nothing.,What you mean uh hmmm... ?,Uh hmmmm... .,"He grew into a man overnight. Rare disability,...","Yeah, he's dancing on a piano with Robert Loggia.","I don't think so, no.",I think that's from Big.
1668,"Check it out. Don't be a wuss, just get... no,...",Me?,Check it out.,I don't think that's vomit.,"Um, somebody vomited right in the middle of th...",What's goin' on?,"Oh, God! How could that happen? How could... r...","No, I don't think it's a bird."


## Creating the model.

In [12]:
# create dataset suitable for our model
def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv

class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [13]:
# Cacheing and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

## Defining hyper-parameters

In [14]:
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/351M [00:00<?, ?B/s]

In [15]:
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [16]:
# Args to allow for easy convertion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 4
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

## Training and Evaluating.

In [20]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [21]:
# Main runner

def main(df_trn, df_val):
    args = Args()
    
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

## Driver Code.

In [22]:
main(trn_df, val_df)

09/24/2021 04:44:56 - INFO - filelock -   Lock 139625075761808 acquired on cached/0cbdd50f204f3ddbaa452e976340a5725f0b5ddb201704058c87e14d9679e070.e6898db50ba3aa698f0f652e876a1e4bd813321dea3e22b776f9a3c39d36aaab.lock


Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

09/24/2021 04:44:56 - INFO - filelock -   Lock 139625075761808 released on cached/0cbdd50f204f3ddbaa452e976340a5725f0b5ddb201704058c87e14d9679e070.e6898db50ba3aa698f0f652e876a1e4bd813321dea3e22b776f9a3c39d36aaab.lock
09/24/2021 04:44:57 - INFO - filelock -   Lock 139625080397776 acquired on cached/5f8cf488e0bdda2e393e798f478673a4d26c1386082a1a05e42269f3ecc89f50.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock


Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

09/24/2021 04:44:57 - INFO - filelock -   Lock 139625080397776 released on cached/5f8cf488e0bdda2e393e798f478673a4d26c1386082a1a05e42269f3ecc89f50.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock
09/24/2021 04:44:58 - INFO - filelock -   Lock 139625084685968 acquired on cached/3cf340c89a43b5e6f31c4cd609fc2fc92f3d7aafdf6c8987e2ea9e02cb78b4e2.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock


Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

09/24/2021 04:45:00 - INFO - filelock -   Lock 139625084685968 released on cached/3cf340c89a43b5e6f31c4cd609fc2fc92f3d7aafdf6c8987e2ea9e02cb78b4e2.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock
09/24/2021 04:45:00 - INFO - filelock -   Lock 139625071619408 acquired on cached/4e3f74e7c741909c4d1b48a23febe75c1be66a20c2b98cf7db4b8b10f12dc10c.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

09/24/2021 04:45:01 - INFO - filelock -   Lock 139625071619408 released on cached/4e3f74e7c741909c4d1b48a23febe75c1be66a20c2b98cf7db4b8b10f12dc10c.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
09/24/2021 04:45:04 - INFO - filelock -   Lock 139625075763728 acquired on cached/aeb12aa1fc2f135700fcf9f8f0eec86c0649dc5ce0df86677adf0388271f33f3.1010e0ba25016a38144b58e8852f1dcc18876341e3b5728a99b3ffa11cc733cd.lock


Downloading:   0%|          | 0.00/351M [00:00<?, ?B/s]

09/24/2021 04:45:15 - INFO - filelock -   Lock 139625075763728 released on cached/aeb12aa1fc2f135700fcf9f8f0eec86c0649dc5ce0df86677adf0388271f33f3.1010e0ba25016a38144b58e8852f1dcc18876341e3b5728a99b3ffa11cc733cd.lock
09/24/2021 04:45:26 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7efcff519a90>
09/24/2021 04:45:26 - INFO - __main__ -   Creating features from dataset file at cached
09/24/2021 04:45:55 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
09/24/2021 04:45:55 - INFO - __main__ -   ***** Running training *****
09/24/2021 04:45:55 - INFO - __main__ -     Num examples = 9691
09/24/2021 04:45:55 - INFO - __main__ -     Num Epochs = 4
09/24/2021 04:45:55 - INFO - __main__ -     Instantaneous batch size per GPU = 4
09/24/2021 04:45:55 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 4
09/24/2021 04:45:55 - INFO - __main__ -     Gradient Accumulation steps = 1
09/24/2021 04:45

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2422 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2422 [00:00<?, ?it/s]

09/24/2021 05:16:20 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-3500
09/24/2021 05:16:24 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-3500


Iteration:   0%|          | 0/2422 [00:00<?, ?it/s]

09/24/2021 05:46:40 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-7000
09/24/2021 05:46:44 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-7000


Iteration:   0%|          | 0/2422 [00:00<?, ?it/s]

09/24/2021 06:10:01 - INFO - __main__ -    global_step = 9688, average loss = 1.9303588559381928
09/24/2021 06:10:01 - INFO - __main__ -   Saving model checkpoint to output-small
09/24/2021 06:10:04 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
09/24/2021 06:10:06 - INFO - __main__ -   Creating features from dataset file at cached
09/24/2021 06:10:09 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
09/24/2021 06:10:09 - INFO - __main__ -   ***** Running evaluation  *****
09/24/2021 06:10:09 - INFO - __main__ -     Num examples = 1077
09/24/2021 06:10:09 - INFO - __main__ -     Batch size = 4


Evaluating:   0%|          | 0/269 [00:00<?, ?it/s]

09/24/2021 06:10:50 - INFO - __main__ -   ***** Eval results  *****
09/24/2021 06:10:51 - INFO - __main__ -     perplexity = tensor(5.6236)


{'perplexity_': tensor(5.6236)}

## Loading the model.

In [23]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('output-small')



In [24]:
#Chat with model.
for step in range(4):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature=0.8
    )
    
    # pretty print last ouput tokens from bot
    print("Michael Scott, Dunder Mifflin: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Hi Michael!
Michael Scott, Dunder Mifflin: Hi, I'm Jan.
>> User:how is buisness?
Michael Scott, Dunder Mifflin: It's pretty good.
>> User:what do you think of ryan?
Michael Scott, Dunder Mifflin: I think Ryan is very nice.
>> User:who is your girlfriend?
Michael Scott, Dunder Mifflin: !!!!!!


In [25]:
!dir

cached	output-small  runs  sample_data  The-Office-Lines-V4.csv


In [26]:
!pip install huggingface_hub



In [29]:
!git config --global credential.helper store

In [30]:
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: maniacGhost24
Password: 
Login successful
Your token has been saved to /root/.huggingface/token


In [36]:
f = open("/root/.huggingface/token","r")
token = f.read()
f.close()

In [37]:
print(token)

BhURwVBEphmleuDzzOQxexYhZlhlXCADunfpntYcoCdylADzJjxuJfObZqzRrFESFZvhgEtCnqwbCuYrWhWbLErbGFwSKtaRvcQwJIrjyCOaaUSRIwTGhoQKayVBKMmc


In [38]:
!huggingface-cli repo create MichaelScott-bot-small

[90mgit version 2.17.1[0m
[1m[31mLooks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).[0m

You are about to create [1mmaniacGhost24/MichaelScott-bot-small[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/maniacGhost24/MichaelScott-bot-small[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/maniacGhost24/MichaelScott-bot-small



In [39]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 0s (19.0 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155013 files and directories cu

In [43]:
!git config --global user.email "rishabhkaushik.bt19cse@pec.edu.in"
!git config --global user.name "Rishabh Kaushik"

In [44]:
!git clone https://huggingface.co/maniacGhost24/MichaelScott-bot-small

Cloning into 'MichaelScott-bot-small'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.


In [45]:
!dir

cached			output-small  sample_data
MichaelScott-bot-small	runs	      The-Office-Lines-V4.csv


In [48]:
!mv output-small MichaelScott-bot-small/

In [49]:
!dir

cached	MichaelScott-bot-small	runs  sample_data  The-Office-Lines-V4.csv


In [50]:
os.chdir('MichaelScott-bot-small')

In [51]:
!git lfs install

Updated git hooks.
Git LFS initialized.


In [52]:
!ls

output-small


In [53]:
!pwd

/content/MichaelScott-bot-small


In [54]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31moutput-small/[m

nothing added to commit but untracked files present (use "git add" to track)


In [55]:
!git add .

In [56]:
!git commit -m "Output Small uploaded"

[main 235fbe0] Output Small uploaded
 29 files changed, 150157 insertions(+)
 create mode 100644 output-small/checkpoint-3500/config.json
 create mode 100644 output-small/checkpoint-3500/merges.txt
 create mode 100644 output-small/checkpoint-3500/optimizer.pt
 create mode 100644 output-small/checkpoint-3500/pytorch_model.bin
 create mode 100644 output-small/checkpoint-3500/scheduler.pt
 create mode 100644 output-small/checkpoint-3500/special_tokens_map.json
 create mode 100644 output-small/checkpoint-3500/tokenizer.json
 create mode 100644 output-small/checkpoint-3500/tokenizer_config.json
 create mode 100644 output-small/checkpoint-3500/training_args.bin
 create mode 100644 output-small/checkpoint-3500/vocab.json
 create mode 100644 output-small/checkpoint-7000/config.json
 create mode 100644 output-small/checkpoint-7000/merges.txt
 create mode 100644 output-small/checkpoint-7000/optimizer.pt
 create mode 100644 output-small/checkpoint-7000/pytorch_model.bin
 create mode 100644 output

In [57]:
!git push

Git LFS: (8 of 8 files) 3.28 GB / 3.28 GB
Counting objects: 20, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (19/19), done.
Writing objects: 100% (20/20), 754.86 KiB | 4.39 MiB/s, done.
Total 20 (delta 4), reused 0 (delta 0)
To https://huggingface.co/maniacGhost24/MichaelScott-bot-small
   e8f2964..235fbe0  main -> main


In [58]:
#Another try to push!
!huggingface-cli repo create MichaelScott-bot-push-small

[90mgit version 2.17.1[0m
Error: unknown flag: --version

[90mSorry, no usage text found for "git-lfs"[0m

You are about to create [1mmaniacGhost24/MichaelScott-bot-push-small[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/maniacGhost24/MichaelScott-bot-push-small[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/maniacGhost24/MichaelScott-bot-push-small



In [59]:
!dir

output-small


In [60]:
!cd ..

In [61]:
!dir

output-small


In [66]:
! git clone https://huggingface.co/maniacGhost24/MichaelScott-bot-push-small

Cloning into 'MichaelScott-bot-push-small'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.


In [67]:
!dir

MichaelScott-bot-push-small  output-small


In [68]:
!mv output-small/* MichaelScott-bot-push-small/

In [69]:
!dir

MichaelScott-bot-push-small  output-small


In [70]:
os.chdir('MichaelScott-bot-push-small')

In [71]:
!dir

checkpoint-3500  eval_results.txt   special_tokens_map.json  training_args.bin
checkpoint-7000  merges.txt	    tokenizer_config.json    vocab.json
config.json	 pytorch_model.bin  tokenizer.json


In [72]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mcheckpoint-3500/[m
	[31mcheckpoint-7000/[m
	[31mconfig.json[m
	[31meval_results.txt[m
	[31mmerges.txt[m
	[31mpytorch_model.bin[m
	[31mspecial_tokens_map.json[m
	[31mtokenizer.json[m
	[31mtokenizer_config.json[m
	[31mtraining_args.bin[m
	[31mvocab.json[m

nothing added to commit but untracked files present (use "git add" to track)


In [73]:
!git add .

In [74]:
!git commit -m "Output files updated."
!git push

[main bb0bde4] Output files updated.
 29 files changed, 150157 insertions(+)
 create mode 100644 checkpoint-3500/config.json
 create mode 100644 checkpoint-3500/merges.txt
 create mode 100644 checkpoint-3500/optimizer.pt
 create mode 100644 checkpoint-3500/pytorch_model.bin
 create mode 100644 checkpoint-3500/scheduler.pt
 create mode 100644 checkpoint-3500/special_tokens_map.json
 create mode 100644 checkpoint-3500/tokenizer.json
 create mode 100644 checkpoint-3500/tokenizer_config.json
 create mode 100644 checkpoint-3500/training_args.bin
 create mode 100644 checkpoint-3500/vocab.json
 create mode 100644 checkpoint-7000/config.json
 create mode 100644 checkpoint-7000/merges.txt
 create mode 100644 checkpoint-7000/optimizer.pt
 create mode 100644 checkpoint-7000/pytorch_model.bin
 create mode 100644 checkpoint-7000/scheduler.pt
 create mode 100644 checkpoint-7000/special_tokens_map.json
 create mode 100644 checkpoint-7000/tokenizer.json
 create mode 100644 checkpoint-7000/tokenizer_co