# Environment Setup



## Cloning the minGPT repository into the Colab directory


In [2]:
# Fetch the minGPT sources; later cells import `mingpt.utils`, `mingpt.model` and `mingpt.trainer`.
# NOTE(review): the clone is not pinned to a commit or tag, so a re-run gets whatever upstream
# currently ships — confirm those modules still expose the names this notebook imports.
!git clone https://github.com/karpathy/minGPT

Cloning into 'minGPT'...
remote: Enumerating objects: 175, done.[K
remote: Total 175 (delta 0), reused 0 (delta 0), pack-reused 175[K
Receiving objects: 100% (175/175), 1.37 MiB | 1.69 MiB/s, done.
Resolving deltas: 100% (101/101), done.


In [3]:
# Work from the repository root — presumably so the `mingpt` package imported by later cells
# resolves from this checkout (TODO confirm).
%cd /content/minGPT

/content/minGPT


## Connect the Google Drive


In [4]:
# Mount Google Drive under /content/gdrive; later cells read the chat export from, and write the
# pickled dataset and model weights to, "My Drive" below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Training Data 


## Download and Parse the Basic-Bot Training Data

### Upload and Data Parsing


In [5]:
import re

# Load the chat export: one list entry per line, trailing newlines retained.
# The encoding is passed explicitly so decoding does not depend on the runtime's locale
# (assumes the export is UTF-8 — TODO confirm).
with open('/content/gdrive/My Drive/U9APJ3XKN.txt', encoding='utf-8') as f:
    BasicBot_text = f.readlines()

# Parse the data in a single pass:
#   * drop channel join / leave notices;
#   * replace angle-bracketed `<https...>` links with the generic token <HTTPS>
#     (plain `<http://...>` links are not matched by this pattern and are kept as-is).
_CHANNEL_NOTICES = ("has joined the channel", "has left the channel")
_LINK_PATTERN = re.compile("<https.*?>")
BasicBot_text = [
    _LINK_PATTERN.sub("<HTTPS>", line)
    for line in BasicBot_text
    if not any(notice in line for notice in _CHANNEL_NOTICES)
]
# Optional message delimiters, currently disabled:
#BasicBot_text = [re.sub("\n$", "<|endoftext|>", x) for x in BasicBot_text] #Signify the end of each message <|endoftext|>
#BasicBot_text = ["<|startoftext|>" + x for x in BasicBot_text] #Signify the start of each message (<|startoftext|>)

# Total number of characters kept; used later to size the Reddit sample.
BasicBot_text_size = sum(len(line) for line in BasicBot_text)

## Download and Parse a Reddit Conversation Corpus
Let's add a little variety to the Basic-Bot using some random comments from Reddit conversations. I've noticed that with Kenny's training data alone, the model either overfits or produces a random combination of messages strung together. Let's add some percentage of extra text from random Reddit messages to make the output more interesting. If we want to give the bot different "personalities", we can restrict the sample to specific subreddits.




In [6]:
# Install ConvoKit through the explicit %pip magic: it targets the running kernel's environment
# and does not rely on IPython's automagic turning a bare `pip ...` line into a magic call.
# The recorded output of this cell shows convokit 2.4.2 being resolved; the install is not pinned.
%pip install convokit

Collecting convokit
[?25l  Downloading https://files.pythonhosted.org/packages/79/23/f248f279a77e4e80b6549e73619979d3ba518d27dd59b5b5cd24e180e1ff/convokit-2.4.2.tar.gz (137kB)
[K     |████████████████████████████████| 143kB 14.1MB/s eta 0:00:01
Collecting msgpack-numpy>=0.4.3.2
  Downloading https://files.pythonhosted.org/packages/19/05/05b8d7c69c6abb36a34325cc3150089bdafc359f0a81fb998d93c5d5c737/msgpack_numpy-0.4.7.1-py2.py3-none-any.whl
Collecting nltk>=3.4
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 21.2MB/s 
Collecting clean-text>=0.1.1
  Downloading https://files.pythonhosted.org/packages/78/30/7013e9bf37e00ad81406c771e8f5b071c624b8ab27a7984cd9b8434bed4f/clean_text-0.3.0-py3-none-any.whl
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaa

In [7]:
# Download the "reddit-corpus-small" dataset and load it as a ConvoKit Corpus.
# In the recorded run the files were placed under /root/.convokit/downloads.
from convokit import Corpus, download
corpus = Corpus(filename=download("reddit-corpus-small"))

Downloading reddit-corpus-small to /root/.convokit/downloads/reddit-corpus-small
Downloading reddit-corpus-small from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/reddit-corpus-small.corpus.zip (37.9MB)... Done


In [8]:
# Draw random Reddit utterances until their combined length reaches `inject_prop` times the
# character count of the parsed Basic-Bot text.
# NOTE(review): the seeding cell (`set_seed(42)`) only runs later in the notebook, so this
# sampling is not governed by that seed.
inject_prop = 0.2
Reddit_content = []
char_count = 0
char_budget = inject_prop * BasicBot_text_size  # fixed for the whole loop
while char_count < char_budget:
    newtext = corpus.random_utterance().text + "\n"  # terminate every utterance with a newline
    Reddit_content.append(newtext)
    char_count += len(newtext)

In [9]:
# Single pass over the sampled utterances:
#   * skip comments flagged as deleted or removed;
#   * replace every URL with the generic <HTTPS> token.
_url_pattern = re.compile(r'https?:\/\/\S*', flags=re.MULTILINE)
Reddit_content = [
    _url_pattern.sub('<HTTPS>', utterance)
    for utterance in Reddit_content
    if "[deleted]" not in utterance and "[removed]" not in utterance
]

## Combine Corpus


In [10]:
import random

# Merge both sources and shuffle them at entry level (chat lines and Reddit utterances).
# The shuffle uses its own seeded generator: the notebook's seeding cell only runs later, so the
# global RNG is still unseeded here and the corpus order would otherwise change on every run.
# 42 mirrors the seed passed to `set_seed` further down.
SHUFFLE_SEED = 42
train_segments = BasicBot_text + Reddit_content  # new list; the two inputs are left untouched
random.Random(SHUFFLE_SEED).shuffle(train_segments)

# The character-level dataset consumes one flat string.
train_dat = "".join(train_segments)
len(train_dat)

255237

# minGPT

## Preprocess the Training Data


In [11]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [12]:
# Configure the root logger once for the notebook: INFO and above, each record prefixed with a
# timestamp, the level and the logger name.
import logging

log_settings = dict(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)
logging.basicConfig(**log_settings)

In [13]:
# make deterministic
# NOTE(review): `set_seed` comes from minGPT and its body is not visible here — presumably it
# seeds the Python, NumPy and PyTorch RNGs (confirm in mingpt.utils).
# This cell runs after the Reddit sampling and corpus shuffle cells, so those earlier random
# steps are not governed by this seed.
from mingpt.utils import set_seed
set_seed(42)

In [14]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level dataset serving overlapping windows of one text.

    Every distinct character of ``data`` gets an integer id; ids follow the
    sorted order of the characters. Item ``idx`` is built from the
    ``block_size + 1`` characters starting at position ``idx``: ``x`` holds the
    ids of the first ``block_size`` characters and ``y`` the ids of the same
    window shifted right by one character.
    """

    def __init__(self, data, block_size):
        """Build the vocabulary of ``data`` and remember the window length.

        Args:
            data: The full training text (a string).
            block_size: Number of characters in each input window ``x``.
        """
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
        self.itos = {i: ch for i, ch in enumerate(chars)}  # id -> character
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of full ``block_size + 1`` windows in the text.

        Clamped at zero: a text shorter than ``block_size`` would otherwise give
        a negative value, which makes ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair of ``torch.long`` tensors for window ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``. Without this,
                slicing would silently return truncated tensors and a plain
                ``for x, y in dataset`` loop would never terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError('CharDataset index out of range')

        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]

        # the target is the input shifted by one character
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [15]:
# Context length: each training example is a window of 128 input characters
# (CharDataset slices block_size + 1 characters per item to form the shifted target).
block_size = 128 
train_dataset = CharDataset(train_dat, block_size)

data has 255237 characters, 119 unique.


In [28]:
# Save the train_dataset: the whole CharDataset object (text, vocabulary tables, block size) is
# pickled to Google Drive so a later session can restore it without rebuilding the corpus.
# Unpickling it requires the CharDataset class to be defined in that session.
import pickle
training_save_name="training_dataset.pickle"
path = F"/content/gdrive/My Drive/{training_save_name}"
with open(path, 'wb') as f:
    pickle.dump(train_dataset, f)

## Construct the GPT Model

In [31]:
# Load the training data saved by the pickling cell.
# `pickle` is imported here as well so this cell works as a restart point in a fresh session,
# without first running the cell that saves the dataset. The CharDataset class must already be
# defined, because the pickle stores a reference to it.
# SECURITY: pickle.load executes arbitrary code embedded in the file — only load a pickle that
# this notebook wrote itself.
import pickle

training_save_name="training_dataset.pickle"
path = F"/content/gdrive/My Drive/{training_save_name}"
with open(path, 'rb') as f:
    train_dataset = pickle.load(f)

In [32]:
from mingpt.model import GPT, GPTConfig
# Vocabulary size and context length are taken from the dataset so the model matches its encoding.
# n_layer=8, n_head=8, n_embd=512 — presumably transformer layers / attention heads / embedding
# width (confirm in mingpt.model). The recorded log reports 2.540749e+07 parameters.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=512)
model = GPT(mconf)

10/20/2020 03:17:45 - INFO - mingpt.model -   number of parameters: 2.540749e+07


## Train the Model

In [37]:
#Make room for the model training
# Force a full garbage-collection pass; the displayed value is gc.collect()'s return value,
# the number of unreachable objects it found.
import gc
gc.collect()

2166

In [38]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
# The context length is read from the dataset (as the model config cell does) instead of the
# notebook-level `block_size`, so this cell also works when the dataset was restored from the
# pickle in a fresh session where `block_size` was never assigned.
# NOTE(review): final_tokens covers 2 passes over the data while max_epochs=1 — presumably this
# is the learning-rate decay horizon; confirm the mismatch is intended.
tconf = TrainerConfig(max_epochs=1, batch_size=128, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20,
                      final_tokens=2*len(train_dataset)*train_dataset.block_size,
                      num_workers=4)
# NOTE(review): the third positional argument is None — presumably the optional evaluation
# dataset; confirm in mingpt.trainer.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
epoch 1 iter 744: train loss 1.41451. lr 5.498209e-04:  37%|███▋      | 744/1994 [12:50<21:32,  1.03s/it][A[A

epoch 1 iter 744: train loss 1.41451. lr 5.498209e-04:  37%|███▋      | 745/1994 [12:50<21:42,  1.04s/it][A[A

epoch 1 iter 745: train loss 1.42305. lr 5.496899e-04:  37%|███▋      | 745/1994 [12:51<21:42,  1.04s/it][A[A

epoch 1 iter 745: train loss 1.42305. lr 5.496899e-04:  37%|███▋      | 746/1994 [12:51<21:35,  1.04s/it][A[A

epoch 1 iter 746: train loss 1.40630. lr 5.495587e-04:  37%|███▋      | 746/1994 [12:52<21:35,  1.04s/it][A[A

epoch 1 iter 746: train loss 1.40630. lr 5.495587e-04:  37%|███▋      | 747/1994 [12:52<21:28,  1.03s/it][A[A

epoch 1 iter 747: train loss 1.42367. lr 5.494274e-04:  37%|███▋      | 747/1994 [12:53<21:28,  1.03s/it][A[A

epoch 1 iter 747: train loss 1.42367. lr 5.494274e-04:  38%|███▊      | 748/1994 [12:53<21:32,  1.04s/it][A[A

epoch 1 iter 748: train loss 1.

In [39]:
#Save the Model
# Only the state_dict is written, not the model object; the test section restores it with
# `model.load_state_dict`, which needs an already-constructed model.
model_save_name = 'Basic-Bot_trained.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 
torch.save(model.state_dict(), path)

## Test the Model

In [32]:
# Restore the trained weights into the already-constructed `model`.
# map_location='cpu' deserialises the tensors on the CPU first, so a checkpoint saved from a GPU
# runtime also loads on a CPU-only runtime; load_state_dict then copies the values into the
# model's parameters on whatever device they live on.
model_save_name = 'Basic-Bot_trained.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
model.load_state_dict(torch.load(path, map_location='cpu'))

<All keys matched successfully>

In [None]:
# Inspection only: display the path currently held in `path` (the cell has no recorded output).
path

In [45]:
from mingpt.utils import sample

# Prompt text; every character must exist in the training vocabulary (`train_dataset.stoi`),
# otherwise the encoding below raises KeyError.
context = "wtf "

# Place the prompt on the device that holds the model's weights. Taking it from the model rather
# than from `trainer` lets this cell run in a session where the weights were only loaded from the
# checkpoint and no Trainer was created.
device = next(model.parameters()).device

# Encode the prompt as a batch of one sequence of character ids.
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(device)
# NOTE(review): 200 is presumably the number of characters to generate, and temperature / top_k
# the sampling controls — confirm against mingpt.utils.sample.
y = sample(model, x, 200, temperature=1.5, sample=True, top_k=10)[0]
# Decode the ids back to text.
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)

wtf only and 36 pm
Bilbo/Frodor a coit](10 amp; Twith scrift cards :expressionless:
near new?
if you can even get an Uber out there
As opposed to an app to be bast fals facilex being card.
which no one jo
