# Directions

1. Go to GroupMe and download your data ([Instructions](https://support.microsoft.com/en-us/office/how-do-i-export-my-groupme-data-1f6875bf-7871-4ade-8608-4c606cd5f518)). This will give you a zip file.
2. Upload the zip file to Colab using the "Files" tab.
3. Connect to a GPU runtime ([Instructions](https://www.tutorialspoint.com/google_colab/google_colab_using_free_gpu.htm)).
4. Run the cells in this notebook, in order, following any instructions as you go.

# Imports

In [None]:
import requests, json, pickle
import numpy as np
import matplotlib.pyplot as plt

import os
import time
import datetime
import math

import pandas as pd
import seaborn as sns
import random

import matplotlib.pyplot as plt
% matplotlib inline

import nltk
nltk.download('punkt')

In [None]:
!pip install transformers

In [None]:
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Use the GPU when the runtime provides one; otherwise fall back to the CPU
# so that creating tensors on `device` does not fail on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Collection

Change the file name below to the name of the GroupMe data zip file you uploaded.

In [None]:
!unzip 00001.zip

Change this to the ID of the GroupMe you want (it should be the name of a folder in the unzipped GroupMe data).

In [None]:
groupme_id = '60588753'

In [None]:
messages = None
conversation = None

In [None]:
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of depending on the platform's default encoding.
with open(f'/content/{groupme_id}/message.json', encoding='utf-8') as f:
  messages = json.load(f)

with open(f'/content/{groupme_id}/conversation.json', encoding='utf-8') as f:
  conversation = json.load(f)

If you did the above steps correctly, the cell below should output the name of your GroupMe.

In [None]:
groupme_name = conversation['name'].replace('/', '')
groupme_name

And this should output the total number of messages in the group

In [None]:
print(f'Number of messages: {len(messages)}')

In [None]:
user_id_to_name_map = {member['user_id']: member['name'] for member in conversation['members']}

This shows the users and their ids

In [None]:
user_id_to_name_map

In [None]:
def format_name(user_id, override_name):
  """Return the display name for `user_id` as a string.

  The name stored in `user_id_to_name_map` is used when the id is present
  there; otherwise `override_name` is used.
  """
  try:
    display_name = user_id_to_name_map[user_id]
  except KeyError:
    display_name = override_name
  return f'{display_name}'

In [None]:
def format_message(user_id, text, override_name):
  """Return one line of conversation text for a message.

  Messages whose `user_id` is 'system' are returned unchanged; every other
  message is prefixed with the sender's formatted name and ': '.
  """
  is_system_event = user_id == 'system'
  if not is_system_event:
    sender = format_name(user_id, override_name)
    return f'{sender}: {text}'
  return text

In [None]:
# Phrase appended after the sender's name for each attachment type that is
# recorded as its own event line.
_ATTACHMENT_PHRASES = {
  'image': 'shared an image.',
  'linked_image': 'shared an image.',
  'video': 'shared a video.',
  'poll': 'created a poll.',
  'file': 'shared a file.',
}


def _apply_mentions(text, attachment):
  """Replace each mentioned span of `text` with '@' + the member's name.

  `attachment['loci']` holds (start, length) pairs that line up with
  `attachment['user_ids']`. A mention that cannot be applied (unknown user,
  malformed locus, or `text` not being a string) is skipped and the text is
  left as it was for that mention.
  """
  # Running difference between positions in the original and the edited text.
  offset = 0
  for user_id, loc in zip(attachment['user_ids'], attachment['loci']):
    try:
      start, length = loc
      name = user_id_to_name_map[user_id]
      text = text[:start + offset] + '@' + name + text[start + length + offset:]
    except (KeyError, TypeError, ValueError):
      continue
    # '@' + name replaced `length` characters.
    offset += len(name) + 1 - length
  return text


cleaned_messages = []

for message in messages:
  cleaned_message = message['text']

  for attachment in message.get('attachments') or []:
    attachment_type = attachment['type']
    if attachment_type == 'mentions':
      cleaned_message = _apply_mentions(cleaned_message, attachment)
    elif attachment_type in _ATTACHMENT_PHRASES:
      sender = format_name(message['user_id'], message['name'])
      cleaned_messages.append({
        'text': f'{sender} {_ATTACHMENT_PHRASES[attachment_type]}',
        'time': message['created_at'],
      })

  if cleaned_message and cleaned_message != 'None':
    # Strip the Unicode replacement character, both as the real character and
    # as the mis-decoded three-character sequence the original code removed.
    # (The former `.encode('ascii', 'ignore')` call discarded its result and
    # therefore never changed the text, so it has been dropped.)
    for junk in ('\ufffd', 'ï¿½'):
      cleaned_message = cleaned_message.replace(junk, '')
    cleaned_messages.append({
      'text': format_message(message['user_id'], cleaned_message, message['name']),
      'time': message['created_at'],
    })

In [None]:
cleaned_messages = sorted(cleaned_messages, key=lambda m: m['time'])

In [None]:
cleaned_messages[1000:1010]

In [None]:
# Copy every entry, adding a placeholder time gap of 0 and the number of
# word_tokenize tokens in its text.
annotated_messages = []
for entry in cleaned_messages:
  annotated = dict(entry)
  annotated['time_delta'] = 0
  annotated['tokenized_length'] = len(word_tokenize(entry['text']))
  annotated_messages.append(annotated)
cleaned_messages = annotated_messages

In [None]:
# Store on every message the time difference to the message that follows it;
# the final message has no successor and keeps its existing value.
for current, following in zip(cleaned_messages, cleaned_messages[1:]):
  current['time_delta'] = following['time'] - current['time']

In [None]:
cleaned_messages[100:110]

In [None]:
def group_length(messages):
  """Return the combined length of `messages`.

  This is the sum of every message's 'tokenized_length' plus one extra unit
  for each boundary between consecutive messages.
  """
  token_total = 0
  for entry in messages:
    token_total += entry['tokenized_length']
  separator_total = len(messages) - 1
  return token_total + separator_total

In [None]:
initial_group = {"messages": cleaned_messages, "total_length": group_length(cleaned_messages)}

In [None]:
target_max_size = 325
absolute_max_size = 750

In [None]:
def group_messages(group, debug=False, target_size=None, max_size=None):
  """
  Recursively split a group of messages until every group is short enough.

  Args:
    group: dict with 'messages' (list of message dicts carrying
      'tokenized_length' and 'time_delta', where 'time_delta' is the gap to
      the following message) and 'total_length' (length of the whole group
      as computed by group_length).
    debug: when True, verify the length bookkeeping of every split with
      assertions, at every level of the recursion.
    target_size: preferred minimum size of each half of a split; defaults to
      the global `target_max_size`.
    max_size: groups shorter than this are not split; defaults to the global
      `absolute_max_size`.

  Returns:
    A list of group dicts in the original message order.

  Raises:
    ValueError: if a group with several messages is too long but no pair of
      neighbouring messages has a positive time gap to break at.
  """
  if target_size is None:
    target_size = target_max_size
  if max_size is None:
    max_size = absolute_max_size

  messages = group['messages']
  total_length = group['total_length']

  # A group holding fewer than two messages cannot be split, so it is
  # returned as-is even when it is over the limit.
  if total_length < max_size or len(messages) < 2:
    return [group]

  # Length of messages[:i] while the loop below is at index i.
  running_size = messages[0]['tokenized_length']

  best_break_i = 0
  best_score = -1
  best_size = running_size

  for i in range(1, len(messages)):
    # Breaking before message i separates messages i-1 and i, and that gap
    # is stored on message i-1.
    gap = messages[i - 1]['time_delta']
    if gap > 0:
      # score is log(gap), scaled down when the smaller half would fall
      # below target_size
      min_new_group_size = min(running_size, total_length - running_size)
      score = math.log(gap)

      if min_new_group_size < target_size:
        score *= (min_new_group_size / target_size)

      if score > best_score:
        best_score = score
        best_break_i = i
        best_size = running_size

    running_size += messages[i]['tokenized_length'] + 1

  if best_score < 0:
    raise ValueError(
      f"Invalid score: no usable break point in a group of {len(messages)} "
      f"messages with total length {total_length}"
    )

  # The boundary between the two halves accounts for one unit of length.
  left_group = {'messages': messages[:best_break_i], 'total_length': best_size}
  right_group = {'messages': messages[best_break_i:], 'total_length': total_length - best_size - 1}

  if debug:
    assert(left_group['total_length'] + right_group['total_length'] + 1 == total_length)
    assert(group_length(left_group['messages']) == left_group['total_length']), f"{group_length(left_group['messages'])} != {left_group['total_length']}"
    assert(group_length(right_group['messages']) == right_group['total_length']), f"{group_length(right_group['messages'])} != {right_group['total_length']}"

  left_group_rec = group_messages(left_group, debug=debug, target_size=target_size, max_size=max_size)
  right_group_rec = group_messages(right_group, debug=debug, target_size=target_size, max_size=max_size)

  return left_group_rec + right_group_rec


In [None]:
grouped_messages = group_messages(initial_group, debug=True)

In [None]:
grouped_messages[10]

In [None]:
print(len(grouped_messages))

The next few cells are just sanity checks to confirm that the GroupMe has been successfully split up into individual conversations for training.

In [None]:
lengths = [group['total_length'] for group in grouped_messages]
print('Max length, Min length, Avg length')
print(max(lengths), min(lengths), sum(lengths) / len(grouped_messages))

In [None]:
# Count how many groups have each total length.
histogram = {}
for length in lengths:
  histogram[length] = histogram.get(length, 0) + 1

In [None]:
plt.bar(histogram.keys(), histogram.values())

In [None]:
raw_data = ['\n'.join([message['text'] for message in group['messages']]) for group in grouped_messages]

In [None]:
raw_data[50]

This saves the formatted training data to a file. The next section starts by loading from that same file, so if you ever want to train again, you can skip to that section and just upload the saved training data. Make sure to download the file created if you think you will need to do this, or move it to Google Drive (Colab only keeps these files temporarily).

In [None]:
with open(f'{groupme_name}.pkl', "wb") as f:
  pickle.dump(raw_data, f)

# Dataloader

If loading from a file, uncomment the code below and replace 'YOUR GROUPME NAME' with the name of the groupme training data you want to load from.

In [None]:
# groupme_name = 'YOUR GROUPME NAME'

In [None]:
text_dataset = []

In [None]:
# Load the list of training texts saved earlier under the GroupMe's name.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a .pkl file that you produced yourself with this notebook.
with open(f'/content/{groupme_name}.pkl', "rb") as f:
  text_dataset = pickle.load(f)

In [None]:
text_dataset[50]

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium

In [None]:
class GPT2Dataset(Dataset):
  """Dataset of tokenized texts for GPT-2 fine-tuning.

  Every text is wrapped in '<|startoftext|>' / '<|endoftext|>' and passed to
  `tokenizer` with truncation and padding to `max_length`. Items are
  (input_ids, attention_mask) tensor pairs. `gpt2_type` is accepted but
  not used.
  """

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for raw_text in txt_list:
      wrapped_text = ''.join(('<|startoftext|>', raw_text, '<|endoftext|>'))
      encoded = tokenizer(
        wrapped_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
      )

      self.input_ids.append(torch.tensor(encoded['input_ids']))
      self.attn_masks.append(torch.tensor(encoded['attention_mask']))

  def __len__(self):
    """Number of encoded texts."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input_ids, attention_mask) pair stored at `idx`."""
    return (self.input_ids[idx], self.attn_masks[idx])

In [None]:
dataset = GPT2Dataset(text_dataset, tokenizer, max_length=768)

In [None]:
# Randomly split the dataset: 95% of the samples (rounded down) go to
# training and the remainder to validation.
total_samples = len(dataset)
train_size = int(0.95 * total_samples)
val_size = total_samples - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'{train_size:>5,} training samples')
print(f'{val_size:>5,} validation samples')

# Model

In [None]:
batch_size = 2

epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [None]:
!nvidia-smi

In [None]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
# Load the stock GPT-2 configuration; the only option set here is
# output_hidden_states=False.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the pretrained model.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# The tokenizer was given extra special tokens (bos_token, etc.), so the
# embedding matrix must be resized to match it; otherwise the tokenizer and
# model tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

# Move the model to `device`, the same device the batches are sent to.
model.to(device)

In [None]:
# Note: this is PyTorch's AdamW (torch.optim.AdamW), not an optimizer class
# from the huggingface library. It is configured with the learning rate and
# epsilon chosen in the hyperparameter cell.
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [None]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [None]:
def format_time(elapsed):
    """Format a duration in seconds as a datetime.timedelta string (h:mm:ss).

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

This is what actually trains the model. It takes 5 epochs, and the total training time can range from a few minutes to an hour+. Longest it has taken me is about 45 minutes, but it scales with the size of your GroupMe.

In [None]:
def _unpack_batch(batch):
    """Move one (input_ids, attention_mask) batch to `device` and build labels.

    The labels are the input ids with every padded position (attention mask
    0) replaced by -100, the label value the loss ignores, so that padding
    does not contribute to the loss.
    """
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return input_ids, attention_mask, labels


total_t0 = time.time()

# One dict of summary statistics per epoch.
training_stats = []

model = model.to(device)

for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids, b_masks, b_labels = _unpack_batch(batch)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks)

        # The loss is the first element of the output when labels are given.
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Print progress and a generated sample every `sample_every` batches.
        if step % sample_every == 0 and step != 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            # Start the sample from the tokenizer's BOS token, the token
            # every training text begins with, rather than a random token id.
            sample_outputs = model.generate(
                                    bos_token_id=tokenizer.bos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    do_sample=True,
                                    top_k=50,
                                    max_length=200,
                                    top_p=0.95,
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            # Back to training mode before the backward pass and update.
            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids, b_masks, b_labels = _unpack_batch(batch)

        # No gradients are needed for evaluation.
        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Display floats with two decimal places. The full option name is used
# because the bare pattern 'precision' is not a unique option name in
# current pandas.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# Display the table.
df_stats

Verify that the "Training" line goes down. Expected values are around 1.0-3.0

In [None]:
# Use plot styling from seaborn, with a larger font.
sns.set(style='darkgrid', font_scale=1.5)

# Increase the plot size.
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per recorded epoch (df_stats is indexed by epoch number), so the
# axis stays correct whatever number of epochs was trained.
plt.xticks(list(df_stats.index))

plt.show()

This saves the model to a local directory, 'model_save'

I highly recommend saving the model if you plan on querying this again - otherwise you will have to train again.

In [None]:
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create the output directory; an already existing directory is not an error.
os.makedirs(output_dir, exist_ok=True)

print(f"Saving model to {output_dir}")

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# If the model is wrapped (it has a `.module` attribute, as with
# distributed/parallel training), save the wrapped model instead.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Some storage info about the model that was just saved.

In [None]:
!ls -l --block-size=K ./model_save/

In [None]:
!ls -l --block-size=M ./model_save/pytorch_model.bin

Important: This will move the directory you just made into Google Drive, under a folder named GroupmeGeneratorData (you might need to create that folder). Before you run this, be sure that colab has access to your Google Drive (Files => Mount to Drive).

You can change 'YOUR_GROUPME_NAME' to whatever you want, but make sure it matches the name in the next step.

In [None]:
# Copy the model files to a directory in your Google Drive.
!cp -r ./model_save/ /content/drive/MyDrive/GroupmeGeneratorData/YOUR_GROUPME_NAME

# Generation

This loads the model from the stored Google Drive directory. If you made it to this step, you can return at any point by simply:
1.   Rerunning the "Imports" section
2.   Running the cell below

No need to touch the above sections once your model is trained!

In [None]:
# rRudyStreetBets, MikeyMicah, SuckDuckBills
output_dir = '/content/drive/MyDrive/GroupmeGeneratorData/YOUR_GROUPME_NAME'
# # Load a trained model and vocabulary that you have fine-tuned
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model.to(device)
print(f"Model loaded from {output_dir}")

**Generator Directions**

To generate text from the model, run the cell below.

The "prompt" is how you control what is being generated. **Make sure that your prompt always starts with <|startoftext|>**. If the prompt is just <|startoftext|>, it will generate random, unprompted conversations. If you supply it more (e.g. 'Person Name: It is my opinion that') then it will start the generated conversations with that message. 

The model will return three sequences, you can change this to more or less by adjusting num_return_sequences.

In [None]:
# Put the model in evaluation mode for sampling.
model.eval()

prompt = "<|startoftext|> Person Name: It is my opinion that" # Change this!

# Encode the prompt into a batch of one sequence on the model's device.
prompt_ids = tokenizer.encode(prompt)
generated = torch.tensor(prompt_ids).unsqueeze(0)
generated = generated.to(device)

print(generated)

# Sampling settings: top-k / top-p sampling, three continuations of up to
# 300 tokens each.
generation_options = dict(
  do_sample=True,
  top_k=50,
  max_length=300,
  top_p=0.95,
  num_return_sequences=3,
)
sample_outputs = model.generate(generated, **generation_options)

for number, sample_output in enumerate(sample_outputs, start=1):
  decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
  print(f"{number}: {decoded}\n\n")

[Source for GPT-2 Fine Tuning](https://colab.research.google.com/drive/13dZVYEOMhXhkXWfvSMVM1TTtUDrT6Aeh?usp=sharing#scrollTo=v4XhewaV93-_)