A notebook for training our model on the kmeans data. Authored by Nate Cadicamo, Philip Baillargeon, Javokhir Arifov.

In [1]:
# need to mount drive to access data
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install --upgrade transformers
!pip install datasets
!pip install --upgrade accelerate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [3]:
"""
Trying to load John Thickstun's model so that we can finetune it.
Problem is that it won't train with HF as it stands.
Solution is hopefully to manually load the weights.

Architecture Hyperparameters  S     M     L
Layers                        12    24    36
Attention Heads               12    16    20
Hidden Dimensions             768   1024  1280
"""

# libraries to import
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Config, AutoModel

# load john's model directly. Tried the 800k a few times, now trying the 100k.
amt_model = AutoModel.from_pretrained("stanford-crfm/music-small-100k")

# weights dictionary
amt_weights = dict(amt_model.named_parameters())

# initiate our model
config = GPT2Config()
config.vocab_size = 55028
config.n_embed = 768
config.n_layer = 12
config.n_head = 12
our_model = GPT2LMHeadModel(config)
our_weights = dict(our_model.named_parameters())

print(f"Before: {our_weights['transformer.wte.weight']}\n")

# print(our_weights.keys())

# print(amt_weights.keys())

# copy the actual weights over
for key in amt_weights.keys():
  our_weights[f"transformer.{key}"].data.copy_(amt_weights[key].data)


# they are different now!
print(f"After: {our_weights['transformer.wte.weight']}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/512M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Before: Parameter containing:
tensor([[-0.0259, -0.0038,  0.0115,  ..., -0.0002,  0.0036, -0.0189],
        [-0.0097, -0.0169,  0.0064,  ..., -0.0382, -0.0258,  0.0294],
        [-0.0462,  0.0032, -0.0163,  ...,  0.0010,  0.0524, -0.0120],
        ...,
        [ 0.0175, -0.0365, -0.0118,  ..., -0.0054, -0.0259, -0.0093],
        [-0.0039, -0.0264,  0.0039,  ...,  0.0173,  0.0268,  0.0194],
        [-0.0123,  0.0063, -0.0222,  ..., -0.0112,  0.0178,  0.0084]],
       requires_grad=True)

After: Parameter containing:
tensor([[-3.4468e-02,  4.9068e-02, -1.7617e-02,  ..., -1.2157e-03,
          3.1622e-02, -2.9189e-02],
        [ 1.3305e-02,  1.2417e-02,  2.5303e-05,  ...,  9.4436e-03,
         -6.4281e-03, -4.0270e-02],
        [ 1.8564e-02,  1.6290e-02,  1.2407e-02,  ...,  1.4276e-02,
          1.4714e-02, -2.8193e-02],
        ...,
        [-3.8858e-02, -1.3365e-01,  8.0291e-02,  ...,  1.1940e-02,
          2.4604e-02,  6.1659e-02],
        [ 6.9304e-04, -4.8022e-03, -8.4192e-04,  ..., 

In [4]:
"""
Train the now loaded model.
"""

# (0) libraries
from transformers import Trainer, TrainingArguments, GPT2TokenizerFast
from datasets import Dataset
from tqdm import tqdm


# (1) need to load the dataset. CURRENTLY ON wiki DATA
print("loading data...")

# train
train_dataset = []
with open('drive/MyDrive/224N/train.txt', 'r') as trainfile:
    lines = trainfile.readlines()
    for l in lines:
        train_dataset.append({
            "input_ids": [int(tok) for tok in l.strip().split()],
            "token_type_ids": [0 for i in range(1024)]
        })
train_dataset = Dataset.from_list(train_dataset)
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids'])
print(train_dataset)
print(train_dataset[0])



loading data...
Dataset({
    features: ['input_ids', 'token_type_ids'],
    num_rows: 184341
})
{'input_ids': tensor([55029,     0, 10001,  ...,  1298, 10010, 16032]), 'token_type_ids': tensor([0, 0, 0,  ..., 0, 0, 0])}


In [5]:
# valid
val_dataset = []
with open('drive/MyDrive/224N/valid.txt', 'r') as valfile:
    lines = valfile.readlines()
    for l in lines:
        val_dataset.append({
            "input_ids": [int(tok) for tok in l.strip().split()],
            "token_type_ids": [0 for i in range(1024)]
        })
val_dataset = Dataset.from_list(val_dataset)
val_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids'])
print(val_dataset)
print(val_dataset[0])

Dataset({
    features: ['input_ids', 'token_type_ids'],
    num_rows: 15608
})
{'input_ids': tensor([55029, 55025, 55025,  ...,  2855, 10010, 17204]), 'token_type_ids': tensor([0, 0, 0,  ..., 0, 0, 0])}


In [6]:
# instantiate the tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
special_tokens = {
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
    "pad_token": "[PAD]"
}
tokenizer.add_special_tokens(special_tokens)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

3

In [7]:
# (2) change the vocab size (anticipation vocab size + number of kmeans clusters)
our_model.resize_token_embeddings(55028 + 128)

# (3) set up training
output_path = "drive/MyDrive/224N/wiki_KM_smallAMT100k_finetune_model_saved"
steps = 2000
train_config = {"output_dir": output_path,
          "num_train_epochs": 1,  # possibly change this later
          "per_device_train_batch_size": 2,  # used to be just 1
          "per_device_eval_batch_size": 1,
          "evaluation_strategy": "steps",
          "save_strategy": "steps",
          "eval_steps": steps * 20,
          "logging_steps":steps,
          "logging_first_step": True,
          "save_total_limit": 2,
          "save_steps": steps * 20,
          "lr_scheduler_type": "cosine",
          "learning_rate": 3e-5,  # used to be 5e-4
          "warmup_ratio": 0.01,
          "weight_decay": 0.01,
          "seed": 1,
          "load_best_model_at_end": True,
          "label_names": ["input_ids"]
          }

train_args = TrainingArguments(**train_config)

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# set up trainer
trainer = Trainer(
    args=train_args,
    model=our_model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("training!")

trainer.train(resume_from_checkpoint=False)

trainer.save_model(output_path)


training!


Step,Training Loss,Validation Loss
40000,0.8623,0.861623
80000,0.8156,0.833567


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
