In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import json
from tqdm.auto import tqdm
import random
import transformers

import os
import sys
sys.path.append('..')

from utils import model_utils
from baukit import nethook
from operator import itemgetter

In [3]:
# Hugging Face checkpoint id to load. Alternatives noted by the original author:
# gpt2-{medium,large,xl} or EleutherAI/gpt-j-6B.
MODEL_NAME = "facebook/galactica-6.7b"

# Load weights in half precision; `low_cpu_mem_usage` is forwarded to the loader.
mt = model_utils.ModelAndTokenizer(MODEL_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float16)

model = mt.model
tokenizer = mt.tokenizer

# Register a pad token so batched (padded) tokenization works.
# `add_special_tokens` returns how many tokens were newly added to the vocabulary.
num_added_tokens = tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# If '[PAD]' was not already in the vocabulary, its id lies past the end of the
# embedding matrix; grow the embeddings so padded batches cannot index out of range.
# When the token already exists (num_added_tokens == 0) this is a no-op.
if num_added_tokens > 0 and len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

print(f"{MODEL_NAME} ==> device: {model.device}, memory: {model.get_memory_footprint()}")

facebook/galactica-6.7b ==> device: cuda:0, memory: 13314719744


In [4]:
# Display the tokenizer's repr to inspect its configuration
# (vocab size, padding/truncation side, registered special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='facebook/galactica-6.7b', vocab_size=50000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'pad_token': '[PAD]'})

In [5]:
# Disabled snippet: would write the model config and the state dict to a local
# directory. Kept for reference only; nothing in this cell executes.
# path = "weights/galactica-6.7b"
# model.config.save_pretrained(path)
# torch.save(model.state_dict(), f"{path}/pytorch_model.bin")

In [6]:
# Single-prompt sanity check through Hugging Face `generate`.
# NOTE: the prompt is a plain string; a trailing comma here would silently turn
# it into a 1-tuple (which the tokenizer treats as a batch of one).
input_text = "Facebook is a"

# Move the whole encoding to the model's device so ids and mask stay together.
tokenized = tokenizer(input_text, return_tensors="pt").to(model.device)

# Pass the attention mask explicitly: a pad token is configured on the tokenizer,
# so `generate` cannot reliably infer the mask from the input ids alone.
outputs = model.generate(
    input_ids=tokenized.input_ids,
    attention_mask=tokenized.attention_mask,
    max_new_tokens=30,
    top_k=1,
)
print(tokenizer.decode(outputs[0]))

Facebook is a social networking site that allows users to create profiles, connect with other users, and share information. Facebook has been used by millions of people around the world


In [7]:
# Disabled snippet: a direct forward pass of the model on the tokenized prompt
# (input ids plus attention mask). Kept for reference only; nothing executes.
# model(
#     input_ids = tokenized.input_ids.to(model.device),
#     attention_mask = tokenized.attention_mask.to(model.device)
# )

In [8]:
# Two prompts of different lengths, tokenized together as one padded batch
# and moved to the model's device.
input_text = ["The Transformer architecture [START_REF]", "Facebook is a"]

tokenized = tokenizer(input_text, padding=True, return_tensors="pt")
tokenized = tokenized.to(model.device)

In [9]:
# Run the project's `generate_fast` helper on the batched prompts.
# Presumably `argmax_greedy=True` selects the most likely token at each step,
# in which case `top_k` has little effect -- verify against `model_utils`.
generation_options = dict(
    argmax_greedy=True,
    # debug=True,
    max_new_tokens=30,
    top_k=5,
)
txt, ret_dict = model_utils.generate_fast(
    model=model,
    tok=tokenizer,
    prompts=input_text,
    **generation_options,
)

# Show the generated continuations as the cell output.
txt

['The Transformer architecture [START_REF] Attention is All you Need, Vaswani[END_REF] is a sequence-to-sequence model that uses self-attention to capture long-range dependencies between input',
 'Facebook is a social networking site that allows users to create profiles, connect with other users, and share information. Facebook has been used by millions of people around the world.']

In [23]:
# Few-shot probes of character-level knowledge. Only the last assignment is
# active; the commented prompts are alternatives that can be swapped in.
# prompt = "apple starts with A, blueberry starts with B, yonatan starts with"
# prompt = "apple ends with E, blueberry ends with Y, computer ends with <work>"
# prompt = "Car 1 speed is 30km/h and Car 2 speed is 50km/h. Which car travels faster and how much? <work>"
# prompt = "123 comes before 124, 4322 comes before 4323, 324 comes before"
prompt = "apple is spelled A-P-P-L-E, school is spelled S-C-H-O-O-L, terminator is spelled"

# Single-prompt batch through the project's `generate_fast` helper.
generation_result = model_utils.generate_fast(
    model=model,
    tok=tokenizer,
    prompts=[prompt],
    argmax_greedy=True,
    # debug=True,
    max_new_tokens=20,
    top_k=5,
)
txt, ret_dict = generation_result

# Show the generated continuation as the cell output.
txt


['apple is spelled A-P-P-L-E, school is spelled S-C-H-O-O-L, terminator is spelled T-E-R-M-O-N-I-T-E-R,']