In [2]:
# Hugging Face Hub repository id of the checkpoint. The same id is passed to
# both the tokenizer and the model `from_pretrained` calls further down.
hf_tag = "ethzanalytics/gpt-j-6B-8bit-sharded" #@param {type:"string"}

In [3]:
#@markdown setup logging
import logging
from pathlib import Path
# The log file is written to the current working directory.
das_logfile = Path.cwd() / "generate.log"

# `force=True` makes basicConfig remove *and close* every handler already
# attached to the root logger before installing the new one. This keeps the
# cell safe to re-run: the FileHandler from an earlier run is closed instead
# of being detached with its file handle still open.
logging.basicConfig(
    level=logging.INFO,
    filename=das_logfile,
    filemode="w",  # start a fresh log on every (re-)run of this cell
    format="%(asctime)s %(levelname)s %(message)s",
    # 24-hour clock so timestamps are unambiguous without an AM/PM marker.
    datefmt="%m/%d/%Y %H:%M:%S",
    force=True,
)

In [4]:
#@markdown check system stats (RAM and CPU count)
from psutil import virtual_memory
import os
# Total physical memory converted from bytes to GiB, kept to one decimal place.
ram_gb = round(virtual_memory().total / (1024**3), 1)
# Build the summary once so the console and the log file receive identical text.
stats_msg = f'Runtime has {ram_gb} gigs of memory and {os.cpu_count()} processors'
print(stats_msg)
logging.info(stats_msg)

Runtime has 15.6 gigs of memory and 2 processors


In [5]:
!pip install transformers -q
!pip install accelerate -q
!pip install bitsandbytes -q

[0m

In [6]:
import transformers
import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd

from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /opt/conda/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)


In [7]:
#@markdown **define frozen 8-bit (bitsandbytes) layer classes and conversion helpers**
import gc

class FrozenBNBLinear(nn.Module):
    """Linear layer whose weight is held as a frozen, blockwise-quantized buffer.

    The quantized weight and its quantization state (`absmax`, `code`) are
    registered as buffers, so they are saved/loaded with the state dict but
    are never trained. An optional trainable `adapter` module can be attached
    after construction; its output is added to the layer output.

    Args:
        weight: quantized weight of shape ``[out_features, in_features]``.
        absmax: blockwise scaling statistics passed to ``dequantize_blockwise``.
        code: quantization code book passed to ``dequantize_blockwise``.
        bias: optional ``nn.Parameter`` bias, or ``None``.
    """

    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        # Buffers (not parameters): frozen, but still part of the state dict.
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias

    def forward(self, input):
        """Apply the dequantized linear map, plus the adapter output if one is set."""
        output = DequantizeAndLinear.apply(
            input, self.weight, self.absmax, self.code, self.bias
        )
        # Compare against None explicitly: the truth value of a module that
        # defines __len__ (e.g. a container) depends on its length, which must
        # not decide whether the adapter is applied.
        if self.adapter is not None:
            output += self.adapter(input)
        return output

    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        """Quantize the weight of an existing ``nn.Linear`` and reuse its bias."""
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"


class DequantizeAndLinear(torch.autograd.Function):
    """Autograd function: dequantize a frozen weight on the fly, then apply ``F.linear``.

    Gradients flow to ``input`` and ``bias`` only; the quantized weight and its
    quantization state are treated as constants.
    """

    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        input: torch.Tensor,
        weights_quantized: torch.ByteTensor,
        absmax: torch.FloatTensor,
        code: torch.FloatTensor,
        bias: torch.FloatTensor,
    ):
        """Return ``F.linear(input, dequantized_weight, bias)``."""
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        # Backward only needs the quantization state to rebuild the weight; the
        # weight itself is frozen, so the input activation is not needed and is
        # deliberately not kept alive.
        ctx.save_for_backward(weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        """Return gradients for ``(input, weights_quantized, absmax, code, bias)``.

        The three quantization tensors always receive ``None``.
        """
        assert (
            not ctx.needs_input_grad[1]
            and not ctx.needs_input_grad[2]
            and not ctx.needs_input_grad[3]
        )
        weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        grad_input = None
        if ctx.needs_input_grad[0]:
            # Dequantize only when the input gradient is actually requested.
            weights_deq = dequantize_blockwise(
                weights_quantized, absmax=absmax, code=code
            )
            grad_input = grad_output @ weights_deq
        grad_bias = None
        if ctx._has_bias and ctx.needs_input_grad[4]:
            # Collapse every leading (batch) dimension; reshape also handles a
            # 1-D grad_output produced by an unbatched input.
            grad_bias = grad_output.reshape(-1, grad_output.shape[-1]).sum(dim=0)
        return grad_input, None, None, None, grad_bias


class FrozenBNBEmbedding(nn.Module):
    """Embedding table held as a frozen, blockwise-quantized buffer.

    The quantized table and its quantization state (`absmax`, `code`) are
    registered as buffers. An optional trainable `adapter` module can be
    attached after construction; its output is added to the embedding output.

    Args:
        weight: quantized table of shape ``[num_embeddings, embedding_dim]``.
        absmax: blockwise scaling statistics passed to ``dequantize_blockwise``.
        code: quantization code book passed to ``dequantize_blockwise``.
    """

    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None

    def forward(self, input, **kwargs):
        """Look up ``input`` indices in the dequantized table.

        Extra keyword arguments are forwarded to ``F.embedding``.
        """
        with torch.no_grad():
            # note: both quantized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(
                self.weight, absmax=self.absmax, code=self.code
            )
            output = F.embedding(input, weight_deq, **kwargs)
        # Compare against None explicitly: the truth value of a module that
        # defines __len__ (e.g. a container) depends on its length, which must
        # not decide whether the adapter is applied.
        if self.adapter is not None:
            output += self.adapter(input)
        return output

    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        """Quantize the weight of an existing ``nn.Embedding``."""
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"


def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2**20):
    """Quantize ``matrix`` with ``quantize_blockwise`` one chunk at a time.

    The flattened matrix is processed in slices of ``chunk_size`` elements so
    that only one cloned slice is handed to the quantizer at once. The code
    book returned for the first chunk is passed back in for every later chunk.

    Args:
        matrix: tensor to quantize; must support ``view(-1)``.
        chunk_size: elements per chunk; must be a multiple of 4096.

    Returns:
        ``(matrix_i8, (absmax, code))`` where ``matrix_i8`` has the shape of
        ``matrix`` and ``absmax`` is the concatenation of the per-chunk values.
    """
    assert chunk_size % 4096 == 0
    flat = matrix.view(-1)
    num_chunks = (matrix.numel() - 1) // chunk_size + 1

    code = None
    quantized_parts, absmax_parts = [], []
    for start in range(0, num_chunks * chunk_size, chunk_size):
        piece = flat[start : start + chunk_size].clone()
        quantized_piece, (absmax_piece, code) = quantize_blockwise(piece, code=code)
        quantized_parts.append(quantized_piece)
        absmax_parts.append(absmax_piece)

    matrix_i8 = torch.cat(quantized_parts).reshape_as(matrix)
    return matrix_i8, (torch.cat(absmax_parts), code)


def convert_to_int8(model):
    """Swap every ``nn.Linear`` / ``nn.Embedding`` inside ``model`` for a frozen 8-bit stand-in.

    The replacement modules are created with zero-filled buffers of the right
    shapes (uint8 weight, one ``absmax`` entry per 4096 weight elements, a
    256-entry ``code``); no quantization of the existing weights happens here.
    Linear layers keep their original bias object. The model is modified in
    place and nothing is returned.
    """
    for parent in list(model.modules()):
        for child_name, layer in parent.named_children():
            gc.collect()
            if isinstance(layer, nn.Linear):
                print(child_name, layer)
                placeholder = FrozenBNBLinear(
                    weight=torch.zeros(
                        layer.out_features, layer.in_features, dtype=torch.uint8
                    ),
                    absmax=torch.zeros((layer.weight.numel() - 1) // 4096 + 1),
                    code=torch.zeros(256),
                    bias=layer.bias,
                )
            elif isinstance(layer, nn.Embedding):
                placeholder = FrozenBNBEmbedding(
                    weight=torch.zeros(
                        layer.num_embeddings, layer.embedding_dim, dtype=torch.uint8
                    ),
                    absmax=torch.zeros((layer.weight.numel() - 1) // 4096 + 1),
                    code=torch.zeros(256),
                )
            else:
                # Any other module type is left untouched.
                continue
            setattr(parent, child_name, placeholder)


In [8]:
import transformers
#@markdown **define GPT-J block/model subclasses that use the 8-bit layers**
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    """GPT-J block whose attention and MLP sub-modules are converted to 8-bit stand-ins.

    The stock block is built first; `convert_to_int8` then replaces the
    `nn.Linear` / `nn.Embedding` children of `self.attn` and `self.mlp` with
    zero-filled frozen 8-bit modules.
    """

    def __init__(self, config):
        super().__init__(config)

        # Conversion happens in place on the already-constructed sub-modules.
        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    """GPT-J base model that runs `convert_to_int8` over all of its sub-modules.

    NOTE(review): only `GPTJBlock` is assigned back into `modeling_gptj` in this
    cell, so this subclass appears to take effect only when instantiated
    directly. Confirm whether that is intended.
    """

    def __init__(self, config):
        super().__init__(config)
        # Walks every sub-module and replaces Linear/Embedding layers in place.
        convert_to_int8(self)
        

class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    """GPT-J causal-LM model whose Linear/Embedding layers are replaced by 8-bit stand-ins.

    After the stock constructor runs, `convert_to_int8` is applied to the whole
    model, so any remaining `nn.Linear` / `nn.Embedding` sub-modules are
    replaced with zero-filled frozen 8-bit modules.
    """

    def __init__(self, config):
        super().__init__(config)
        # Converts whatever Linear/Embedding modules are still present.
        convert_to_int8(self)


# Monkey-patch: rebind the library module's `GPTJBlock` name to the 8-bit
# subclass defined above, so code that looks the class up through
# `modeling_gptj` after this point gets the converted block.
transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock



In [9]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the checkpoint named by `hf_tag`.
tokenizer = AutoTokenizer.from_pretrained(hf_tag)

Downloading (…)okenizer_config.json:   0%|          | 0.00/763 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [10]:
# Build the 8-bit GPT-J subclass from the checkpoint named by `hf_tag`;
# `low_cpu_mem_usage=True` is forwarded to `from_pretrained` unchanged.
model = GPTJForCausalLM.from_pretrained(hf_tag, low_cpu_mem_usage=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/82.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00007.bin:   0%|          | 0.00/968M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00007.bin:   0%|          | 0.00/968M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00007.bin:   0%|          | 0.00/984M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00007.bin:   0%|          | 0.00/946M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00007.bin:   0%|          | 0.00/968M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00007.bin:   0%|          | 0.00/984M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00007.bin:   0%|          | 0.00/394M [00:00<?, ?B/s]

k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, 

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Some weights of the model checkpoint at ethzanalytics/gpt-j-6B-8bit-sharded were not used when initializing GPTJForCausalLM: ['transformer.h.4.mlp.fc_out.adapter.0.weight', 'transformer.h.13.attn.k_proj.adapter.2.weight', 'transformer.h.5.mlp.fc_in.adapter.2.weight', 'transformer.h.0.attn.q_proj.adapter.2.weight', 'transformer.h.2.attn.v_proj.adapter.2.weight', 'transformer.h.6.attn.out_proj.adapter.2.weight', 'transformer.h.25.attn.out_proj.adapter.2.weight', 'transformer.h.12.mlp.fc_out.adapter.2.weight', 'transformer.h.13.attn.out_proj.adapter.2.weight', 'transformer.h.25.attn.q_proj.adapter.0.weight', 'transformer.h.26.attn.v_proj.adapter.2.weight', 'transformer.h.27.attn.q_proj.adapter.0.weight', 'transformer.h.18.attn.v_proj.adapter.0.weight', 'transformer.h.23.attn.v_proj.adapter.2.weight', 'transformer.h.16.attn.k_proj.adapter.2.weight', 'transformer.h.10.attn.q_proj.adapter.0.weight', 'transformer.h.3.attn.out_proj.adapter.2.weight', 'transformer.h.3.attn.v_proj.adapter.0.weig

In [12]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters and buffers to the selected device.
model = model.to(device)

In [34]:
from transformers import pipeline
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# The pipeline is given device index 0 when CUDA is available and -1 otherwise.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1 if not torch.cuda.is_available() else 0,
)


In [50]:
import pprint as pp

# @markdown define `generate_text(prompt: str, ...)`
# @markdown - feel free to adjust textgen params for different results

def generate_text(
    prompt: str,
    temperature=0.7,
    top_k: int = 50,
    top_p=0.95,
    min_length: int = 16,
    max_length: int = 256,
    return_full_text=False,
    **kwargs,
) -> str:
    """Sample one completion for ``prompt`` from the module-level ``generator`` pipeline.

    The prompt is echoed to stdout before generation and the completion is
    pretty-printed afterwards.

    Args:
        prompt: text the model should continue.
        temperature: sampling temperature forwarded to the pipeline.
        top_k: top-k sampling cutoff forwarded to the pipeline.
        top_p: nucleus sampling threshold forwarded to the pipeline.
        min_length: forwarded to the pipeline as ``min_length``.
        max_length: forwarded to the pipeline as ``max_length``.
        return_full_text: forwarded to the pipeline as ``return_full_text``.
        **kwargs: any further generation arguments, passed through unchanged.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """

    print(f"generating results for input:\n\t{prompt}\n\t...")
    result = generator(
        prompt,
        min_length=min_length,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        remove_invalid_values=True,
        clean_up_tokenization_spaces=True,
        do_sample=True,
        return_full_text=return_full_text,
        # Use the end-of-sequence token id as the padding token id.
        pad_token_id=generator.tokenizer.eos_token_id,
        **kwargs,
    )

    # Only the first returned sequence is used.
    output = result[0]["generated_text"]
    pp.pprint(output)

    return output


In [47]:
%%time
# One-shot prompt: a worked example (question followed by its labelled
# entities), then a second question for the model to label the same way.
prompt = "extract entities from this quesion :\n What Televisions options are available Zaghouan in Gabs Sud with a size of 55 and brand Apple?\n product category = Televisions \n region = Zaghouan \n city =gabs Sud \n size = 55\n product name = Apple\n extract entities from this quesion : Can you recommend a Dell smartphone with a size of 55 located in Tunis in ennasr? " #@param {type:"string"}
result = generate_text(prompt)

generating results for input:
	extract entities from this quesion :
 What Televisions options are available Zaghouan in Gabs Sud with a size of 55 and brand Apple?
 product category = Televisions 
 region = Zaghouan 
 city =gabs Sud 
 size = 55
 product name = Apple
 extract entities from this quesion : Can you recommend a Dell smartphone with a size of 55 located in Tunis in ennasr? 
	...
('\n'
 ' product category = Smartphones\n'
 ' region = Tunis \n'
 ' city = ennasr\n'
 ' size = 55\n'
 ' product name = Dell\n'
 '\n'
 'How to extract entities from')
CPU times: user 12.6 s, sys: 10.6 ms, total: 12.6 s
Wall time: 12.6 s


In [44]:
%%time
# Zero-shot prompt: names the entities to extract but gives no worked example.
prompt = "extract query entities from this question like the product name , size , city , region and the product name :\n What Televisions options are available Zaghouan in Gabs Sud with a size of 55 and brand Apple?"
result = generate_text(prompt)

generating results for input:
	extract query entities from this question like the product name , size , city , region and the product name :
 What Televisions options are available Zaghouan in Gabs Sud with a size of 55 and brand Apple?
	...
('\n'
 '\n'
 "I'm trying to make a query which can extract the product name, size, city, "
 'region and the product name :\n'
 'What Televisions options are available Zaghouan in Gabs Sud with a size of '
 '55 and brand Apple?\n'
 '\n'
 'A:\n'
 '\n'
 'Try this:\n'
 'SELECT `product_name`, `size`, `city`, `region`, `product_name`')
CPU times: user 30.2 s, sys: 17.6 ms, total: 30.2 s
Wall time: 30.2 s


In [56]:
%%time
# Zero-shot prompt phrased as an explicit instruction (entities to extract,
# answer as a list) followed by a separate "Question:" line.
prompt = "Please extract the following entities from the given question: product name, size, city, and region , put them into a list .\nQuestion: What are the available options for Televisions in Zaghouan, Gabs Sud, with a size of 55 and brand Apple?"
result = generate_text(prompt)

generating results for input:
	Please extract the following entities from the given question: product name, size, city, and region , put them into a list .
Question: What are the available options for Televisions in Zaghouan, Gabs Sud, with a size of 55 and brand Apple?
	...
('\n'
 '\n'
 'A:\n'
 '\n'
 'Product Name:\n'
 'Televisions\n'
 'Size:\n'
 '55\n'
 'Brand:\n'
 'Apple\n'
 'City:\n'
 'Zaghouan, Gabs Sud\n'
 'Region:\n'
 '\n'
 'If the question is asking for the available options with the brand apple for '
 'size 55 then the options are:\n'
 'Televisions:\n'
 '1) Samsung\n'
 '2) LG\n'
 '3) Sony\n'
 '4) Panasonic\n'
 '\n')
CPU times: user 31.3 s, sys: 24.7 ms, total: 31.3 s
Wall time: 31.4 s
