##### Install requirements

In [None]:
!pip install transformers
!pip install xformers
!pip install huggingface_hub
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes

##### Log in to Hugging Face using your access token

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably this shows an interactive prompt for the access
# token mentioned in the section heading — confirm against the
# huggingface_hub documentation.
from huggingface_hub import notebook_login
notebook_login()

##### Prepare the model


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# Branch of the model repository to load. It is defined once and passed to
# both loaders so the weights and the tokenizer always come from the same
# revision.
# To use a different branch, change revision
# For example: revision = "gptq-4bit-32g-actorder_True"
revision = "main"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # Let the loader decide where to place the weights.
    device_map="auto",
    # Load with the TorchScript flag set; the model is passed to
    # torch.jit.trace later in this notebook.
    torchscript=True,
    # Do not run custom modelling code shipped in the model repository.
    trust_remote_code=False,
    revision=revision,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    revision=revision,
)

##### Verify inference

In [None]:
# Run a single sampled generation to check that the loaded model answers the
# prompt.
#
# The model was loaded with device_map="auto", so its weights have already
# been placed on a device by the loader. Calling model.cuda() on top of that
# is redundant and can conflict with that placement, so the inputs are moved
# to the model's device instead.
prompt = "Given a dictionary with 'context' and 'question', answer the 'question' based on 'context' ---- [Context: 'a boy name astitva loves to eat dark chocolate'; Question: Who loves to eat dark chocolate?]"
prompt_template=f'''<s>[INST] {prompt} [/INST]
'''

print("\n\n*** Generate:")

# `input_ids` is kept as a notebook-level variable: the tracing cell below
# reuses it as the example input.
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(model.device)
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

##### Trace the model to get a TorchScript executable that is optimized using just-in-time compilation

In [None]:
import torch

# Trace the model, using the token ids from the inference cell as the example
# input that drives the trace.
traced_model = torch.jit.trace(
    model,
    example_inputs=input_ids,
)

##### Optimize for mobile and save as a `.ptl` file

In [None]:
from torch.utils.mobile_optimizer import optimize_for_mobile

# Run the mobile optimizer over the traced module, then write the result to
# disk for the lite interpreter.
optimized_traced_model = optimize_for_mobile(script_module=traced_model)
optimized_traced_model._save_for_lite_interpreter(
    "mistral7B_quantized.ptl"
)