In [None]:
# Step 1: Install Required Libraries
!pip install -qU transformers accelerate bitsandbytes trl peft datasets auto-gptq optimum streamlit streamlit-chat ngrok

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments, GenerationConfig
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

# Step 2: Choose the base checkpoint and fetch the fine-tuning corpus
base_model = "TheBloke/zephyr-7B-beta-GPTQ"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

# Only the 'train' split is requested; the evaluation set is carved out of it below.
dataset = load_dataset(dataset_name, split="train")

# Convert to a pandas DataFrame so scikit-learn can do the 80/20 split (fixed seed).
df_train = dataset.to_pandas()
train_data, eval_data = train_test_split(
    df_train,
    random_state=42,
    test_size=0.2,
)

# Preprocess dataset
def data_preprocessing(text):
    """Render one dataset row as a single chat-formatted training string.

    Args:
        text: Mapping with string values under the "instruction" and
            "response" keys.

    Returns:
        A dict with one key, "text", holding the system turn, the user turn
        (the instruction) and the assistant turn (the response) concatenated.
    """
    system_turn = "<|system|>\nYou are a support chatbot who helps with user queries. Chatbot responds professionally.\n"
    user_turn = "<|user|>\n" + text["instruction"] + "\n"
    assistant_turn = "<|assistant|>\n" + text["response"]
    return {"text": system_turn + user_turn + assistant_turn}

# Build the Hugging Face datasets used for training and evaluation.
# preserve_index=False stops the shuffled pandas index from being exported as a
# stray '__index_level_0__' column next to the real features.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False).map(data_preprocessing)
eval_dataset = Dataset.from_pandas(eval_data, preserve_index=False).map(data_preprocessing)

# Step 3: Load Tokenizer and describe how the GPTQ checkpoint is loaded
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.add_eos_token = True

# Despite its name this is a GPTQ config, not a bitsandbytes one; the name is
# kept because the model-loading cell passes `bnb_config` as quantization_config.
#
# - use_exllama=False: the model is about to be fine-tuned with PEFT, and the
#   Transformers GPTQ guide recommends deactivating the ExLlama kernels for that.
# - device_map / use_cache / lora_r / lora_alpha were removed: they are not GPTQ
#   settings. device_map is passed to from_pretrained, use_cache is set on
#   model.config, and the LoRA rank/alpha are set in LoraConfig.
bnb_config = GPTQConfig(
    bits=4,
    use_exllama=False,
    tokenizer=tokenizer
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/21497 [00:00<?, ? examples/s]

Map:   0%|          | 0/5375 [00:00<?, ? examples/s]

In [None]:
!pip install -qU optimum auto-gptq



In [None]:
# Install required libraries
#!pip install -q optimum auto-gptq

# Import the necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

# Define the base model
base_model = "TheBloke/zephyr-7B-beta-GPTQ"

# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# GPTQ loading options for a model that will be fine-tuned.
# - use_exllama=False: the Transformers GPTQ guide recommends deactivating the
#   ExLlama kernels when fine-tuning a quantized model with PEFT.
# - device_map / use_cache / lora_r / lora_alpha were removed: they are not GPTQ
#   settings (device_map goes to from_pretrained, use_cache to model.config,
#   LoRA rank/alpha to LoraConfig).
# The variable keeps the name `bnb_config` because the model-loading cell uses it.
bnb_config = GPTQConfig(
    bits=4,
    use_exllama=False,
    tokenizer=tokenizer
)



In [None]:
# Load the GPTQ-quantized base checkpoint with the quantization settings built above.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

# Training preparation (use_cache off, pretraining_tp, gradient checkpointing,
# prepare_model_for_kbit_training) is performed in the following cell.


Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/zephyr-7B-beta-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Get tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.add_eos_token = True
# The bare `tokenizer.add_bos_token, tokenizer.add_eos_token` expression that used
# to follow was dropped: in the middle of a cell it is neither displayed nor stored.

# Import prepare_model_for_kbit_training
from peft import prepare_model_for_kbit_training

# Put the quantized model into a trainable configuration.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-05)
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)


In [None]:
# Step 4: Add LoRA Adapter
from peft import LoraConfig, get_peft_model

# Adapter definition: rank-16 LoRA on the q_proj / v_proj modules only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapter.
model = get_peft_model(model, peft_config)

# Step 5: Constants read by the training-arguments cell
OUTPUT_DIR = '/content/zephyr-finetuning'
OPTIMIZER = "paged_adamw_32bit"



In [None]:
# Display the processed training split to check its columns and row count.
train_dataset

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', '__index_level_0__', 'text'],
    num_rows: 21497
})

# Training the Model
It’s important to set the correct hyperparameters in the training arguments. We then use Hugging Face’s TRL library to build the SFT Trainer from the necessary components: the model, the training and evaluation datasets, the LoRA configuration, the tokenizer, and the training arguments.

Finally, we start training.

In [None]:
import os
# NOTE(review): the value "false" presumably leaves Weights & Biases enabled; the
# training cell requests report_to="wandb" and then removes WandbCallback by
# hand. Confirm whether logging to W&B is wanted at all.
os.environ["WANDB_DISABLED"] = "false"

In [None]:
# Import TrainingArguments
from transformers import TrainingArguments
# Make sure to import SFTTrainer here as well
from trl import SFTTrainer

# Hyperparameters for the short fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim=OPTIMIZER,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=1,  # overridden by max_steps below (the Trainer logs this)
    max_steps=100,
    fp16=True,
    evaluation_strategy="steps",
    eval_steps=50,
    push_to_hub=False,
    # No experiment tracker. Previously this was "wandb" followed by a manual
    # trainer.remove_callback(WandbCallback); "none" gets the same result
    # without registering the callback in the first place.
    report_to="none"
)

# Supervised fine-tuning on the "text" column produced by data_preprocessing.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1024
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/21497 [00:00<?, ? examples/s]

Map:   0%|          | 0/5375 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,0.9821,0.767334
100,0.7437,0.728222


TrainOutput(global_step=100, training_loss=0.8629196548461914, metrics={'train_runtime': 4890.5779, 'train_samples_per_second': 0.164, 'train_steps_per_second': 0.02, 'total_flos': 206063851143168.0, 'train_loss': 0.8629196548461914, 'epoch': 0.03720238095238095})

In [None]:
# Destination directory for the fine-tuned artifacts (change to your desired path)
output_dir = "/content/drive/MyDrive/chatbot/zephyr-finetuned"

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")


Model saved to /content/drive/MyDrive/chatbot/zephyr-finetuned


In [None]:
# Read back what the save cell wrote to `output_dir`: tokenizer, then model.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir)

print("Model and tokenizer loaded successfully")


In [None]:
# Step 6: Set Up Streamlit for Inference
def input_data_preprocessing(example):
    """Build an inference prompt that ends with an open assistant turn.

    Args:
        example: Mapping with a string under the "instruction" key.

    Returns:
        The system turn, the user turn (the instruction) and an empty
        "<|assistant|>" header, concatenated into one string.
    """
    system_turn = "<|system|>\nYou are a support chatbot who helps with user queries. Chatbot responds professionally.\n"
    user_turn = "<|user|>\n" + example["instruction"] + "\n"
    return system_turn + user_turn + "<|assistant|>\n"


# Sample prompt built from a fixed example question.
input_string = input_data_preprocessing({"instruction": "I have a question about cancelling order {{Order Number}}"})

# Load the trained adapter checkpoint (and its tokenizer) for inference
checkpoint_dir = OUTPUT_DIR + "/checkpoint-100"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoPeftModelForCausalLM.from_pretrained(
    checkpoint_dir,
    device_map="cuda",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Sampling settings shared by every call to infer_bot; EOS doubles as the pad id.
generation_config = GenerationConfig(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.5,
    pad_token_id=tokenizer.eos_token_id,
)

# Step 7: Function to Infer and Generate Response
def infer_bot(prompt):
    """Generate the model's continuation for an already formatted prompt.

    Uses the module-level `tokenizer`, `model` and `generation_config`.

    Args:
        prompt: Full prompt string (e.g. the output of input_data_preprocessing).

    Returns:
        The decoded text of the newly generated tokens only.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, generation_config=generation_config)
    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than str.replace(prompt, ''), which silently leaves the
    # prompt in place whenever decoding does not reproduce it character for
    # character (for instance when special tokens are skipped).
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)




Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
Some weights of the model checkpoint at TheBloke/zephyr-7B-beta-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bi

In [None]:
# # Step 8: Set Up the Streamlit App
# import streamlit as st
# from streamlit_chat import message

# # Initialize session state for storing conversation history
# if 'user' not in st.session_state:
#     st.session_state['user'] = []
# if 'assistant' not in st.session_state:
#     st.session_state['assistant'] = []

# def display_conversation(history):
#     for i in range(len(history["assistant"])):
#         message(history["user"][i], is_user=True, key=str(i) + "_user")
#         message(history["assistant"][i], key=str(i))

# def main():
#     st.title("Zephyr Fine-Tuned Chatbot")
#     st.subheader("A bot created using Zephyr which was fine-tuned to be a support member.")

#     user_input = st.text_input("Enter your query")

#     if st.button("Answer"):
#         answer = infer_bot(user_input)
#         st.session_state["user"].append(user_input)
#         st.session_state["assistant"].append(answer)

#         if st.session_state["assistant"]:
#             display_conversation(st.session_state)

# if __name__ == "__main__":
#     main()


In [None]:
# # Step 9: Use ngrok to Expose the Streamlit App
# from pyngrok import ngrok

# # Open a port for the Streamlit app
# public_url = ngrok.connect(port='8501')
# print(f"Streamlit app is live at: {public_url}")

# # Run Streamlit app
# !streamlit run /content/app.py &>/content/logs.txt &

In [None]:
# #################

# #data preprocessing for the sample input data
# def input_data_preprocessing(example):

#     processed_example = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n" + example["instruction"] + "\n<|assistant|>\n"

#     return processed_example
# input_string = input_data_preprocessing(
#     {
#         "instruction": "i have a question about cancelling order {{Order Number}}",
#     }
# )


# tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-finetuning/checkpoint-100")


# model = AutoPeftModelForCausalLM.from_pretrained(
#     "/content/zephyr-finetuning/checkpoint-100",
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map="cuda")

# inputs = tokenizer(input_string, return_tensors="pt").to("cuda")

# generation_config = GenerationConfig(
#     do_sample=True,
#     top_k=1,
#     temperature=0.1,
#     max_new_tokens=256,
#     pad_token_id=tokenizer.eos_token_id
# )

# outputs = model.generate(**inputs, generation_config=generation_config)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 5.06 MiB is free. Process 33144 has 10.21 GiB memory in use. Process 395419 has 4.53 GiB memory in use. Of the allocated memory 10.07 GiB is allocated by PyTorch, and 12.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# import os
# os.kill(os.getpid(), 9)



In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the save cell earlier in the notebook writes under
# /content/drive/MyDrive; confirm this mount runs before that cell.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig


# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-finetuning/checkpoint-100")

# Apply these changes for memory management:
model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/zephyr-finetuning/checkpoint-100",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto"  # Changed to 'auto' to offload to CPU if needed
)

# This model is only used for generation. The former
# model.gradient_checkpointing_enable() call was removed because gradient
# checkpointing is a training-time technique; evaluation mode is set instead.
model.eval()

# ... (rest of your code)

# Function to preprocess the input question and generate a response
def get_answer(question):
    """Answer a single support question with the fine-tuned model.

    Uses the module-level `tokenizer` and `model`.

    Args:
        question: The user's question as plain text.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    # Prepare the input data in the correct format
    # NOTE(review): the system sentence here differs slightly from the one used
    # in data_preprocessing at training time; confirm which is intended.
    input_string = f"<|system|>\nYou are a support chatbot who helps with user queries, always responding professionally.\n<|user|>\n{question}\n<|assistant|>\n"

    # Tokenize and move the tensors to the model's device. The model is loaded
    # with device_map="auto", so the device is read from it, not hard-coded.
    inputs = tokenizer(input_string, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Configure the generation settings
    generation_config = GenerationConfig(
        do_sample=True,
        top_k=1,
        temperature=0.1,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )

    # Generate the output
    outputs = model.generate(**inputs, generation_config=generation_config)

    # Decode only the continuation. Slicing off the prompt tokens is more
    # reliable than str.replace(input_string, ''), which leaves the prompt in
    # place whenever decoding does not reproduce it character for character.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/zephyr-7B-beta-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.ml

In [None]:
# Example: Ask a question and get the answer
# Requires the cell that defines get_answer to have been run in this kernel
# session; otherwise this raises NameError, as in the recorded output.
question = "I have a question about cancelling order {{Order Number}}"
answer = get_answer(question)
print("Answer:", answer)

NameError: name 'get_answer' is not defined

In [None]:
# Do not re-run: zips the model files so they can be saved locally.

import locale
# Make locale.getpreferredencoding() report UTF-8.
# NOTE(review): presumably a workaround so that `!` shell commands work in this
# runtime; confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

# Both paths are relative to the current working directory; the archive stores
# entries under a top-level `zephyr-finetuning/` folder (see the listing below).
!zip -r zephyr-finetuning.zip zephyr-finetuning

  adding: zephyr-finetuning/ (stored 0%)
  adding: zephyr-finetuning/checkpoint-100/ (stored 0%)
  adding: zephyr-finetuning/checkpoint-100/scheduler.pt (deflated 57%)
  adding: zephyr-finetuning/checkpoint-100/rng_state.pth (deflated 25%)
  adding: zephyr-finetuning/checkpoint-100/optimizer.pt (deflated 8%)
  adding: zephyr-finetuning/checkpoint-100/README.md (deflated 66%)
  adding: zephyr-finetuning/checkpoint-100/adapter_model.safetensors (deflated 8%)
  adding: zephyr-finetuning/checkpoint-100/adapter_config.json (deflated 53%)
  adding: zephyr-finetuning/checkpoint-100/tokenizer.model (deflated 55%)
  adding: zephyr-finetuning/checkpoint-100/tokenizer.json (deflated 85%)
  adding: zephyr-finetuning/checkpoint-100/tokenizer_config.json (deflated 68%)
  adding: zephyr-finetuning/checkpoint-100/special_tokens_map.json (deflated 70%)
  adding: zephyr-finetuning/checkpoint-100/training_args.bin (deflated 51%)
  adding: zephyr-finetuning/checkpoint-100/trainer_state.json (deflated 64%)

In [None]:
%%writefile app.py
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch
import streamlit as st
from streamlit_chat import message

st.session_state.clicked=True

def process_data_sample(example):

    processed_example = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.</s>\n<|user|>\n" + example + "</s>\n<|assistant|>\n"

    return processed_example

@st.cache_resource(show_spinner=True)
def create_bot():

    tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-finetuning/checkpoint-100")

    model = AutoPeftModelForCausalLM.from_pretrained(
                                                        "/content/zephyr-finetuning/checkpoint-100",
                                                        low_cpu_mem_usage=True,
                                                        return_dict=True,
                                                        torch_dtype=torch.float16,
                                                        device_map="cuda"
                                                    )

    generation_config = GenerationConfig(
                                            do_sample=True,
                                            temperature=0.5,
                                            max_new_tokens=256,
                                            pad_token_id=tokenizer.eos_token_id
                                        )

    return model, tokenizer, generation_config

model, tokenizer, generation_config = create_bot()

bot = create_bot()

def infer_bot(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, generation_config=generation_config)
    out_str = tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, '')
    return out_str

def display_conversation(history):
    for i in range(len(history["assistant"])):
        message(history["user"][i], is_user=True, key=str(i) + "_user")
        message(history["assistant"][i],key=str(i))

def main():

    st.title("Zephyr fine-tuned chatbot ")
    st.subheader("A bot created using Zephyr which was finetuned to possess the capabilities to be a support member")

    user_input = st.text_input("Enter your query")

    if st.session_state.clicked:
        if st.button("Answer"):

            answer = infer_bot(user_input)
            st.session_state["user"].append(user_input)
            st.session_state["assistant"].append(answer)

            if st.session_state["assistant"]:
                display_conversation(st.session_state)

if __name__ == "__main__":
    main()

Writing app.py


In [None]:
# Download the cloudflared binary, make it executable, and start a background
# tunnel that forwards to http://localhost:8501 (the port used in the --url flag).
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64
!nohup /content/cloudflared-linux-amd64 tunnel --url http://localhost:8501 &

--2024-12-16 00:30:26--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2024.12.1/cloudflared-linux-amd64 [following]
--2024-12-16 00:30:27--  https://github.com/cloudflare/cloudflared/releases/download/2024.12.1/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/d519add1-9e9f-4539-86ac-a9e32a76cf28?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20241216%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241216T003027Z&X-Amz-Expires=300&X-Amz-Signature=ba55cab9fd6bd519536bb2fd1e98c3fa9133c126e29cd9bb48785f50efd6b4ff&X-Amz

In [None]:
# Print the first trycloudflare.com URL found in nohup.out (presumably where the
# un-redirected nohup command in the previous cell wrote its output).
!grep -o 'https://.*\.trycloudflare.com' nohup.out | head -n 1 | xargs -I {} echo "Your tunnel url {}"

Your tunnel url https://avi-suitable-theorem-drives.trycloudflare.com


In [None]:
# Start the Streamlit app in the background; stdout and stderr go to /content/logs.txt.
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
import torch
from transformers import AutoTokenizer, GenerationConfig
from peft import AutoPeftModelForCausalLM
import streamlit as st

# NOTE(review): this is a Streamlit script. Run as a notebook cell it loads the
# model inside the kernel; presumably it is meant to be written to app.py and
# started with `streamlit run`. Confirm.

# Ensure we're using GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Clear any cached memory
torch.cuda.empty_cache()

# main() appends to these chat-history lists, so they must exist first.
if 'user' not in st.session_state:
    st.session_state['user'] = []
if 'assistant' not in st.session_state:
    st.session_state['assistant'] = []


def process_data_sample(example):
    """Wrap a raw user query in the chat template used by the other app cells.

    Args:
        example: The user's query as plain text.

    Returns:
        The prompt string: system turn, user turn, and an open assistant turn.
    """
    processed_example = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.</s>\n<|user|>\n" + example + "</s>\n<|assistant|>\n"
    return processed_example


@st.cache_resource(show_spinner=True)
def create_bot():
    """Load the tokenizer, the fine-tuned model and the generation settings.

    Returns:
        Tuple of (model, tokenizer, generation_config).
    """
    tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-finetuning/checkpoint-100")

    # Load the model with memory optimizations
    model = AutoPeftModelForCausalLM.from_pretrained(
        "/content/zephyr-finetuning/checkpoint-100",
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map="auto"  # This offloads to CPU when needed
    )

    generation_config = GenerationConfig(
        do_sample=True,
        temperature=0.5,
        max_new_tokens=128,  # Reduce token generation for less memory usage
        pad_token_id=tokenizer.eos_token_id
    )

    return model, tokenizer, generation_config


model, tokenizer, generation_config = create_bot()


def infer_bot(prompt):
    """Generate the model's continuation for an already formatted prompt.

    Args:
        prompt: Full prompt string, e.g. the output of process_data_sample.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    with torch.no_grad():  # No gradients during inference
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        outputs = model.generate(**inputs, generation_config=generation_config)
        # Slice off the prompt tokens instead of str.replace(prompt, ''): the
        # decoded text drops special tokens such as `</s>`, so the textual
        # prompt would no longer match and would be left in the answer.
        out_str = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return out_str


def display_conversation(history):
    """Render every stored user/assistant exchange, oldest first."""
    for i in range(len(history["assistant"])):
        st.chat_message("user").markdown(history["user"][i])
        st.chat_message("assistant").markdown(history["assistant"][i])


def main():
    """Draw the page, answer the submitted query and show the chat history."""
    st.title("Zephyr fine-tuned chatbot")
    st.subheader("A bot created using Zephyr which was fine-tuned to possess the capabilities to be a support member")

    user_input = st.text_input("Enter your query")

    # Ignore clicks with an empty query; send the templated prompt, not raw text.
    if st.button("Answer") and user_input.strip():
        answer = infer_bot(process_data_sample(user_input))
        st.session_state["user"].append(user_input)
        st.session_state["assistant"].append(answer)

    # Show the stored history whenever there is any, not only right after a click.
    if st.session_state["assistant"]:
        display_conversation(st.session_state)


if __name__ == "__main__":
    main()




In [None]:
import torch
from transformers import AutoTokenizer, GenerationConfig
from peft import AutoPeftModelForCausalLM
import streamlit as st
from streamlit_chat import message

# NOTE(review): this is a Streamlit script. Run as a notebook cell it loads the
# model inside the kernel (the recorded output shows Streamlit's
# "streamlit run ..." hint); presumably it is meant to be written to app.py.

# Initialize session state keys if not present (done once, here; main() relies on it)
if 'user' not in st.session_state:
    st.session_state['user'] = []
if 'assistant' not in st.session_state:
    st.session_state['assistant'] = []

device = "cuda" if torch.cuda.is_available() else "cpu"

# Clear any cached memory
torch.cuda.empty_cache()


def process_data_sample(example):
    """Wrap a raw user query in the chat template.

    Args:
        example: The user's query as plain text.

    Returns:
        The prompt string: system turn, user turn, and an open assistant turn.
    """
    processed_example = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.</s>\n<|user|>\n" + example + "</s>\n<|assistant|>\n"
    return processed_example


@st.cache_resource(show_spinner=True)
def create_bot():
    """Load the tokenizer, the fine-tuned model and the generation settings.

    Returns:
        Tuple of (model, tokenizer, generation_config).
    """
    tokenizer = AutoTokenizer.from_pretrained("/content/zephyr-finetuning/checkpoint-100")

    # Load the model with memory optimizations
    model = AutoPeftModelForCausalLM.from_pretrained(
        "/content/zephyr-finetuning/checkpoint-100",
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map="auto"  # This offloads to CPU when needed
    )

    generation_config = GenerationConfig(
        do_sample=True,
        temperature=0.5,
        max_new_tokens=128,  # Reduce token generation for less memory usage
        pad_token_id=tokenizer.eos_token_id
    )

    return model, tokenizer, generation_config


model, tokenizer, generation_config = create_bot()


def infer_bot(prompt):
    """Generate the model's continuation for an already formatted prompt.

    Args:
        prompt: Full prompt string, e.g. the output of process_data_sample.

    Returns:
        The decoded text of the newly generated tokens only.
    """
    with torch.no_grad():  # No gradients during inference
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        outputs = model.generate(**inputs, generation_config=generation_config)
        # Slice off the prompt tokens instead of str.replace(prompt, ''): the
        # decoded text drops special tokens such as `</s>`, so the textual
        # prompt would no longer match and would be left in the answer.
        out_str = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return out_str


def display_conversation(history):
    """Render every stored user/assistant exchange, oldest first."""
    for i in range(len(history["assistant"])):
        st.chat_message("user").markdown(history["user"][i])
        st.chat_message("assistant").markdown(history["assistant"][i])


def main():
    """Draw the page, answer the submitted query and show the chat history."""
    st.title("Zephyr fine-tuned chatbot")
    st.subheader("A bot created using Zephyr which was fine-tuned to possess the capabilities to be a support member")

    user_input = st.text_input("Enter your query")

    # Ignore clicks with an empty query. process_data_sample was previously
    # defined but never called, so the model received the raw text.
    if st.button("Answer") and user_input.strip():
        answer = infer_bot(process_data_sample(user_input))

        # Append user input and assistant response to session state
        st.session_state["user"].append(user_input)
        st.session_state["assistant"].append(answer)

    # Display the conversation history whenever there is any
    if st.session_state["assistant"]:
        display_conversation(st.session_state)


if __name__ == "__main__":
    main()


2024-12-16 01:04:11.236 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/zephyr-7B-beta-GPTQ were not used

In [None]:
!unzip /content/zephyr-finetuning.zip -d /content/zephyr-finetuning/


Archive:  /content/zephyr-finetuning.zip
replace /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/scheduler.pt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/scheduler.pt  
replace /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/rng_state.pth? [y]es, [n]o, [A]ll, [N]one, [r]ename: All
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/rng_state.pth  
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/optimizer.pt  
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/README.md  
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/adapter_model.safetensors  
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/adapter_config.json  
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/tokenizer.model  
  inflating: /content/zephyr-finetuning/zephyr-finetuning/checkpoint-100/to

In [None]:
!pip install streamlit transformers peft torch




In [None]:
# Download the cloudflared binary into the current directory and make it executable.
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!chmod +x cloudflared-linux-amd64


--2024-12-16 00:44:29--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2024.12.1/cloudflared-linux-amd64 [following]
--2024-12-16 00:44:29--  https://github.com/cloudflare/cloudflared/releases/download/2024.12.1/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/d519add1-9e9f-4539-86ac-a9e32a76cf28?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20241216%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241216T004429Z&X-Amz-Expires=300&X-Amz-Signature=7617915ad7c7e622c0b77b58c1c184952cf8d1374f2f8cddb263afad4129652d&X-Amz

In [None]:
# Start the Streamlit app on port 8501 in the background; all output goes to /content/logs.txt.
!nohup streamlit run /content/app.py --server.port 8501 &>/content/logs.txt &


In [None]:
# Start a background cloudflared tunnel to http://localhost:8501, logging to /content/streamlit-tunnel.log.
!nohup /content/cloudflared-linux-amd64 tunnel --url http://localhost:8501 &>/content/streamlit-tunnel.log &


In [None]:
# Print the first trycloudflare.com URL found in the tunnel log written by the previous cell.
!grep -o 'https://.*\.trycloudflare.com' /content/streamlit-tunnel.log | head -n 1


https://killer-sleeps-down-fonts.trycloudflare.com
