In [1]:
pip install torch transformers gradio


Collecting gradio
  Downloading gradio-5.19.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvi

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

# Hugging Face Hub repository that hosts the fine-tuned LLaMA-2 chat checkpoint
model_name = "apeksha07/Llama-2-7b-chat-finetune"

# Fetch the tokenizer and the half-precision weights; device_map="auto" leaves
# device placement of the weights to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Fall back to the end-of-sequence token when the tokenizer defines no padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [21]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored under that path
# (such as the chat history JSON) can be read and written from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import os
import torch
import gradio as gr
import json
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import drive

# JSON file that stores the conversation between calls. It lives under the
# /content/drive mount point, so Google Drive must be mounted before it is used.
HISTORY_FILE = "/content/drive/My Drive/chat_history.json"  # Save in Google Drive

# ✅ Function to load chat history
def load_chat_history(path=None):
    """Return the persisted conversation as a list of messages.

    Args:
        path: Optional JSON file to read. Defaults to ``HISTORY_FILE`` so
            existing zero-argument callers keep working.

    Returns:
        The stored list, or ``[]`` when the file is missing, empty, not valid
        JSON, or does not contain a JSON list.
    """
    if path is None:
        path = HISTORY_FILE
    try:
        # Open directly instead of checking existence first: avoids a race
        # between the check and the read.
        with open(path, "r", encoding="utf-8") as f:
            history = json.load(f)
    except FileNotFoundError:
        return []  # No history has been saved yet
    except json.JSONDecodeError:
        # An empty or half-written file must not break every chat request
        return []
    # Callers iterate over the result as a list of messages
    return history if isinstance(history, list) else []

# ✅ Function to save chat history
def save_chat_history(history, path=None):
    """Write the conversation to disk as indented JSON.

    Args:
        history: JSON-serialisable list of messages to persist.
        path: Optional destination file. Defaults to ``HISTORY_FILE`` so
            existing single-argument callers keep working.
    """
    if path is None:
        path = HISTORY_FILE
    # Explicit encoding keeps the file independent of the runtime's locale
    with open(path, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=4)  # Save in readable format

# ✅ Function to clean the assistant response
def clean_response(response):
    """Trim a raw model completion down to its complete sentences.

    The text is cut at the first "User:" or "Assistant:" role marker (a
    leading "Assistant:" label is dropped rather than treated as a cut point),
    then everything after the last ".", "!" or "?" is discarded.

    Args:
        response: Decoded text generated by the model.

    Returns:
        The text up to and including the last sentence-ending punctuation
        mark, or the whole trimmed text when it contains none.
    """
    text = response.strip()

    # Drop a role label the model may have echoed at the very start
    if text.startswith("Assistant:"):
        text = text[len("Assistant:"):]

    # Anything from the next role marker on is a hallucinated follow-up turn
    marker = re.search(r"User:|Assistant:", text)
    if marker:
        text = text[:marker.start()]
    text = text.strip()

    # DOTALL lets "." cross newlines, so multi-line replies are kept up to
    # their last complete sentence instead of being cut to the first line.
    match = re.search(r"^(.*[.!?])", text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()  # Return up to the last punctuation
    return text  # If no punctuation, return the whole response

# ✅ Define Chat Function
def chat_with_gpt2(user_input, history=None):
    """Generate one assistant reply and persist the updated conversation.

    Args:
        user_input: The new message typed by the user.
        history: Conversation supplied by the caller. It is ignored: the
            conversation stored in the history file is used instead, so the
            chat continues across sessions.

    Returns:
        The cleaned assistant reply as a string.
    """
    max_prompt_tokens = 1024  # Token budget for the prompt fed to the model
    max_new_tokens = 50  # Control response length

    # The persisted conversation replaces whatever the caller passed in
    messages = list(load_chat_history())
    messages.append({"role": "user", "content": user_input})

    # Render the conversation as "Role: content" lines and cue the model to answer
    chat_input = "\n".join(
        f"{msg['role'].capitalize()}: {msg['content']}" for msg in messages
    ) + "\nAssistant:"

    # Truncate from the left: once the saved history outgrows the budget, the
    # oldest turns are dropped instead of the newest user message and the
    # trailing "Assistant:" cue.
    original_side = tokenizer.truncation_side
    tokenizer.truncation_side = "left"
    try:
        inputs = tokenizer(
            chat_input,
            return_tensors="pt",
            truncation=True,
            max_length=max_prompt_tokens,
        )
    finally:
        tokenizer.truncation_side = original_side

    # Follow the model's device rather than assuming a GPU is present
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate response
    model.eval()
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,  # Adjust response variability
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt
    decoded = tokenizer.decode(output[0, prompt_length:], skip_special_tokens=True)

    cleaned_response = clean_response(decoded)

    # Record the assistant turn and persist the whole conversation
    messages.append({"role": "assistant", "content": cleaned_response})
    save_chat_history(messages)

    return cleaned_response

# ✅ Create Gradio Chat UI
# chat_with_gpt2 is registered as the handler that produces each reply.
chat_ui = gr.ChatInterface(fn=chat_with_gpt2, title="LLaMA-2 Chatbot")

# ✅ Launch API
# debug=True keeps this cell running so errors and logs stay visible (interrupt
# the cell to stop the server). share=True publishes a temporary public URL, so
# anyone with the link can reach the chatbot.
chat_ui.launch(debug=True, share=True)




Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b70601f3385a2d2c89.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://b70601f3385a2d2c89.gradio.live


