# code sets up a small GPT-2 model

In [None]:
!pip install transformers datasets torch pdfminer.six python-docx

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  

In [2]:
#/content/drive/MyDrive/my_train_data

In [3]:
import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from pdfminer.high_level import extract_text
from docx import Document

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Avoid padding issues

# Function to extract text from different file formats
def extract_text_from_file(file_path):
    if file_path.endswith(".pdf"):
        return extract_text(file_path)  # Extract text from PDF
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])  # Extract text from DOCX
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()  # Extract text from TXT
    else:
        return ""

# Directory in Google Drive where your documents are stored
data_dir = "/content/drive/MyDrive/my_train_data"
all_texts = []
for file in os.listdir(data_dir):
    file_path = os.path.join(data_dir, file)
    text = extract_text_from_file(file_path)
    if text:
        all_texts.append(text)

# Convert text into dataset
dataset = Dataset.from_dict({"text": all_texts})

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for better training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We are fine-tuning for causal LM (GPT-2)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `MyNewToken` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `MyNewTo

In [11]:
# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/gpt2_finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=100,
    eval_strategy="no",
    report_to="none",  # Disable W&B logging
)


# Trainer for model training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # Only training dataset
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/gpt2_finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/gpt2_finetuned")

  trainer = Trainer(


Step,Training Loss


Step,Training Loss


('/content/drive/MyDrive/gpt2_finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2_finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2_finetuned/vocab.json',
 '/content/drive/MyDrive/gpt2_finetuned/merges.txt',
 '/content/drive/MyDrive/gpt2_finetuned/added_tokens.json')

In [12]:
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/gpt2_finetuned")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/gpt2_finetuned")

prompt = "Creating a software RAID during the installation?"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

output = model.generate(input_ids, max_length=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Creating a software RAID during the installation?

The RAID is a special type of RAID that is used to store data on the hard disk. It is used to store data on the hard disk when the system is not in use.

The RAID is used to store data on the hard disk when the system is not in use.

The RAID is used to store data on the hard disk when the system is not in use.

The RAID is used to store data on the


In [19]:
from transformers import GPT2LMHeadModel, AutoTokenizer

# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/gpt2_finetuned")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/gpt2_finetuned")

# Define prompt
prompt = "Q: How do I configure the date and time settings? \nA:"

# Tokenize input with padding and attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Generate response
output = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # Now properly defined
    max_length=150,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,
    temperature=0.7,
    top_k=50,
    top_p=0.9
)

# Decode and print response
print(tokenizer.decode(output[0], skip_special_tokens=True))

Q: How do I configure the date and time settings? 
A: The default setting is to use a calendar. This can be changed by using your favorite web browser, such as Chrome or Firefox (or any other operating system). If you are not familiar with calendars in general, then this will help guide how they work for most users of Windows 7/8 . You may also want some additional guidance on what types each feature allows when it comes to scheduling dates based upon user preferences; see "How To Configure Calendar Dates" below if that's something we need further clarification about here. For more information regarding these features please refer back into our FAQs section at http://www-microsoftforums...


In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load your fine-tuned model
model_path = "/content/drive/MyDrive/gpt2_finetuned"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ask a question
question = "How do I configure the date and time settings in RHEL?"
input_text = f"Q: {question}\nA:"

# Tokenize & Generate Answer
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=150,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,
    temperature=0.7,
    top_k=50,
    top_p=0.9
)

# Decode and Print Response
answer = tokenizer.decode(output[0], skip_special_tokens=True)
print(answer)

Q: How do I configure the date and time settings? 
A: The default setting is to use a calendar. This can be changed by using your favorite web browser, such as Chrome or Firefox (or any other operating system). If you are not familiar with calendars in general, then this will help guide how they work for most users of Windows 7/8 . You may also want some additional guidance on what types each feature allows when it comes to scheduling dates based upon user preferences; see "How To Configure Calendar Dates" below if that's something we need further clarification about here. For more information regarding these features please refer back into our FAQs section at http://www-microsoftforums...


# New Section

# Using Mistral-7B for codable model

In [1]:
!pip install -q transformers langchain PyPDF2 faiss-cpu sentence-transformers gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.9/321.9 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
!pip install PyPDF2 pycryptodome

Collecting pycryptodome
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome
Successfully installed pycryptodome-3.21.0


In [8]:
from PyPDF2 import PdfReader
import os

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Set correct PDF folder path
pdf_folder = "/content/drive/MyDrive/my_train_data"

# Ensure directory exists
if not os.path.exists(pdf_folder):
    raise ValueError(f"Directory '{pdf_folder}' does not exist. Check if the path is correct in Google Drive.")

# Extract text from PDFs
text = ""
for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, file)
        reader = PdfReader(pdf_path)

        # Check if the PDF is encrypted
        if reader.is_encrypted:
            print(f"Skipping encrypted PDF: {file}")
            continue  # Skip encrypted files

        for page in reader.pages:
            page_text = page.extract_text() if page.extract_text() else ""
            text += page_text

# Save extracted text to a file
with open("training_data.txt", "w") as f:
    f.write(text)

print("Text extraction complete. Saved to 'training_data.txt'.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Skipping encrypted PDF: Red_Hat_Linux_Complete_Reference.pdf
Skipping encrypted PDF: Perf_Best_Practices_vSphere5.5.pdf
Skipping encrypted PDF: vsphere-esxi-vcenter-server-70U3-performance-best-practices.pdf




Text extraction complete. Saved to 'training_data.txt'.


In [2]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain_community)
  Downloading langchain_core-0.3.34-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.18 (from langchain_community)
  Downloading langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [3]:
pip install -q transformers accelerate bitsandbytes langchain faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -U bitsandbytes

In [1]:
!pip install cloud-tpu-client torch==2.2.0 torch_xla[tpu] -f https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.2.0-cp310-cp310-linux_x86_64.whl
!pip install transformers sentence-transformers

Looking in links: https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.2.0-cp310-cp310-linux_x86_64.whl
Collecting cloud-tpu-client
  Downloading cloud_tpu_client-0.10-py3-none-any.whl.metadata (1.2 kB)
Collecting torch==2.2.0
  Downloading torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.9/275.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1


In [4]:
!pip uninstall -y torch torchvision torch_xla  # Clean old installations
!pip install cloud-tpu-client \
  torch==2.2.0 \
  torchvision==0.17.0 \
  https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.2.0-cp310-cp310-linux_x86_64.whl

Found existing installation: torch 2.2.0
Uninstalling torch-2.2.0:
  Successfully uninstalled torch-2.2.0
Found existing installation: torchvision 0.20.1+cpu
Uninstalling torchvision-0.20.1+cpu:
  Successfully uninstalled torchvision-0.20.1+cpu
Found existing installation: torch-xla 2.5.1
Uninstalling torch-xla-2.5.1:
  Successfully uninstalled torch-xla-2.5.1
[31mERROR: torch_xla-2.2.0-cp310-cp310-linux_x86_64.whl is not a supported wheel on this platform.[0m[31m
[0m

In [6]:
!pip install torch torchvision torchaudio
!pip install torch-xla -f https://storage.googleapis.com/libtpu-releases/index.html

Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.

Looking in links: https://storage.googleapis.com/libtpu-releases/index.html
Collecting torch-xla
  Downloading torch_xla-2.6.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (21 kB)
Downloading torch_xla-2.6.0-cp311-cp311-manylinux_2_28_x86_64.whl (93.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.6/93.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-xla
Successfully installed torch-xla-2.6.0


In [1]:
import torch
import torch_xla.core.xla_model as xm

print("PyTorch version:", torch.__version__)  # Should show 2.2.0
print("XLA version:", torch_xla.__version__)  # Should show 2.2.0
#print("TPU device:", xm.xla_device())  # Should output "xla:0"



PyTorch version: 2.5.1+cpu


NameError: name 'torch_xla' is not defined

In [2]:
import torch_xla
import torch_xla.core.xla_model as xm

# Initialize TPU
device = xm.xla_device()
print(f"Using device: {device}")

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model directly to TPU (no quantization)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    torch_dtype=torch.bfloat16,  # Use bfloat16 for TPU efficiency
    device_map=device,  # Explicitly map to TPU
    low_cpu_mem_usage=True
).to(device)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")


# Custom TPU-friendly generation function
def tpu_generate(query, max_length=512):
    inputs = tokenizer(query, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=0.7,
            do_sample=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Wrap in LangChain
from langchain.llms.base import LLM
class TPUMistral(LLM):
    def _call(self, prompt, **kwargs):
        return tpu_generate(prompt)

llm = TPUMistral()

ImportError: /usr/local/lib/python3.11/dist-packages/_XLAC.cpython-311-x86_64-linux-gnu.so: undefined symbol: _ZNK3c105Error4whatEv

In [None]:
import os
import torch
import torch_xla.core.xla_model as xm
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# Initialize TPU
device = xm.xla_device()
print(f"Using device: {device}")

# Load data (same as before)
with open("training_data_chunked.txt", "r", encoding="utf-8") as f:
    text = f.read()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_text(text)

# Embeddings (CPU-only for FAISS)
embeddings = HuggingFaceEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

# TPU Model Setup
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    torch_dtype=torch.bfloat16,
    device_map=device,
    low_cpu_mem_usage=True
).to(device)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# QA System
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)

# Test
question = "What did the documents say about Python programming?"
response = qa.run(question)
print("\n📝 Answer:", response)

In [8]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

# ✅ Load and preprocess the training text
with open("training_data_chunked.txt", "r", encoding="utf-8") as f:
    text = f.read()

# ✅ Split text into smaller chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_text(text)

# ✅ Generate text embeddings and store in FAISS vector database
embeddings = HuggingFaceEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

# ✅ Load Mistral-7B in 4-bit quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16  # Add this for T4 compatibility
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    quantization_config=quant_config,
    device_map="auto",  # Let Accelerate handle device placement
    torch_dtype=torch.float16
)

print(f"Model device: {model.device}")  # Verify CUDA

# ✅ Create pipeline WITHOUT device=0
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    max_new_tokens=512  # Limit response length for T4 memory
)

# ✅ Pass to LangChain
llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

# ✅ Create QA System
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)

# ✅ Test question
question = "What did the documents say about Python programming?"
response = qa.run(question)
print("\n📝 Answer:", response)

  embeddings = HuggingFaceEmbeddings()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Model device: cuda:0


  response = qa.run(question)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.28 GiB. GPU 0 has a total capacity of 14.74 GiB of which 5.63 GiB is free. Process 2521 has 9.11 GiB memory in use. Of the allocated memory 8.84 GiB is allocated by PyTorch, and 149.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from langchain.chains import RetrievalQA

# Setup the question-answering system
qa = RetrievalQA.from_chain_type(
    llm=HuggingFacePipeline(pipeline="text-generation", model=model, tokenizer=tokenizer),
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)

# Ask a question
response = qa.run("What did the documents say about Python programming?")
print(response)

In [None]:
from transformers import pipeline
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

# Create a text generation pipeline for HuggingFace
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,  # Use GPU if available
    max_length=512,  # Adjust max length for responses
    temperature=0.7,  # Controls randomness
    do_sample=True
)

# Wrap pipeline in LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

# Setup the question-answering system with better retrieval
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",  # Better for long documents
    retriever=docsearch.as_retriever(search_kwargs={"k": 5})  # Fetch top 5 relevant chunks
)

# Ask a question
query = "What did the documents say about Python programming?"
response = qa.run(query)
print("Response:", response)

In [None]:
import gradio as gr

def chat_and_code(query):
    response = qa.run(query)
    return f"Answer: {response}\n\nCode:\n```python\n# Example code based on your documents\nprint('Hello World')\n```"

# Build the app
app = gr.Interface(
    fn=chat_and_code,
    inputs=gr.Textbox(lines=2, placeholder="Ask a question or request code..."),
    outputs=gr.Textbox(label="Response", language="markdown")
)

app.launch(share=True)  # Generates a public link to test your app