# Prompt Engineering Activity
## Prerequisites

* Hugging Face access token
* Google Colab Student Tier account


In [25]:
# Install required packages - run this cell first
!pip install transformers torch sentence-transformers faiss-cpu datasets

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
# Optional: expose your token through an environment variable instead of
# hardcoding it in a cell. `HF_TOKEN` is the variable name huggingface_hub reads.
# import os
# !pip install -U huggingface_hub
# os.environ["HF_TOKEN"] = "<your_hf_token>"

In [33]:
# Import libraries we need
import json
import os
import time

import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

print("All libraries imported successfully!")

All libraries imported successfully!


In [34]:
# Set up your Hugging Face token WITHOUT pasting it into the notebook.
# Get a free token from: https://huggingface.co/settings/tokens
# and make it available to this kernel as the HF_TOKEN environment variable.
# A missing or empty variable becomes None so that no placeholder string is
# ever sent as a credential.
# NOTE(review): with token=None the Hugging Face libraries are expected to fall
# back to locally stored login credentials - confirm for your setup.
hf_token = os.environ.get("HF_TOKEN") or None
if hf_token is None:
    print("HF_TOKEN is not set - gated models may fail to download until you set it.")

# Check if we have GPU available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [35]:
# Specify which Mistral model to use from Hugging Face
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Create a conversation with system prompt and user message
# System prompt defines the AI's role/personality
# User message is what the person is asking
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Set up the text generation pipeline with specific parameters.
# The model name is taken from `model_id` so it is defined in exactly one place.
chatbot = pipeline(
    "text-generation",                              # Task type: generating text
    model=model_id,                                 # Which model to use
    token=hf_token,                                 # Authentication token for Hugging Face
    torch_dtype=torch.bfloat16,                     # Data type for faster processing
    device_map="auto",                              # Automatically use GPU if available
    max_new_tokens=512,                             # Maximum length of generated response
    do_sample=True,                                 # Use random sampling for more creative responses
    top_k=10,                                       # Consider top 10 most likely next words
    num_return_sequences=1,                         # Generate only 1 response
    eos_token_id=2,                                 # Token ID that signals end of response
                                                    # NOTE(review): hardcoded; presumably this tokenizer's end-of-sequence id - confirm
)

# Generate response using the pipeline and print the result
# (the full structure: the input messages plus the assistant's new message)
print(chatbot(messages))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': [{'role': 'system', 'content': 'You are a pirate chatbot who always responds in pirate speak!'}, {'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': " Arr matey! I'm Cap'n Syntax the Parrot, yer loyal chatbot companion on this here digital seas! I be here to help ye navigate through yer questions and riddles with me hearty responses, so don't be shy and ask away! Yarr!"}]}]


In [36]:
# Ask the chatbot again, this time keeping the complete pipeline output
result = chatbot(messages)

# Peel the nested output apart one layer at a time:
#   result is a list with one entry per generated sequence (only one here)
#   its "generated_text" is the conversation history including the new reply
#   the final message of that history is the assistant's answer
#   "content" is the answer text without the role information
first_sequence = result[0]
chat_history = first_sequence["generated_text"]
assistant_message = chat_history[-1]
assistant_reply = assistant_message["content"]

# Show only the assistant's text, without the surrounding structure
print(assistant_reply)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Ahoy there, matey! Me be Captain Parrot, yarr, the swiftest and most cunning pirate chatbot ye be ever likely to meet. Now, what's on yer mind, landlubber? I be here to help ye navigate the seven seas of conversation!


In [37]:
# Authentication reuses the `hf_token` variable from the token setup cell
# earlier in this notebook; the token is deliberately NOT assigned again here,
# so the secret lives in one place and is not overwritten by a placeholder.

# Specify the Mistral model we want to use
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Load the tokenizer (converts text to numbers that the model understands)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

# Load the actual model with specific settings
model = AutoModelForCausalLM.from_pretrained(
    model_id,                    # Which model to load
    token=hf_token,             # Authentication token
    dtype=torch.bfloat16,       # Use 16-bit precision for faster processing
    device_map="auto"           # Automatically use GPU if available
)

# Create a simple conversation (just user input, no system prompt this time)
conversation = [{"role": "user", "content": "What's the weather like in Paris?"}]

# Convert the conversation into the format the model expects
# This applies the model's chat template and converts to tensors
inputs = tokenizer.apply_chat_template(
    conversation,                # The conversation to format
    add_generation_prompt=True,  # Add prompt to signal the model should respond
    return_dict=True,           # Return as dictionary
    return_tensors="pt",        # Return as PyTorch tensors
).to(model.device)             # Move to same device as model (GPU/CPU)

# Generate the response using the model directly
outputs = model.generate(
    **inputs,                           # Pass all the formatted inputs
    max_new_tokens=1000,               # Maximum length of response
    pad_token_id=tokenizer.eos_token_id # Token to use for padding
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [38]:
# Print the raw model output tensor (this shows token IDs/numbers, not readable text yet).
# The tensor contains the prompt tokens followed by the generated tokens, which is
# why decoding it later shows the question as well as the answer.
print(outputs)

tensor([[    1,     3,  2592, 29510, 29481,  1040,  8854,  1505,  1065,  6233,
         29572,     4,  1083,  1717, 29510, 29475,  1274,  2121, 29501,  2304,
         17353, 29493,  1347,  1083,  1309, 29510, 29475,  3852,  1040,  2636,
          8854,  1065,  6233, 29491,  3761, 29493,  1083,  1309,  2680,  1136,
          1137,  6233, 29493,  1505,  1956,  1070, 13495,  5611, 29493,  1427,
          1032,  5794,  1148, 14761,  1062, 12027, 29491,  1183,  8854,  1117,
         17351,  1163,  5160, 28408,  5942,  6241,  1040,  1647, 29491,  1183,
          6868,  1142,  4138,  1228,  4980, 29493,  5166, 29493,  1072,  4396,
         29493,  1163, 18759, 14131,  4822,  2169, 29473, 29518, 29502, 29501,
         29518, 29550, 29670, 29511,  1093, 29552, 29551, 29501, 29555, 29555,
         29670, 29533,  1377,  1183,  6024,  1142,  4138,  1228,  5693, 29493,
          5392, 29493,  1072,  6121, 29493,  1163, 18759, 14131,  4822,  2169,
         29473, 29538, 29501, 29551, 29670, 29511,  

In [39]:
# Turn the first generated sequence of token IDs back into readable text.
# skip_special_tokens=True removes the formatting tokens from the decoded string.
generated_ids = outputs[0]
decoded_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(decoded_text)

What's the weather like in Paris? I don't have real-time capabilities, so I can't provide the current weather in Paris. However, I can tell you that Paris, like much of northern France, has a temperate oceanic climate. The weather is mild with regular precipitation throughout the year. The warmest months are June, July, and August, with temperatures averaging around 20-25°C (68-77°F). The coldest months are December, January, and February, with temperatures averaging around 3-8°C (37-46°F). It's always a good idea to check a reliable weather forecast before planning a trip.


Class Exercise

## Step 1: Create an Agentic/Assistant System Prompt

Choose a specific business context and create a system prompt that gives Mistral a professional role. This system prompt will define how the AI behaves and what expertise it has.

**Instructions:**
- Pick a realistic business or organization
- Choose a specific role/expertise for the AI (marketing expert, technical consultant, etc.)
- Create a system prompt that defines the AI's personality and knowledge area
- This will be used throughout the assignment for generating content


In [None]:
# TODO: Choose your business and role
# Examples:
# - "TechStart Solutions - AI Consulting Firm" with role "AI Solutions Consultant"
# - "Green Energy Corp - Solar Installation Company" with role "Solar Energy Expert"
# - "HealthTech Plus - Medical Software Company" with role "Healthcare IT Specialist"


# Begin writing your Python code here

## Step 2: Generate Business Database Content


Use Mistral to create a comprehensive Q&A database for your chosen business. You'll prompt Mistral to generate realistic question-answer pairs that customers might ask about your services, pricing, processes, and expertise.

**Instructions:**
- Use your system prompt from Step 1 to give Mistral the business context
- Create a prompt asking Mistral to generate 10-15 Q&A pairs for your business
- Ask for questions covering different topics: services, pricing, processes, technical details, contact info
- Format should be clear (Q: question, A: answer)
- Parse the generated text into a usable list of dictionaries

In [None]:
# TODO: Generate Q&A database using Mistral
# You need to:
# 1. Set up the Mistral model (use the pipeline approach from the original notebook)
# 2. Create a function to get clean responses from Mistral
# 3. Write a prompt asking Mistral to generate business Q&A pairs
# 4. Parse the generated text into a list of dictionaries with 'question' and 'answer' keys
# 5. Display your generated Q&A pairs clearly


# Begin writing your Python code here

## Step 3: Implement FAISS Vector Database

Convert your Q&A database into embeddings (numerical vectors) and store them in a FAISS index for fast similarity search. This allows users to ask questions and quickly find the most relevant information from your knowledge base.

**Instructions:**
- Install and import sentence-transformers for creating embeddings
- Convert all your questions into numerical vectors using an embedding model
- Create a FAISS index to store these vectors for fast similarity search
- Implement a search function that can find similar questions based on user input
- Test your search functionality with a sample query



In [None]:

# TODO: Implement FAISS Vector Database
# You need to:
# 1. Install sentence-transformers: !pip install sentence-transformers faiss-cpu
# 2. Import SentenceTransformer and faiss
# 3. Load an embedding model (e.g., 'distilbert-base-uncased-distilled-squad')
# 4. Extract questions and answers from your Q&A database
# 5. Convert questions to embeddings using the model
# 6. Create a FAISS index and add the embeddings
# 7. Create a search function that takes a user question and returns similar Q&A pairs
# 8. Test the search function with a sample query

# Begin writing your Python code here

## Step 4: Create Test Questions

Generate two types of questions to test your RAG system: questions that CAN be answered from your database (answerable) and questions that CANNOT be answered (unanswerable). This tests how well your system knows its limitations.

**Instructions:**
- Use Mistral to generate 5 questions that your business CAN answer (about your services, pricing, processes, etc.)
- Use Mistral to generate 5 questions that your business CANNOT answer (competitor info, unrelated topics, personal details, etc.)
- Extract the questions from the generated text into clean lists
- These will test whether your RAG system correctly identifies when it can and cannot provide good answers

In [None]:
# TODO: Create Test Questions
# You need to:
# 1. Generate ANSWERABLE questions using Mistral (questions your business can answer)
# 2. Generate UNANSWERABLE questions using Mistral (questions outside your expertise)
# 3. Parse both sets of questions into clean lists
# 4. Display both types of questions clearly
# 5. Make sure you have at least 5 questions of each type

# Begin writing your Python code here

## Step 5: Implement and Test Questions


Run both types of questions through your RAG system and analyze how well it distinguishes between questions it can answer well versus questions it cannot answer reliably.

**Instructions:**
- Test your answerable questions - they should get high similarity scores with your database
- Test your unanswerable questions - they should get low similarity scores
- Set a similarity threshold to determine "can answer" vs "cannot answer"
- Analyze the performance: did answerable questions score high? Did unanswerable questions score low?
- Calculate accuracy rates for both question types

In [None]:
# TODO: Test Your RAG System
# You need to:
# 1. Create a testing function that searches your database for each question
# 2. Set a similarity threshold (e.g., 0.7) to determine good vs poor matches
# 3. Test all answerable questions and count how many are correctly identified as answerable
# 4. Test all unanswerable questions and count how many are correctly identified as unanswerable
# 5. Calculate and display performance statistics
# 6. Show examples of good and poor matches

## Step 6: Model Experimentation and Ranking

Test multiple Q&A models from Hugging Face and rank them based on performance, speed, and confidence scores.

**Instructions:**
- Test the 4 required models plus 2 additional models of your choice
- Evaluate each model on speed, confidence scores, and answer quality
- Rank models from best to worst with clear explanations
- Identify which models provide good confidence scores while maintaining reasonable output
- Compare performance across different question types

In [42]:
# TODO: Test and Rank QA Models
# Required models to test:
# - "consciousAI/question-answering-generative-t5-v1-base-s-q-c"
# - "deepset/roberta-base-squad2"
# - "google-bert/bert-large-cased-whole-word-masking-finetuned-squad"
# - "gasolsun/DynamicRAG-8B"
# Plus 2 additional QA models of your choice
#
# You need to:
# 1. Set up QA pipelines for each model
# 2. Test them with your questions and retrieved contexts
# 3. Measure response time and confidence scores
# 4. Rank models based on composite performance
# 5. Identify models with good confidence handling
# 6. Explain why each model ranked where it did





# Placeholder for the written explanation. This bare string literal is the last
# expression of the cell, so the notebook echoes it back as the cell's output.
"""
Write your explanation here:


"""

'\nWrite your explanation here:\n\n\n'