In [None]:
## Sentence similarity
# Load the 'all-MiniLM-L6-v2' sentence-transformer once at module level.
# The same `sentence_model` instance is reused further down by
# calculate_similarity, semantic_search and the DynamicFewShotPrompter demo.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded together by the module-level ``sentence_model``
    and the two resulting vectors are compared with ``cosine_similarity``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single entry of the 1x1 similarity matrix for the two embeddings.
    """
    # One encode call for both texts; unpack one embedding vector per text.
    first_vec, second_vec = sentence_model.encode([text1, text2])

    # cosine_similarity expects 2-D inputs, so each vector is wrapped as a
    # one-row batch; the result is a 1x1 matrix holding the score.
    score_matrix = cosine_similarity([first_vec], [second_vec])
    return score_matrix[0][0]

# Sample sentences used for the pairwise similarity demo
texts = [
    "The cat sat on the mat.",
    "A feline rested on the rug.",
    "Dogs are great pets.",
    "I love pizza and pasta.",
    "Italian food is delicious."
]

print("\n📊 Text Similarity Matrix:")
print("Comparing different text pairs:")

# Enumerate every unordered pair (i < j) up front so each pair is scored once,
# then print one line per pair with the texts clipped to 30 characters.
index_pairs = [
    (i, j)
    for i in range(len(texts))
    for j in range(i + 1, len(texts))
]
for i, j in index_pairs:
    similarity = calculate_similarity(texts[i], texts[j])
    print(f"'{texts[i][:30]}...' vs '{texts[j][:30]}...': {similarity:.3f}")

In [None]:
## semantic search
def semantic_search(query, documents, top_k=3):
    """Find the documents most similar to a query.

    The query and all documents are embedded with the module-level
    ``sentence_model`` and ranked by cosine similarity, best match first.

    Args:
        query: Query text.
        documents: Sequence of document texts to search.
        top_k: Maximum number of results to return. ``None`` returns every
            document; zero or a negative value returns no results.

    Returns:
        A list of dicts with keys ``'document'`` (the matching text) and
        ``'similarity'`` (its cosine similarity to the query), ordered from
        most to least similar. Empty when there is nothing to rank.
    """
    # Guard the degenerate cases up front: an empty document batch cannot be
    # scored by cosine_similarity, and a negative top_k would otherwise slice
    # as [:-k] and silently drop only the *worst* matches.
    if len(documents) == 0 or (top_k is not None and top_k <= 0):
        return []

    query_embedding = sentence_model.encode([query])
    doc_embeddings = sentence_model.encode(documents)

    # Row 0 of the (1, n_docs) similarity matrix holds the query's scores.
    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    # argsort is ascending, so reverse it to rank from most to least similar.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return [
        {
            'document': documents[idx],
            'similarity': similarities[idx]
        }
        for idx in top_indices
    ]

# Example semantic search: a small set of one-sentence documents to rank
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing helps computers understand text.",
    "Computer vision enables machines to interpret visual information.",
    "Reinforcement learning trains agents through trial and error."
]

# Rank the documents against the query. top_k is left at semantic_search's
# default of 3; the results are stored in `search_results` but not displayed
# by this cell.
query = "How do computers understand language?"
search_results = semantic_search(query, documents)

In [None]:
## dynamic few-shot
class DynamicFewShotPrompter:
    """Build few-shot prompts whose examples are picked by similarity to the query.

    Each example is a mapping with an ``'input'`` text and an ``'output'``
    value. The ``'input'`` texts are embedded with ``sentence_model`` and
    ranked against the query by cosine similarity.
    """

    def __init__(self, examples, sentence_model):
        """Store the example pool and the embedding model.

        Args:
            examples: Sequence of dicts with ``'input'`` and ``'output'`` keys.
            sentence_model: Object exposing ``encode(list_of_texts)``.
        """
        self.examples = examples
        self.sentence_model = sentence_model
        # Embedding cache: the example pool is the same for every query, so it
        # is encoded once and reused until the texts or the model change.
        self._cached_texts = None
        self._cached_model = None
        self._cached_embeddings = None

    def _example_embeddings(self):
        """Return embeddings of the examples' 'input' texts, encoding only when needed."""
        example_texts = [ex['input'] for ex in self.examples]
        cache_is_valid = (
            self._cached_embeddings is not None
            and self._cached_model is self.sentence_model
            and self._cached_texts == example_texts
        )
        if not cache_is_valid:
            self._cached_embeddings = self.sentence_model.encode(example_texts)
            self._cached_model = self.sentence_model
            self._cached_texts = example_texts
        return self._cached_embeddings

    def get_relevant_examples(self, query, k=3):
        """Retrieve the k examples whose 'input' is most similar to the query.

        Args:
            query: Text to find similar examples for.
            k: Maximum number of examples to return. ``None`` returns all of
                them; zero or a negative value returns none.

        Returns:
            A list of example dicts, most similar first. Empty when the
            example pool is empty or no examples were requested.
        """
        # An empty pool cannot be scored, and a negative k would slice as
        # [:-k] and drop only the least similar examples.
        if len(self.examples) == 0 or (k is not None and k <= 0):
            return []

        query_embedding = self.sentence_model.encode([query])
        example_embeddings = self._example_embeddings()

        # Row 0 of the (1, n_examples) matrix holds the query's scores.
        similarities = cosine_similarity(query_embedding, example_embeddings)[0]
        # argsort is ascending; reverse to rank from most to least similar.
        top_indices = np.argsort(similarities)[::-1][:k]

        return [self.examples[idx] for idx in top_indices]

    def create_prompt(self, query, task_description, k=3):
        """Create a dynamic few-shot prompt for the query.

        The prompt is the task description, then one ``Input:/Output:`` block
        per retrieved example, then the query with an open ``Output:`` slot.

        Args:
            query: Text to be completed by the downstream model.
            task_description: Instruction placed at the top of the prompt.
            k: Number of examples to include (see ``get_relevant_examples``).

        Returns:
            The assembled prompt string.
        """
        relevant_examples = self.get_relevant_examples(query, k)

        sections = [f"{task_description}\n\n"]
        sections.extend(
            f"Input: {ex['input']}\nOutput: {ex['output']}\n\n"
            for ex in relevant_examples
        )
        sections.append(f"Input: {query}\nOutput:")
        return "".join(sections)

# Example dataset for sentiment analysis: each entry pairs an 'input' text
# with its 'output' label (positive / negative / neutral), the two keys that
# DynamicFewShotPrompter reads.
sentiment_examples = [
    {"input": "I love this movie!", "output": "positive"},
    {"input": "This food tastes terrible", "output": "negative"},
    {"input": "The weather is nice today", "output": "positive"},
    {"input": "I'm feeling sad", "output": "negative"},
    {"input": "This book is okay", "output": "neutral"},
    {"input": "Amazing service at this restaurant", "output": "positive"},
    {"input": "The product broke after one day", "output": "negative"},
    {"input": "Not bad, could be better", "output": "neutral"},
    {"input": "Absolutely fantastic experience", "output": "positive"},
    {"input": "Waste of money", "output": "negative"}
]

# Initialize dynamic prompter with the shared sentence_model
prompter = DynamicFewShotPrompter(sentiment_examples, sentence_model)

# Test dynamic prompting: build a prompt that includes the 3 examples most
# similar to test_query. The prompt is stored in `dynamic_prompt` but is not
# printed by this cell.
test_query = "This pizza is incredibly delicious"
dynamic_prompt = prompter.create_prompt(
    test_query,
    "Classify the sentiment of the following text as positive, negative, or neutral:",
    k=3
)

In [None]:
dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")

# Build the retrieval corpus from the passage TEXT of each row and use this
# cleaned list consistently for embedding, indexing and lookup.
corpus = []
for item in dataset["passages"]:
    # Iterating a split yields one dict per row (column name -> value).
    # Calling str() on the whole row would store the dict repr (braces, keys
    # and id) as the document, so pull out the "passage" column instead.
    # Rows that are not dicts fall back to being stringified as-is.
    passage = item["passage"] if isinstance(item, dict) and "passage" in item else item
    if passage is None:
        # A missing passage would otherwise become the literal text "None".
        continue
    text = str(passage).strip()
    if text:
        corpus.append(text)

# Embedding model: encode the whole corpus once on CPU. The model name is the
# same one used for `sentence_model` earlier in the notebook, but a separate
# instance is loaded here.
print("Encoding corpus...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True, device='cpu')
# FAISS is given a NumPy array, so convert the CPU tensor.
corpus_embeddings_np = corpus_embeddings.numpy()

# FAISS index: flat L2 index whose dimensionality is the embedding width
# (second axis of the embedding matrix); row i of the index is corpus[i].
index = faiss.IndexFlatL2(corpus_embeddings_np.shape[1])
index.add(corpus_embeddings_np)

# Reranker model (currently disabled, as is the rerank step in rag_pipeline)
# reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Generator (choose one: local HF model or OpenAI)
# Local Mistral-7B-Instruct loaded in float16; each call generates at most
# 150 new tokens.
# NOTE(review): no device is passed to the model or the pipeline here, while
# rag_pipeline is decorated with @spaces.GPU. Confirm the model actually ends
# up on the GPU in the target environment.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", torch_dtype=torch.float16)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=150)

@spaces.GPU
def rag_pipeline(query):
    """Answer a question with retrieval-augmented generation.

    The query is embedded with ``embedder``, the 5 nearest passages are
    fetched from the FAISS ``index``, the first 2 of them are placed in the
    prompt as context, and ``generator`` produces the answer.

    Args:
        query: The user's question.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace.
    """
    # Embed query (converted to NumPy because that is what the index is fed)
    query_embedding = embedder.encode([query], convert_to_tensor=True, device='cpu').numpy()

    # Retrieve top-k from FAISS
    _distances, indices = index.search(query_embedding, k=5)
    # FAISS pads the result with -1 when fewer than k neighbours exist;
    # corpus[-1] would silently return the last passage, so skip those slots.
    retrieved_docs = [corpus[idx] for idx in indices[0] if idx >= 0]

    print("Retrieved indices:", indices[0])
    print("Retrieved docs:")
    for doc in retrieved_docs:
        print("-", repr(doc))

    # Rerank (disabled together with the CrossEncoder reranker):
    # rerank_pairs = [[str(query), str(doc)] for doc in retrieved_docs]
    # scores = reranker.predict(rerank_pairs)
    # reranked_docs = [doc for _, doc in sorted(zip(scores, retrieved_docs), reverse=True)]

    # Combine the two best passages into the prompt context
    context = "\n\n".join(retrieved_docs[:2])
    prompt = f"""Answer the following question using the provided context.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"""

    # Generate
    response = generator(prompt)[0]["generated_text"]

    # By default the text-generation pipeline returns prompt + continuation.
    # Strip the known prompt prefix rather than splitting on the last
    # "Answer:": if the model keeps going with another "Question/Answer" pair,
    # the last-occurrence split would return that made-up answer instead.
    if response.startswith(prompt):
        answer = response[len(prompt):]
    else:
        # Output does not echo the prompt verbatim; keep the previous heuristic.
        answer = response.split("Answer:")[-1]
    return answer.strip()