In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.llms import Ollama

In [2]:
pdf_reader = PyPDFLoader('Gen AI eBook.pdf')
documents = pdf_reader.load()

text_spliter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_spliter.split_documents(documents)

In [3]:
print(chunks[2].page_content)

Generative AI interview questions span a broad
range of topics, covering everything from core
concepts and model architectures to applications
and ethical considerations. This diversity means
it's hard to predict exactly what interviewers might
ask, as questions could cover theory, technical
skills, and even recent advancements.
Understanding the types of questions you may
encounter is crucial for targeted preparation.
Below, you'll find examples of practical questions
and answers. Reviewing these should help you
identify strengths and pinpoint areas for further
study to sharpen your knowledge and readiness
for real-world applications in Generative AI.
COMPREHENSIVE
GUIDE TO
INTERVIEWS 
FOR GEN AI
Become a part of the
team at Zep
Why don't you start your journey as
a tech blogger and enjoy unlimited
perks and cash prizes every month.
Explore


In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings()

  embedding = HuggingFaceEmbeddings()
  embedding = HuggingFaceEmbeddings()
  from .autonotebook import tqdm as notebook_tqdm





In [5]:
db = FAISS.from_documents(documents=chunks, embedding=embedding)

In [6]:
llm = Ollama(model="tinyllama", temperature=0)

  llm = Ollama(model="tinyllama", temperature=0)


In [7]:
llm.invoke("What is the capital of USA?")

'The capital of the United States is Washington, D.C.'

In [8]:
from langchain.prompts import PromptTemplate

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow up Input: {question}
Standalon question:""")

qa = ConversationalRetrievalChain.from_llm(llm=llm, 
                                           retriever=db.as_retriever(),
                                           condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                                           return_source_documents=True,
                                           verbose=False)

In [9]:
chat_history = []
query = """What is LoRA and QLoRA?"""
result = qa({"question" : query, "chat_history" : chat_history})
print(result['answer'])

  result = qa({"question" : query, "chat_history" : chat_history})


The question refers to two different concepts in computer science, namely "LoRA" (low-rank adaptation) and "QLoRA" (quantized low-rank adaptation). 

LoRA is a parameter-efficient fine-tuning method designed to reduce the resource requirements of adapting large language models to specific tasks while maintaining high performance. It introduces small low-rank matrices into specific layers, typically in the attention blocks of transformers, and adjusts them for specific tasks. LoRA reduces the number of trainable parameters, enabling fine-tuining of large models with significant efficiency gains.

QLoRA is a similar method that extends LoRA by incorporating quantization techniques to further reduce memory usage and computation costs. It applies 4-bit quantization to the base model's weights, storing each weight in a lower-precision format (e.g., from 16-bit or 32-bit float to 4-bit integer). This significantly reduces memory usage and computation costs while maintaining performance.


#### RAG Evaluation

In [10]:
sample_query = [
  {
    "question": "What is Generative AI, and how does it differ from traditional AI?",
    "expected_answer": "Generative AI refers to models that can create new content such as text, images, or code. Unlike traditional AI, which typically predicts or classifies existing data, generative AI can generate novel outputs based on learned patterns.",
    "reference": "Introduction or 'What is Generative AI?' section"
  },
  {
    "question": "Name three popular types of generative models and provide one use case for each.",
    "expected_answer": "1. GANs (Generative Adversarial Networks) – used for image synthesis.\n2. VAEs (Variational Autoencoders) – used for data compression and generation.\n3. LLMs (Large Language Models) – used for text generation like chatbots.",
    "reference": "Chapter on Generative AI Models / Types of Models"
  },
  {
    "question": "What is LoRA in LLM fine-tuning, and why is it used?",
    "expected_answer": "LoRA (Low-Rank Adaptation) is a technique to fine-tune large models efficiently by training only a small subset of parameters. It reduces computation and memory requirements while still improving task-specific performance.",
    "reference": "Section on Efficient Fine-Tuning Techniques"
  },
  {
    "question": "How does a Retrieval-Augmented Generation (RAG) system improve answer accuracy?",
    "expected_answer": "RAG improves accuracy by retrieving relevant documents or knowledge from a database (or PDF, website, etc.) and conditioning the generated output on that retrieved content, combining retrieval with generation.",
    "reference": "Chapter on RAG Architecture or Retrieval + Generation"
  },
  {
    "question": "What are embeddings in the context of document retrieval?",
    "expected_answer": "Embeddings are numerical vector representations of text that capture semantic meaning, allowing similarity search for retrieval of relevant documents.",
    "reference": "Section on Embeddings or Vector Databases"
  },
  {
    "question": "What are common evaluation metrics for generative AI systems like RAG?",
    "expected_answer": "Common metrics include BLEU, ROUGE, METEOR for text quality; human evaluation for relevance and accuracy; and retrieval accuracy metrics like Precision@k or Recall@k.",
    "reference": "Evaluation or Metrics section"
  },
]


In [11]:
import pandas as pd

df = pd.DataFrame(sample_query)
df.head()

Unnamed: 0,question,expected_answer,reference
0,"What is Generative AI, and how does it differ ...",Generative AI refers to models that can create...,Introduction or 'What is Generative AI?' section
1,Name three popular types of generative models ...,1. GANs (Generative Adversarial Networks) – us...,Chapter on Generative AI Models / Types of Models
2,"What is LoRA in LLM fine-tuning, and why is it...",LoRA (Low-Rank Adaptation) is a technique to f...,Section on Efficient Fine-Tuning Techniques
3,How does a Retrieval-Augmented Generation (RAG...,RAG improves accuracy by retrieving relevant d...,Chapter on RAG Architecture or Retrieval + Gen...
4,What are embeddings in the context of document...,Embeddings are numerical vector representation...,Section on Embeddings or Vector Databases


In [12]:
retriver = db.as_retriever()

In [13]:
def process_query(query):
    chat_history = []
    result = qa({"question" : query, "chat_history" : chat_history})
    relevant_docs = retriver.invoke(query)
    print(result['answer'])
    return result['answer'], relevant_docs

In [14]:
process_query("What are embeddings in the context of document retrieval?")

Embedding is a technique used in document retrieval that converts both queries and documents into vector representations in a continuous space. In this context, dense retrieval techniques use neural network-based embedding to convert both queries and documents into vector representations in a continuous space. This allows for efficient batch retrieval and is suitable for large datasets. Cross-Encoder and Bi-Encoder approaches are also used in document retrieval, but they often yield better accuracy than the dense embedding model. Supervised contrastive learning is another approach used in document retrieval that uses a dense embedding model to learn to map both queries and documents into a shared embedding space.


('Embedding is a technique used in document retrieval that converts both queries and documents into vector representations in a continuous space. In this context, dense retrieval techniques use neural network-based embedding to convert both queries and documents into vector representations in a continuous space. This allows for efficient batch retrieval and is suitable for large datasets. Cross-Encoder and Bi-Encoder approaches are also used in document retrieval, but they often yield better accuracy than the dense embedding model. Supervised contrastive learning is another approach used in document retrieval that uses a dense embedding model to learn to map both queries and documents into a shared embedding space.',
 [Document(id='ae6c123f-c985-4ef3-8cbc-7a393dd2aaba', metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2024-11-19T15:02:50+00:00', 'title': 'GEN AI', 'moddate': '2024-11-19T15:02:43+00:00', 'keywords': 'DAGWE7dsYlU,BAGVIRFoDTA', 'author': 'Aditya Kumar',

### RAGAS

In [15]:
results = []

for _, row in df.iterrows():
    question = row['question']
    ground_truth = row['expected_answer']
    
    answer, relevent_docs = process_query(question)
    
    results.append({
        "user_input" : question,
        "reference" : ground_truth,
        "response" : answer,
        "retrieved_contexts" : [relevent_docs[0].page_content]
    })

Generative AI (GenAI) refers to a subset of artificial intelligence that focuses on generating new data, content, or solutions by learning from existing patterns. Unlike traditional AI models, which are typically used for tasks like classification or prediction, GenAI operates by understanding the underlying structure of the input data and using that knowledge to generate something new. This differs from supervised learning, which requires labeled data, in that GenAI learns from unlabelled data. Unsupervised learning techniques such as Generative Adversarial Networks (GANs) and Variational Autoencoders (VAEs) are used for this purpose. The ability to generate novel content has broad applications in various industries, including marketing, e-commerce, and entertainment platforms.
1. Generative Adversarial Networks (GANs): Consists of two neural networks - the generator and the discriminator - that compete with each other. The generator creates synthetic data, while the discriminator tri

In [16]:
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

In [17]:
evaluation_dataset = EvaluationDataset.from_list(results)   

In [18]:
evaluator_llm = LangchainLLMWrapper(llm)

In [None]:
ragas_result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
    llm=evaluator_llm
)

In [None]:
ragas_result

{'context_recall': 1.0000, 'faithfulness': nan, 'factual_correctness(mode=f1)': nan}

#### **n-Gram Metrics**

- BELU
- ROUGE

In [21]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

In [22]:
rouge = Rouge()
bleu = []
rouge_one = []

In [23]:
for item in results:
    reference = [item["reference"].split()]
    hypothesis = item["response"].split()
    bleu.append(sentence_bleu(reference, hypothesis))
    
    scores = rouge.get_scores(item["response"], item["reference"])[0]
    rouge_one.append(scores["rouge-1"]["f"])

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [24]:
bleu

[4.637755227760251e-155,
 2.070997392539222e-155,
 0.05584652031241,
 4.133174579986453e-155,
 2.272930923020976e-155,
 2.1646176841440953e-155]

In [25]:
rouge_one

[0.3418803378303748,
 0.11881187894520145,
 0.35294117240099965,
 0.2782608657088847,
 0.17499999651250006,
 0.1384615358579882]

#### Model-Based Metrics

- BARTScore
- BERTScore

In [26]:
from transformers import BartForConditionalGeneration, BarthezTokenizer
from bert_score import BERTScorer
import torch

In [None]:
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

bart_tokeinzer = BarthezTokenizer.from_pretrained("facebook/bart-large-cnn")

bert_score = BERTScorer(lang="en", rescale_with_baseline=True)

In [None]:
bert_scores = []
bart_scores = []

for item in results:
  inputs = bart_tokeinzer(item["response"], return_tensors='pt', truncation=True, padding=True )
  
  with torch.no_grad():
    bart_score = bart_model(**inputs).logits
  bart_scores.append(bart_score.mean().item())

  P, R, F1 = bart_score.score([item["response"]], [item["reference"]])
  
  bert_scores.append(F1.numpy().mean())

In [None]:
bart_scores

In [None]:
bert_scores