### This is a starter kit that serves as the initial setup. Use it as a base and build upon it.

In [None]:
!pip install -q google-generativeai rouge-score tqdm scikit-learn

# if running the code on local machine (say, VS Code)
!pip install python-dotenv

In [None]:
import pandas as pd

# Show full cell contents instead of truncating long text columns.
pd.options.display.max_colwidth = None

# The dataset is a CSV hosted on the project's GitHub Pages site.
url = "https://mlig-iitmbs.github.io/somsubhra-promptingtalk/assets/squad-simplified.csv"
df = pd.read_csv(url)

# Preview the first five rows.
df.head(5)

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.01,
    random_state=42,
)

# Report the (rows, columns) shape of each split.
for split_name, split_frame in (("Training", train_df), ("Validation", val_df)):
    print(f"{split_name} set: {split_frame.shape}")

## We will do few-shot prompting with $10$ random samples from the train set and ask the [LLM](https://en.wikipedia.org/wiki/Gemini_(language_model)) to predict on the validation set.


*For more info on the model we're using, refer to the [docs](https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-lite) & their [rate limits](https://ai.google.dev/gemini-api/docs/rate-limits)!*

---

In [None]:
import google.generativeai as genai
import random
from rouge_score import rouge_scorer
from tqdm import tqdm

# If running on Colab, store your API key in Colab secrets, give the notebook
# access to it, and use these two lines instead of the `.env` loading below:
#
#   from google.colab import userdata
#   genai.configure(api_key=userdata.get('GEMINI_API_KEY'))

# For a local machine, please store the API key inside a `.env` file.
from dotenv import load_dotenv
import os
import warnings

load_dotenv()  # loads the variables from the `.env` file into the environment

api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    # Surface a missing key here: otherwise the first symptom is failed
    # requests much later in the notebook.
    warnings.warn(
        "GEMINI_API_KEY was not found in the environment or the `.env` file; "
        "requests to the Gemini API will likely fail."
    )
genai.configure(api_key=api_key)

# Model handle used for every generation request in this notebook.
model = genai.GenerativeModel("gemini-2.5-flash-lite")

In [None]:
# Pick 10 random rows from the `train` set as few-shot demonstrations
# (the fixed seed selects the same rows on every run).
few_shots = train_df.sample(10, random_state=42)

# Task instruction placed at the start of every prompt.
# Adjacent string literals are joined with no separator, so the space after
# the first sentence has to be written explicitly.
instruction = (
    "You are a question answering model. "
    "Given a context and a question, produce a concise and accurate answer."
)

# Few-shot examples: one Context / Question / Answer triple per sampled row,
# each preceded by a blank line.
examples = "".join(
    f"\n\nContext: {row['context']}\n"
    f"Question: {row['question']}\n"
    f"Answer: {row['text']}"
    for _, row in few_shots.iterrows()
)

In [None]:
predictions = []   # model answers, one per validation row
references = []    # ground-truth answers, aligned with `predictions`
failed_rows = []   # index labels of validation rows whose request failed

# The instruction and few-shot examples are the same for every row,
# so that part of the prompt is built once outside the loop.
prompt_prefix = (
    f"{instruction}\n\nHere are some examples:{examples}\n\n"
    f"Now answer this new question:\n\n"
)

for idx, row in tqdm(val_df.iterrows(), total=len(val_df)):
    # main prompt: shared prefix + the row to be answered
    prompt = (
        f"{prompt_prefix}"
        f"Context: {row['context']}\n"
        f"Question: {row['question']}\n"
        f"Answer:"
    )

    try:
        response = model.generate_content(prompt)
        pred = response.text.strip()
    except Exception as e:
        # Best effort: keep going with an empty prediction, but report the
        # failure so that API errors are not mistaken for poor model answers.
        pred = ""
        failed_rows.append(idx)
        tqdm.write(f"[row {idx}] generation failed: {type(e).__name__}: {e}")

    # appending the predicted outputs & ground truths in respective lists
    predictions.append(pred)
    references.append(row["text"])

if failed_rows:
    print(
        f"{len(failed_rows)} of {len(val_df)} requests failed "
        "and were recorded as empty predictions."
    )


In [None]:
# Model outputs collected by the prediction loop, in validation-row order
# (a failed request is stored as an empty string).
predictions

In [None]:
# Ground-truth answers (the `text` column) for the same rows, aligned
# position-by-position with `predictions`.
references

---

ROUGE-N measures the number of matching n-grams between the model-generated text and the reference.

▸ Consider a reference $R$ and a candidate summary $C$:

$R$: The cat is on the mat.

$C$: The cat and the dog.

Say, for ROUGE-1:

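$P$ (precision): ratio of the number of unigrams in C that also appear in R, over the number of unigrams in C. In the example above, $P = 3/5$.

$R$ (recall; not to be confused with the reference $R$ above): ratio of the number of unigrams in R that also appear in C, over the number of unigrams in R. In the example above, $R = 3/6$.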

$$
F_1 = \frac{2 \cdot P \cdot R}{P + R}
$$


In [None]:
# Evaluate the generated answers with ROUGE-1, a score in [0, 1] where
# higher means closer agreement with the reference answer.

scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)

# One ROUGE-1 F-measure per (reference, prediction) pair.
scores = {
    "rouge1": [
        scorer.score(target, generated)["rouge1"].fmeasure
        for target, generated in zip(references, predictions)
    ]
}

rouge1_values = scores["rouge1"]
mean_rouge1 = sum(rouge1_values) / len(rouge1_values)
print(f"Average ROUGE-1: {mean_rouge1:.4f}")


## Now, try to improve the score by refining the prompts, as discussed in the session. Keep experimenting. 😇