In [66]:
import time
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
from IPython.display import clear_output

# Read the local .env file into the process environment, then pass the
# Gemini key found there to the client library.
load_dotenv()
API_KEY = os.environ.get('GEMINI_API_KEY')
genai.configure(api_key=API_KEY)


In [64]:
import pickle
# Restore the previously pickled dataset into `df`.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
with open('../Dataset/dataset.pkl', 'rb') as dataset_file:
    df = pickle.load(dataset_file)

In [67]:
# Add a "Q_A" column holding None for every row; the generation loop below
# fills it row by row with the parsed model output via `df.at`.
df["Q_A"] = None
# Preview the first two rows to confirm the new column is present.
df.head(2)

Unnamed: 0,pdfs,text,Q_A
0,../Dataset/Not_garb\andrew-ng-machine-learning...,Draft Version MACHINE LEARNING Technical Strat...,
1,../Dataset/Not_garb\andrew-ng-machine-learning...,37 How to decide whether to use all your data ...,


In [70]:
import re
def clean_json_string(json_string):
    """Strip a Markdown code fence from a model reply and flatten its newlines.

    The reply is trimmed, an opening fence (```` ```json ````, any letter case,
    or a bare ```` ``` ````) and a closing ```` ``` ```` are removed together
    with the whitespace next to them, and every remaining line break (``\\n``
    or ``\\r\\n``) is replaced by a single space so that raw newlines inside
    JSON string values do not make ``json.loads`` fail.

    Args:
        json_string: Raw text returned by the model.

    Returns:
        The cleaned text, ready to be passed to ``json.loads``.
    """
    text = json_string.strip()
    # Opening fence: tolerate CRLF, an upper-case tag, or no language tag.
    text = re.sub(r'^```(?:json)?\s*', '', text, flags=re.IGNORECASE)
    # Closing fence, with or without a line break in front of it.
    text = re.sub(r'\s*```$', '', text)
    text = re.sub(r'\r?\n', ' ', text)  # Replace actual newlines
    return text

def is_json(text):
    """Try to parse ``text`` as JSON after cleaning it with ``clean_json_string``.

    Returns:
        A ``(ok, value)`` tuple: ``(True, parsed_value)`` when the cleaned text
        decodes as JSON, otherwise ``(False, None)``.
    """
    cleaned = clean_json_string(text)
    try:
        decoded = json.loads(cleaned)
    except json.JSONDecodeError:
        return False, None
    return True, decoded
    
def gemini_request_with_retry(model, contents=None, max_attempts=2, retry_delay=1):
    """Request content from ``model`` and return its reply parsed as JSON.

    The request is repeated until a reply parses as JSON or the attempts are
    used up.

    Args:
        model: Object exposing ``generate_content(contents)`` whose result has
            a ``text`` attribute (the notebook passes ``genai.GenerativeModel``
            instances).
        contents: Payload forwarded unchanged to ``model.generate_content``.
        max_attempts: Total number of requests to make; the default of 2 means
            one retry.
        retry_delay: Seconds to wait before each retry.

    Returns:
        The value parsed by ``is_json`` from the first reply that is valid
        JSON, or ``None`` when no attempt produced valid JSON.
    """
    for attempt in range(max_attempts):
        if attempt:
            # Pause between attempts only. The sleep used to sit after the
            # final ``return`` and therefore never ran.
            time.sleep(retry_delay)
        response = model.generate_content(contents)
        try:
            raw_text = response.text
        except ValueError:
            # The ``text`` accessor raises ValueError when the reply holds no
            # usable text part; count that as a failed attempt and retry.
            continue
        # Parse once and reuse the result instead of decoding the reply twice.
        ok, parsed = is_json(raw_text)
        if ok:
            return parsed
    return None


# Usage example


# Instruction text sent ahead of every document chunk. It asks the model to
# reply with nothing but a JSON array of {"Q": ..., "A": ...} objects, which
# is the shape the parsing helpers and the flattening cell rely on.
prompt = """You will create a dataset of multiple question-answer pairs from the given data.
Only use subject of the data as the topic of the question. These questions and answers will be used as a dataset to train a machine learning instructor chatbot.

Please respond ONLY with a JSON array of objects, each with two fields:
[
  {
    "Q": "Question text",
    "A": "Answer text"
  },
  ...
]
No extra text or explanations — just the JSON array."""

# Build each model client once: the constructor arguments are constants, so
# there is no reason to re-create the clients for every row.
primary_model = genai.GenerativeModel("gemini-2.0-flash")
fallback_model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")

total_rows = len(df["text"])
# Iterate over (index label, text) pairs so the label used to read the text is
# the same label used to write the result back with ``df.at``.
for done, (i, text) in enumerate(df["text"].items(), start=1):
    # Prompt and data are combined into a single user message.
    contents_for_generation = [
        {
            "role": "user",
            "parts": [
                {"text": prompt + " \n" + "Here is the data: " + text}
            ]
        }
    ]

    response_text = gemini_request_with_retry(primary_model, contents=contents_for_generation)
    if response_text is None:
        # The primary model never produced valid JSON; give the fallback model
        # the same request. Its result may still be None.
        response_text = gemini_request_with_retry(fallback_model, contents=contents_for_generation)
    df.at[i, "Q_A"] = response_text

    print(f"Progress: {done}/{total_rows}")
    # wait=True defers the clear until the next output arrives, so the counter
    # is replaced in place and the final line stays visible.
    clear_output(wait=True)
df.to_csv("../Dataset/dataset.csv", index=False)

Progress: 146/146


In [74]:
# Inspect the Q/A pairs generated for the row labelled 0.
df.at[0, "Q_A"]

[{'Q': 'What is the goal of this machine learning technical strategy guide?',
  'A': 'To help you or your team work on a machine learning application and make rapid progress.'},
 {'Q': "What does the guide assume about the reader's machine learning experience?",
  'A': 'It assumes the reader has taken a machine learning course or has experience applying supervised learning.'},
 {'Q': 'What are two major drivers of recent machine learning progress?',
  'A': 'Data availability and computational scale.'},
 {'Q': 'What is the purpose of dev and test sets in machine learning?',
  'A': 'To direct your team toward the most important changes to make to the machine learning system.'},
 {'Q': 'How should you choose dev and test sets?',
  'A': 'Choose dev and test sets to reflect data you expect to get in the future and want to do well on.'},
 {'Q': 'What should you do if your team develops a system that works well on the dev set but not the test set, and the sets have different distributions?',


In [72]:
# Show the rows that still have no Q/A pairs. `== None` compares elementwise
# and is False for every element, so it can never find them; `.isna()` does.
df.loc[df["Q_A"].isna(), "Q_A"]

Series([], Name: Q_A, dtype: object)

In [73]:
# Total number of Q/A pairs across all rows.
sum(len(qa_pairs) for qa_pairs in df["Q_A"])

2592

In [None]:
# Rows that need another attempt. Failed generations are stored as None by the
# main loop; the legacy sentinel string is still matched in case the frame
# contains it from an older run.
needs_retry = df["Q_A"].isna() | (df["Q_A"] == "API Call Failed or No Content")
failed_indices = df.index[needs_retry]

# gemini_request_with_retry expects a model object as its first argument, not
# a model name, so build the client here once.
retry_model = genai.GenerativeModel("gemini-1.5-flash")

for i in failed_indices:
    text = df["text"][i]

    # Prompt and data are combined into a single user message.
    contents_for_generation = [
        {
            "role": "user",
            "parts": [
                {"text": prompt + " \n" + "Here is the data: " + text}
            ]
        }
    ]

    response_text = gemini_request_with_retry(retry_model, contents=contents_for_generation)
    if response_text is not None:
        df.at[i, "Q_A"] = response_text
    else:
        print("Failed at index", i)

In [75]:
import pandas as pd
# Flatten the per-row lists of {"Q": ..., "A": ...} objects into one
# question/answer table. Records are collected in a plain list and the frame
# is built once, instead of re-copying a growing frame with pd.concat.
qa_records = []
for j, sample in df["Q_A"].items():
    if not isinstance(sample, list):
        # Rows whose generation failed (None) or returned a non-array value
        # are reported and skipped rather than aborting the whole cell.
        print(f"error at {j}: expected a list of Q/A objects, got {type(sample).__name__}")
        continue
    for i, pair in enumerate(sample):
        try:
            qa_records.append({"question": pair["Q"], "answer": pair["A"]})
        except (TypeError, KeyError):
            # The entry is not a mapping, or lacks the "Q"/"A" key.
            print(f"error at {j} and {i}")
# Explicit columns keep the header stable even when no record was collected.
df2 = pd.DataFrame(qa_records, columns=["question", "answer"])
df2.to_csv("../Dataset/dataset_QA.csv", index=False)