## Step 1 (optional): Generate a summary of a corpus of documents

In [1]:
# LangChain-based pipeline for MCQ-based document clustering

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from sklearn.cluster import KMeans
import numpy as np
import json
import ast


def load_documents(json_path):
    """Load document texts from a JSON file.

    The file must contain a JSON array of objects, each with a ``"text"`` key.

    Args:
        json_path: Path to the JSON file.

    Returns:
        A list of strings, one per document, with surrounding whitespace
        stripped and every newline replaced by a single space.

    Raises:
        KeyError: If an entry has no ``"text"`` key.
    """
    # JSON is UTF-8 by specification; pin the encoding so the result does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(json_path, "r", encoding="utf-8") as f:
        docs = json.load(f)
    return [doc["text"].strip().replace("\n", " ") for doc in docs]


def extract_json_block(text):
    """Return the outermost JSON-looking span of ``text``.

    Used to strip Markdown code fences or stray prose that an LLM wraps
    around its JSON answer.

    A candidate span runs from the first opening bracket to the last matching
    closing bracket, for arrays (``[...]``) and objects (``{...}``). The array
    span is preferred, unless an object span strictly encloses it, in which
    case the whole object is returned (e.g. ``{"summary": "a [b] c"}``).

    Args:
        text: Raw text that may contain a JSON array or object.

    Returns:
        The extracted substring, or ``text`` unchanged when no well-formed
        span (opening bracket before its closing bracket) is found.
    """
    spans = []
    # Arrays are listed first so that, all else equal, the array span wins.
    for opener, closer in (("[", "]"), ("{", "}")):
        start = text.find(opener)
        end = text.rfind(closer)
        # Require the closer to come after the opener; otherwise slicing
        # would yield an empty or meaningless string.
        if start != -1 and end > start:
            spans.append((start, end))
    if not spans:
        return text

    best_start, best_end = spans[0]
    for start, end in spans[1:]:
        # Switch only to a span that fully wraps the current choice.
        if start < best_start and end > best_end:
            best_start, best_end = start, end
    return text[best_start : best_end + 1]


def generate_summary(docs, llm, size=1000):
    """Ask the LLM for a JSON summary of a random sample of the corpus.

    Args:
        docs: Sequence of document strings.
        llm: Language model handed to ``LLMChain``.
        size: Maximum number of documents to put in the prompt. It is capped
            at ``len(docs)``; keep it small enough for the model's input
            token limit.

    Returns:
        The parsed model response. When the model follows the requested
        format this is a dict such as ``{"summary": "..."}``.

    Raises:
        ValueError: If ``docs`` is empty, or the response cannot be parsed.
        SyntaxError: If the response cannot be parsed as JSON or as a Python
            literal.
    """
    if len(docs) == 0:
        raise ValueError("docs must contain at least one document")
    # Sampling without replacement cannot draw more items than exist.
    size = min(size, len(docs))

    # A private RandomState seeded with 123 draws the same sample as
    # `np.random.seed(123); np.random.choice(...)` but leaves the global
    # NumPy RNG untouched for the rest of the notebook.
    rng = np.random.RandomState(123)
    indices = rng.choice(len(docs), size=size, replace=False)
    # join the documents into a single string
    joined_text = "\n\n".join(docs[i] for i in indices)
    prompt = PromptTemplate.from_template(
        """
        Given the following corpus of {size} documents, please summarize the main topics discussed in the documents. The documents are separated by two newlines.

        Format your response as a valid JSON object like this:

        {{"summary": "Main topic summary"}}
        Only return the JSON — no prose or commentary.

        --Documents--:
        {joined_text}
        """
    )
    # NOTE(review): the saved notebook output shows warnings pointing at the
    # LLMChain(...) and chain.run(...) calls — presumably deprecations;
    # confirm against the installed LangChain version before migrating.
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(joined_text=joined_text, size=size)
    print("Raw LLM output:", response)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    # The prompt asks for a JSON object, so look for the outermost {...}
    # (handles code fences and stray prose). Fall back to the generic
    # extractor only when the response contains no object at all.
    start, end = response.find("{"), response.rfind("}")
    if 0 <= start < end:
        candidate = response[start : end + 1]
    else:
        candidate = extract_json_block(response)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Last resort for Python-literal style output (e.g. single quotes).
        return ast.literal_eval(candidate)

In [2]:
import os
from dotenv import load_dotenv

# Load environment configuration, build the summarization model, summarize the
# corpus and persist the result as JSON.
load_dotenv()  # This loads variables from .env into os.environ
# api_key is not referenced again in this cell.
# NOTE(review): presumably ChatOpenAI reads OPENAI_API_KEY from the environment itself — confirm.
api_key = os.getenv("OPENAI_API_KEY")

llm_summary = ChatOpenAI(
    model_name="gpt-4.1", temperature=0
)  # here we use gpt-4.1 for summarization since its context length is larger than gpt-4o
# `docs` is reused by the later MCQ-generation cell.
docs = load_documents("json/20ng/topic_9_10_documents.json")
summary = generate_summary(docs=docs, llm=llm_summary, size=1000)
# save the summary to a file
with open("json/20ng/topic_9_10_summary.json", "w") as f:
    json.dump(summary, f, indent=4)

  llm_summary = ChatOpenAI(
  chain = LLMChain(llm=llm, prompt=prompt)
  response = chain.run(joined_text=joined_text, size=size)


Raw LLM output: {"summary": "The main topics discussed in this corpus of 1000 documents are North American professional sports, with a strong focus on hockey (NHL) and baseball (MLB) in the early 1990s. The documents include detailed game summaries, player statistics, standings, and box scores for both sports, as well as discussions about team performance, player trades, and predictions for playoff outcomes. There is significant conversation about television and radio coverage of games, fan experiences, and the impact of league expansion and realignment. Other recurring themes include debates over player value, Hall of Fame candidacies, the merits of various statistical measures, and the influence of management decisions. The corpus also contains discussions about the representation of European players in the NHL, the role of women in sports, and the intersection of sports with broader cultural and social issues. Additionally, there are references to college and minor league hockey, fa

## Step 2: Based on the summary, draft your own instructions in the prompt for the LLM to generate MCQs

In [3]:
def generate_mcqs(docs, llm, size=100):
    """Ask the LLM for multiple-choice questions that separate subtopics.

    A random sample of the documents is embedded in a prompt that requests
    three four-option questions as a JSON array.

    Args:
        docs: Sequence of document strings.
        llm: Language model handed to ``LLMChain``.
        size: Maximum number of documents to put in the prompt. It is capped
            at ``len(docs)``; keep it small enough for the model's input
            token limit.

    Returns:
        The parsed model response. When the model follows the requested
        format this is a list of dicts with ``"question"`` and ``"options"``
        keys.

    Raises:
        ValueError: If ``docs`` is empty, or the response cannot be parsed.
        SyntaxError: If the response cannot be parsed as JSON or as a Python
            literal.
    """
    if len(docs) == 0:
        raise ValueError("docs must contain at least one document")
    # Sampling without replacement cannot draw more items than exist.
    size = min(size, len(docs))

    # A private RandomState seeded with 42 draws the same sample as
    # `np.random.seed(42); np.random.choice(...)` but leaves the global
    # NumPy RNG untouched for the rest of the notebook.
    rng = np.random.RandomState(42)
    indices = rng.choice(len(docs), size=size, replace=False)
    # join the documents into a single string
    joined_text = "\n\n".join(docs[i] for i in indices)
    prompt = PromptTemplate.from_template(
        """
        Given the following sports-related forum posts:
        {joined_text}

        It contains a diverse set of text entries from what appears to be sports discussions, possibly from forums or mailing lists, mostly centered around hockey and baseball.

        Generate 3 multiple-choice questions that help distinguish subtopics, such as hockey or baseball of discussion.

        Each question should have 4 options (A, B, C, D), including “None of the above” as one of the answer choices for every question.

        Format your response as a valid JSON array like this:

        [
          {{"question": "What is the main topic of the post?", "options": ["A. Player analysis", "B. Media complaints", "C. Statistics discussion", "D. None of the above"]}},
          ...
        ]
        Only return the JSON — no prose or commentary.
        """
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(joined_text=joined_text)
    print("Raw LLM output:", response)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    # Models often wrap the array in a ```json fence; cut out the bracketed
    # span and parse it as JSON first, so true/false/null are understood.
    candidate = extract_json_block(response)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Last resort for Python-literal style output (e.g. single quotes).
        return ast.literal_eval(candidate)

In [4]:
# Generate MCQs with a separate model instance, reusing the `docs` list loaded
# in the summary cell above (that cell must have been run first).
llm_mcqs = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
mcqs = generate_mcqs(docs, llm=llm_mcqs, size=100)

# save mcqs to json file
with open("json/20ng/topic_9_10_mcqs.json", "w") as f:
    json.dump(mcqs, f, indent=4)

Raw LLM output: ```json
[
  {"question": "What sport is primarily discussed in the majority of the posts?", "options": ["A. Baseball", "B. Hockey", "C. Basketball", "D. None of the above"]},
  {"question": "Which player is mentioned in relation to a significant injury update?", "options": ["A. Nolan Ryan", "B. Steve Howe", "C. Eli Manning", "D. None of the above"]},
  {"question": "What type of statistics are frequently referenced in the discussions?", "options": ["A. Player batting averages", "B. Goalie save percentages", "C. Team standings", "D. None of the above"]}
]
```
