In [1]:
# LangChain-based pipeline for MCQ-based document clustering
# Requires: langchain, openai, sentence-transformers, scikit-learn, numpy, python-dotenv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from sklearn.cluster import KMeans
import numpy as np
import json
import ast


# Step 1: Load and preprocess documents
def load_documents(json_path):
    """Load document texts from a JSON file, one single-line string per entry.

    Args:
        json_path: Path to a JSON file whose top-level value is iterable and
            whose entries each expose a "text" key holding a string.

    Returns:
        list: For every entry, its "text" value with surrounding whitespace
        stripped and each newline replaced by a single space.

    Raises:
        KeyError: If an entry has no "text" key.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding, open() decodes with
    # the platform locale and can fail or garble non-ASCII posts.
    with open(json_path, "r", encoding="utf-8") as f:
        docs = json.load(f)
    return [doc["text"].strip().replace("\n", " ") for doc in docs]


# Step 2: Generate diagnostic MCQs from the corpus
def extract_json_block(text):
    """Return the span from the first "[" to the last "]" in ``text``.

    Used to peel a JSON array out of a reply that wraps it in extra text.

    Args:
        text: The raw string to search.

    Returns:
        str: ``text[first "[" : last "]" + 1]`` when a "[" occurs before a
        later "]"; otherwise ``text`` unchanged.
    """
    start = text.find("[")
    end = text.rfind("]")
    # The closing bracket must come after the opening one. If the only "]"
    # precedes the first "[", slicing would yield an empty string and the
    # caller would lose the whole reply, so fall through and return it as-is.
    if start != -1 and end > start:
        return text[start : end + 1]
    return text


def generate_mcqs(docs, llm):
    """Ask the LLM for multiple-choice questions based on a sample of ``docs``.

    The first 10 documents are joined with blank lines and inserted into a
    fixed prompt that requests 3 four-option questions as a JSON array. The
    raw reply is printed, then parsed.

    Args:
        docs: Sequence of document strings; only ``docs[:10]`` is sent.
        llm: Language model object handed to ``LLMChain``.

    Returns:
        The parsed reply. The prompt asks for a list of objects with
        "question" and "options" keys, but the structure is not validated
        here.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the reply can
            be parsed neither as JSON nor as a Python literal.
    """
    joined_text = "\n\n".join(docs[:10])  # limit to 10 for brevity
    prompt = PromptTemplate.from_template(
        """
        Given the following sports-related forum posts:
        {joined_text}

        It contains a diverse set of text entries from what appears to be sports discussions, possibly from forums or mailing lists, mostly centered around hockey and baseball.

        Generate 3 multiple-choice questions that help distinguish subtopics, such as hockey or baseball of discussion.

        Each question should have 4 options (A, B, C, D), including “None of the above” as one of the answer choices for every question.

        Format your response as a valid JSON array like this:

        [
          {{"question": "What is the main topic of the post?", "options": ["A. Player analysis", "B. Media complaints", "C. Statistics discussion", "D. None of the above"]}},
          ...
        ]
        Only return the JSON — no prose or commentary.
        """
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(joined_text=joined_text)
    print("Raw LLM output:", response)

    # 1) The reply is already clean JSON.
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    # 2) The reply wraps the array in extra text (e.g. a code fence): parse
    #    the bracketed span as JSON so true/false/null are still accepted.
    block = extract_json_block(response)
    try:
        return json.loads(block)
    except json.JSONDecodeError:
        # 3) Last resort for Python-style output such as single-quoted strings.
        return ast.literal_eval(block)


# Step 3: Use LLM to answer MCQs for each document
def _first_choice_letter(response):
    """Return the first alphanumeric character of ``response`` upper-cased, or "" if none."""
    for ch in response:
        if ch.isalnum():
            return ch.upper()
    return ""


def answer_mcqs(doc, mcqs, llm):
    """Have the LLM answer every MCQ for one document.

    Args:
        doc: Document text inserted into the prompt.
        mcqs: Iterable of mappings, each with a "question" string and an
            "options" list of strings (joined with spaces in the prompt).
        llm: Language model object handed to ``LLMChain``.

    Returns:
        list[str]: One entry per MCQ, in order: the first alphanumeric
        character of the model's reply, upper-cased. An empty or
        punctuation-only reply yields "" instead of raising, so replies like
        "(B)" or "b" are read as "B".
    """
    # The template and chain do not depend on the individual question, so
    # build them once instead of on every loop iteration. The indentation
    # inside the literal is part of the prompt text and is kept unchanged.
    q_prompt = PromptTemplate.from_template(
        """
            Given the following document:
            "{doc}"

            Answer the following question based only on its content:
            {question}
            Options: {options}

            Respond with the letter only (A, B, C, or D).
            """
    )
    chain = LLMChain(llm=llm, prompt=q_prompt)

    answers = []
    for mcq in mcqs:
        response = chain.run(
            doc=doc, question=mcq["question"], options=" ".join(mcq["options"])
        )
        answers.append(_first_choice_letter(response))
    return answers


# Step 4: One-hot encode answers and cluster
def cluster_answers(answer_matrix, n_clusters=4):
    """One-hot encode per-document MCQ answers and cluster them with KMeans.

    Each question contributes one column per known letter (A-D); a cell is 1
    when the document answered that question with that letter. Answers that
    are not one of A-D leave their question's columns at 0.

    Args:
        answer_matrix: Sequence of rows, one per document, each a sequence of
            answer letters (one per question).
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        Array of cluster labels, one per row of ``answer_matrix``; an empty
        integer array when ``answer_matrix`` is empty.
    """
    # Nothing to cluster: return early instead of failing on answer_matrix[0].
    if len(answer_matrix) == 0:
        return np.empty(0, dtype=int)

    letter_to_index = {"A": 0, "B": 1, "C": 2, "D": 3}
    n_options = len(letter_to_index)
    # Size by the widest row so a longer later row cannot index out of bounds.
    n_questions = max(len(answers) for answers in answer_matrix)

    encoded = np.zeros((len(answer_matrix), n_questions * n_options))
    for i, answers in enumerate(answer_matrix):
        for j, a in enumerate(answers):
            if a in letter_to_index:
                encoded[i, j * n_options + letter_to_index[a]] = 1

    # n_init is pinned to 10 (the value the emitted warning names as the
    # current default) so results stay the same and the warning goes away.
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(encoded)
    return km.labels_

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()  # This loads variables from .env into os.environ
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Stop here with an actionable message rather than failing later inside
    # the client with a less obvious error.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to .env or export it before running."
    )

# temperature=0 is requested to keep the model's answers as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
docs = load_documents("json/20ng/topic_9_10_documents.json")

# Show the corpus size and a short preview; echoing every document floods
# the cell output and bloats the saved notebook.
print(f"Loaded {len(docs)} documents")
docs[:3]

  llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


["I am sure some bashers of Pens fans are pretty confused about the lack of any kind of posts about the recent Pens massacre of the Devils. Actually, I am  bit puzzled too and a bit relieved. However, I am going to put an end to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they are killing those Devils worse than I thought. Jagr just showed you why he is much better than his regular season stats. He is also a lot fo fun to watch in the playoffs. Bowman should let JAgr have a lot of fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final regular season game.          PENS RULE!!!",
 "[stuff deleted]  Ok, here's the solution to your problem.  Move to Canada.  Yesterday I was able to watch FOUR games...the NJ-PITT at 1:00 on ABC, LA-CAL at 3:00 (CBC),  BUFF-BOS at 7:00 (TSN and FOX), and MON-QUE at 7:30 (CBC).  I think that if each series goes its max I could be wat

In [3]:
# Derive the diagnostic questions from the corpus.
mcqs = generate_mcqs(docs, llm)

# Answer every question for each of the first 20 documents (kept small for testing).
answer_matrix = list(map(lambda text: answer_mcqs(text, mcqs, llm), docs[:20]))

# Cluster the documents by their answer patterns.
labels = cluster_answers(answer_matrix)

# Report the cluster assigned to each document, in input order.
for doc_idx in range(len(labels)):
    print(f"Document {doc_idx} -> Cluster {labels[doc_idx]}")

  chain = LLMChain(llm=llm, prompt=prompt)
  response = chain.run(joined_text=joined_text)


Raw LLM output: [
  {"question": "What sport is primarily discussed in the first post?", "options": ["A. Baseball", "B. Basketball", "C. Hockey", "D. None of the above"]},
  {"question": "Which player is mentioned in relation to the NHL's all-time goal scorers?", "options": ["A. Jack Morris", "B. Wayne Gretzky", "C. Mike Trambley", "D. None of the above"]},
  {"question": "What complaint is raised regarding ESPN's coverage?", "options": ["A. Too much focus on hockey", "B. Ignoring baseball highlights", "C. Cutting off overtime for baseball news", "D. None of the above"]}
]
Document 0 -> Cluster 0
Document 1 -> Cluster 0
Document 2 -> Cluster 0
Document 3 -> Cluster 0
Document 4 -> Cluster 1
Document 5 -> Cluster 2
Document 6 -> Cluster 1
Document 7 -> Cluster 0
Document 8 -> Cluster 0
Document 9 -> Cluster 3
Document 10 -> Cluster 1
Document 11 -> Cluster 0
Document 12 -> Cluster 0
Document 13 -> Cluster 1
Document 14 -> Cluster 0
Document 15 -> Cluster 1
Document 16 -> Cluster 1
Docum

  super()._check_params_vs_input(X, default_n_init=10)


In [4]:
# Write the generated MCQs to disk as JSON indented by 4 spaces.
with open("json/20ng/langchain_mcqs_v2.json", "w") as out_file:
    out_file.write(json.dumps(mcqs, indent=4))