In [None]:
import json
import os

import requests

# GitHub access token, read from the GITHUB_TOKEN environment variable so the
# secret never has to be pasted into (and saved with) the notebook. Falls back
# to an empty string when the variable is not set.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# GitHub GraphQL endpoint the discussion queries are POSTed to.
API_URL = "https://api.github.com/graphql"
# Repository whose discussions are fetched.
OWNER = "Lightning-AI"
REPO = "pytorch-lightning"
# NOTE(review): the code visible in this notebook section writes the filtered
# discussions to "../final_data/discussions.json" and does not reference this
# constant - confirm whether it is still needed.
OUTPUT_FILE = "../final_data/lightning_discussions_answered.json"

def get_discussions(max_discussions=1000, timeout=30):
    """Fetch the newest discussions of OWNER/REPO through the GitHub GraphQL API.

    Pages through the repository's discussions 100 at a time, newest first
    (ordered by CREATED_AT, descending). Paging stops when `max_discussions`
    have been collected, when the API reports no further page, or when a
    response carries no discussion data. Collection is best-effort: when a
    request yields nothing usable, a message is printed and the discussions
    gathered so far are returned.

    Reads the module-level GITHUB_TOKEN, API_URL, OWNER and REPO.

    Args:
        max_discussions: Upper bound on the number of discussions returned.
        timeout: Seconds passed to `requests.post` as the request timeout, so
            a stalled connection cannot hang the cell forever.

    Returns:
        A list of discussion dicts holding the fields selected in the query
        (title, url, createdAt, answerChosenAt, author, bodyText, answer and
        the first 20 comments).
    """
    query = """
    query($cursor: String) {
      repository(owner: "%s", name: "%s") {
        discussions(first: 100, after: $cursor, orderBy: {field: CREATED_AT, direction: DESC}) {
          pageInfo {
            hasNextPage
            endCursor
          }
          nodes {
            title
            url
            createdAt
            answerChosenAt
            author {
              login}
            bodyText
            answer {
              author {
                login
              }
              bodyText
            }
            comments(first: 20) {
              nodes {
                author {
                  login
                }
                bodyText
              }
            }
          }
        }
      }
    }
    """ % (OWNER, REPO)

    headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}

    all_discussions = []
    cursor = None  # None requests the first page; afterwards the previous page's endCursor

    while len(all_discussions) < max_discussions:
        response = requests.post(
            API_URL,
            json={"query": query, "variables": {"cursor": cursor}},
            headers=headers,
            timeout=timeout,
        )

        try:
            payload = response.json()
        except ValueError:
            # Body is not JSON (for example an HTML error page); treat it as
            # an empty result instead of crashing the whole collection.
            payload = {}
        if not isinstance(payload, dict):
            payload = {}

        if payload.get("errors"):
            print(f"GraphQL errors: {payload['errors']}")

        # A GraphQL error response can carry "data": null or
        # "repository": null. `.get(key, {})` would hand back that None
        # (the key exists), so fall back with `or {}` instead.
        repository = (payload.get("data") or {}).get("repository") or {}
        discussions_data = repository.get("discussions") or {}

        if not discussions_data:
            if not response.ok:
                print(
                    f"GitHub API responded with HTTP {response.status_code}: "
                    f"{payload.get('message', '')}"
                )
            print("No discussions found or API limit reached.")
            break

        # Skip null entries so callers can index every returned item as a dict.
        nodes = discussions_data.get("nodes") or []
        all_discussions.extend(node for node in nodes if node is not None)

        page_info = discussions_data.get("pageInfo") or {}
        if not page_info.get("hasNextPage"):
            break

        cursor = page_info.get("endCursor")

    return all_discussions[:max_discussions]

def _has_success_phrase(comment_nodes, phrases):
    """Return True as soon as one comment's lower-cased bodyText contains a phrase."""
    for node in comment_nodes:
        for phrase in phrases:
            if phrase in node["bodyText"].lower():
                return True
    return False


# heuristic: phrases in a comment that suggest the solution worked
# (other candidates, not in use: ["thanks", "appreciate", "great", "excellent", "perfect"])
good_keywords = ["worked", "thank you", "good answer", "fixed", "solved"]

# Keep discussions that have a marked answer (the `answer` field is not null)
# and at least one comment containing one of the phrases above.
data = get_discussions()
answered = []
ctr = 0
for discussion in data:
    has_marked_answer = discussion["answer"] is not None
    comment_nodes = discussion.get("comments", {}).get("nodes", [])
    confirmed = _has_success_phrase(comment_nodes, good_keywords)

    if not (has_marked_answer and confirmed):
        continue

    # Tag the record with where it is stored and its position in that file.
    discussion["label"] = "discussion"
    discussion["file"] = "../final_data/discussions.json"
    discussion["index"] = ctr
    ctr += 1
    # The comments were only needed for the heuristic; drop them before saving.
    del discussion["comments"]
    answered.append(discussion)


# --- Keep only the first 50 (the fetch is ordered newest first) ---
del answered[50:]

# --- Save to JSON ---
with open("../final_data/discussions.json", "w") as out_file:
    json.dump(answered, out_file, indent=2)

print(len(answered))


21


In [78]:
import os
# Load the three corpora (documentation, discussions, source code), build one
# text string per entry, then copy those strings into the "text" field of the
# relevant documents listed in the requests file.
doc_links = []
doc_dir = "../final_data"
doc_jsons = []

# handle documentation json
with open(os.path.join(doc_dir, "lightning_docs_cleaned.json"), 'r') as f:
    data = json.load(f)
    doc_jsons.append(data)
    doc_links += [d["url_html"] for d in data]
    doc_docs = [f"{d['title']}\n{d['text']}" for d in data]

# handle discussions json
with open(os.path.join(doc_dir, "discussions.json"), 'r') as f:
    data = json.load(f)
    doc_links += [d["url"] for d in data]
    disc_docs = []
    for d in data:
        # `answer` may be present but null; `d.get('answer', {})` would then
        # return None and the chained `.get` would raise AttributeError.
        answer = d.get('answer') or {}
        disc_docs.append(
            f"{d.get('title','')}\n{d.get('bodyText','')}\nAnswer: {answer.get('bodyText','')}"
        )

# handle src code json
with open(os.path.join(doc_dir, "src_filtered_data.json"), 'r') as f:
    data = json.load(f)
    doc_links += [d["file"] for d in data]
    src_docs = [f"{d['text']}" for d in data]

with open("../../requests/richa_requests.json") as f:
    queries = json.load(f)

for q in queries:
    print(q)
    for d in q["relevant_docs"]:
        # Pick the corpus by substring match on the recorded file path, then
        # use the recorded index as the position within that corpus.
        if "discussions.json" in d["file"]:
            d["text"] = disc_docs[d["index"]]
        elif "docs_cleaned.json" in d["file"]:
            d["text"] = doc_docs[d["index"]]
        elif "src_filtered_data.json" in d["file"]:
            d["text"] = src_docs[d["index"]]

# Write with "w" inside a `with` block: the handle is closed (and flushed)
# deterministically, and re-running the cell replaces the file. Appending would
# concatenate a second JSON document after the first, which json.load rejects.
with open("../../requests/final_requests.json", "w") as f:
    json.dump(queries, f, indent=2)



{'query': 'Training is taking a long time. How do I speed up the training for multiple datasets?', 'query_type': 'debugging', 'relevant_docs': [{'file': '../data/final_data/discussions.json', 'index': 1, 'text': '', 'score': 7}, {'file': '../data/final_data/lightning_docs_cleaned.json', 'index': 57, 'text': '', 'score': 9}, {'file': '../data/final_data/lightning_docs_cleaned.json', 'index': 58, 'text': '', 'score': 9}, {'file': '../data/final_data/lightning_docs_cleaned.json', 'index': 59, 'text': '', 'score': 9}]}
{'query': 'What do I need to pass into LightningCLI() to get it working? Also, please give an example.', 'query_type': 'api_usage', 'relevant_docs': [{'file': '../data/final_data/lightning_docs_cleaned.json', 'index': 6, 'text': '', 'score': 9}, {'file': '../data/final_data/lightning_docs_cleaned.json', 'index': 7, 'text': '', 'score': 9}, {'file': '../data/final_data/lightning_docs_cleaned.json', 'index': 4, 'text': '', 'score': 9}, {'file': '../data/final_data/lightning_do