In [None]:
import requests
import json

# Token sent as a Bearer credential in the Authorization header. Left blank
# here; fill it in locally before running and never commit a real token.
GITHUB_TOKEN = ""
# GraphQL endpoint that every request is POSTed to.
API_URL = "https://api.github.com/graphql"
# Owner and name of the repository whose discussions are fetched.
OWNER = "Lightning-AI"
REPO = "pytorch-lightning"
# NOTE(review): not referenced by the visible code, which writes its result to
# "../final_data/discussions.json" instead -- confirm which path is intended.
OUTPUT_FILE = "../final_data/lightning_discussions_answered.json"

def get_discussions():
    """Fetch the most recent discussions of OWNER/REPO via the GitHub GraphQL API.

    Pages through the repository's discussions, newest first, 100 per request.
    Paging stops once 1000 discussions have been collected, the last page has
    been reached, or a response carries no usable discussion data (in which
    case the reason reported by the API, if any, is printed).

    Returns:
        list[dict]: Discussion nodes as returned by the API, each holding
        ``title``, ``url``, ``createdAt``, ``author``, ``bodyText``,
        ``answer`` (``None`` when there is no answer) and up to 20
        ``comments``. Pages collected before a failed page are still returned.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Owner/name travel as GraphQL variables instead of being spliced into the
    # query text, so unusual characters cannot break or alter the query.
    query = """
    query($owner: String!, $name: String!, $cursor: String) {
      repository(owner: $owner, name: $name) {
        discussions(first: 100, after: $cursor, orderBy: {field: CREATED_AT, direction: DESC}) {
          pageInfo {
            hasNextPage
            endCursor
          }
          nodes {
            title
            url
            createdAt
            author {
              login
            }
            bodyText
            answer {
              author {
                login
              }
              bodyText
            }
            comments(first: 20) {
              nodes {
                author {
                  login
                }
                bodyText
              }
            }
          }
        }
      }
    }
    """

    headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
    max_discussions = 1000

    all_discussions = []
    cursor = None

    while True:
        variables = {"owner": OWNER, "name": REPO, "cursor": cursor}
        response = requests.post(
            API_URL,
            json={"query": query, "variables": variables},
            headers=headers,
            timeout=30,  # without a timeout a stalled connection hangs forever
        )

        try:
            payload = response.json()
        except ValueError:
            # Error pages (e.g. from a gateway) are not JSON; keep what we have.
            print(f"Unexpected non-JSON response (HTTP {response.status_code}).")
            break

        # Failures are reported either as GraphQL "errors" or as a top-level
        # "message"; surface them instead of hiding them behind the generic
        # message below.
        problems = payload.get("errors") or payload.get("message")
        if problems:
            print(f"GitHub API reported a problem: {problems}")

        # On errors "data" / "repository" can be present but null, in which
        # case `.get(key, {})` returns None; `or {}` covers both situations.
        repository = (payload.get("data") or {}).get("repository") or {}
        discussions_data = repository.get("discussions") or {}

        if not discussions_data:
            print("No discussions found or API limit reached.")
            break

        # Drop null entries so callers can index every node safely.
        nodes = [
            node for node in (discussions_data.get("nodes") or [])
            if node is not None
        ]
        all_discussions.extend(nodes)

        if len(all_discussions) >= max_discussions:
            break

        page_info = discussions_data.get("pageInfo") or {}
        if not page_info.get("hasNextPage"):
            break

        cursor = page_info.get("endCursor")

    return all_discussions

# Keep only discussions that carry an answer AND have at least one comment
# that reads like a confirmation that the solution worked.
data = get_discussions()

# Heuristic phrases, matched case-insensitively as substrings of a comment.
good_keywords = ["worked", "thank you", "good answer", "fixed", "solved"]

# Single source for the path that is both written to and recorded in every
# entry. NOTE: the module-level OUTPUT_FILE constant names a different file and
# is not used here; this path is kept so the output location does not change.
output_path = "../final_data/discussions.json"

answered = []
ctr = 0
for discussion in data:
    is_answered = discussion.get("answer") is not None

    # Tolerate a missing/null comment connection as well as null entries or
    # null bodies inside it.
    comments = (discussion.get("comments") or {}).get("nodes") or []
    comment_texts = [
        (c.get("bodyText") or "").lower() for c in comments if c is not None
    ]
    has_positive_reply = any(
        keyword in text for text in comment_texts for keyword in good_keywords
    )

    if is_answered and has_positive_reply:
        discussion["label"] = "discussion"
        discussion["file"] = output_path
        discussion["index"] = ctr
        ctr += 1
        # Comments were only needed for the heuristic; keep the output small.
        discussion.pop("comments", None)
        answered.append(discussion)


# --- Keep only the most recent 50 ---
# `data` arrives newest first, so the first 50 matches are the most recent.
answered = answered[:50]

# --- Save to JSON ---
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(answered, f, indent=2)

print(len(answered))


21
