In [None]:
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from html import unescape

# Note: the data can be downloaded from https://archive.org/download/stackexchange
# Relative path to the Posts.xml file of the cstheory.stackexchange.com dump.
xml_file = 'cstheory.stackexchange.com/Posts.xml'

### Class definitions

In [None]:
class Question:
    """A question post together with the answers collected for it.

    Attributes:
        post_id: Id of the question post.
        accepted_answer_id: Id of the accepted answer, as given by the caller.
        answer_count: Number of answers declared for the question.
        score: Score of the question.
        title: Title of the question.
        body: Body text of the question.
        answers: Answers attached to the question; starts empty and is
            filled by the caller.
    """

    def __init__(self, post_id: int, accepted_answer_id: int, answer_count: int, score: int, title: str, body: str):
        """Store the question fields and start with an empty answer list."""
        self.post_id = post_id
        self.accepted_answer_id = accepted_answer_id
        self.answer_count = answer_count
        self.score = score
        self.title = title
        self.body = body
        self.answers = []

    def __str__(self):
        """Render the question header, statistics, title and body, followed by each answer."""
        heading = (
            "#####################\n"
            f"Question {self.post_id}\n"
            "#####################\n"
        )
        summary = (
            f"accepted_answer_id: {self.accepted_answer_id}, answer_count: {self.answer_count}, "
            f"found_answers: {len(self.answers)} score: {self.score}\n\n"
        )
        title_section = f"TITLE:\n######\n{self.title}\n"
        body_section = f"BODY:\n#####\n{self.body}\n"
        # Answers are rendered through their own __str__, one after another.
        answers_section = "\n".join(map(str, self.answers))
        return heading + summary + title_section + body_section + answers_section
    

class Answer:
    """An answer post attached to a question.

    Attributes:
        post_id: Id of the answer post.
        parent_id: Id of the question this answer belongs to.
        score: Score of the answer.
        body: Body text of the answer.
    """

    # `body` holds text, so it is annotated as `str` (it was mistakenly annotated as `int`).
    def __init__(self, post_id: int, parent_id: int, score: int, body: str):
        """Store the answer fields."""
        self.post_id = post_id
        self.parent_id = parent_id
        self.score = score
        self.body = body

    def __str__(self):
        """Render a short header, the ids and score, then the answer body."""
        return (
            f"Answer {self.post_id}\n"
            f"#############\n"
            f"post_id: {self.post_id}, parent_id: {self.parent_id}, score: {self.score}\n\n"
            f"{self.body}"
        )

    

### Parse the XML file and build the dictionary of questions

In [None]:
# Dictionary to store the questions. The key is the post id and the value is the question associated with this id
questions: dict[int, Question] = {}


def _html_to_text(html_body: str) -> str:
    """Return the text content of an HTML fragment with the markup removed."""
    # The XML parser has already decoded the XML-level entities of the attribute, so
    # `html_body` is plain HTML. It must not be unescaped a second time: doing so turns
    # escaped literals such as "a&lt;b" into real markup ("a<b"), which the HTML parser
    # may then treat as a tag and drop from the text. BeautifulSoup decodes the HTML
    # entities itself when extracting the text.
    return BeautifulSoup(html_body, 'html.parser').get_text()


# Iterate on all the rows of the xml file (path taken from `xml_file` instead of a second hard-coded copy)
tree = ET.parse(xml_file)
root = tree.getroot()
for child in root:
    post = child.attrib

    # If the post is a question
    if post['PostTypeId'] == '1':
        question = Question(
            post_id=int(post['Id']),
            # -1 marks a question without an accepted answer
            accepted_answer_id=int(post.get('AcceptedAnswerId', '-1')),
            answer_count=int(post['AnswerCount']),
            score=int(post['Score']),
            title=post['Title'],
            body=_html_to_text(post['Body']),
        )

        assert question.post_id not in questions, f"duplicate question id: {question.post_id}"
        questions[question.post_id] = question

    # If the post is an answer
    elif post['PostTypeId'] == '2':
        answer = Answer(
            post_id=int(post['Id']),
            parent_id=int(post['ParentId']),
            score=int(post['Score']),
            body=_html_to_text(post['Body']),
        )

        # The parent question must already have been read when its answer is reached
        assert answer.parent_id in questions, (
            f"answer {answer.post_id} refers to unknown question {answer.parent_id}"
        )
        questions[answer.parent_id].answers.append(answer)
    

In [None]:
def data_check(questions: dict[int, Question]):
    """Run consistency checks on the parsed questions.

    Hard invariants are enforced with assertions: every dictionary key equals
    the post id of its question, titles and bodies are non-empty, and every
    answer points back to the question that holds it. Softer mismatches (a
    declared answer count that differs from the number of answers found, or an
    accepted answer id that is neither -1 nor the id of a found answer) are
    only reported with a printed warning.

    Args:
        questions: Mapping from question id to the corresponding question.

    Raises:
        AssertionError: If one of the hard invariants does not hold.
    """
    for key, entry in questions.items():
        assert entry.post_id == key
        assert entry.title != ''
        assert entry.body != ''
        for reply in entry.answers:
            assert reply.parent_id == key
            assert reply.body != ''

        found_ids = [reply.post_id for reply in entry.answers]

        if entry.answer_count != len(entry.answers):
            count_warning = (
                f"Warning: answer_count does not match the number of answers found: "
                f"q_id: {key}, answer_count: {entry.answer_count}, found_answers: {len(entry.answers)}"
            )
            print(count_warning)

        # -1 is accepted as-is; any other value has to be the id of a found answer
        accepted = entry.accepted_answer_id
        if not (accepted == -1 or accepted in found_ids):
            accepted_warning = (
                f"Warning: accepted_answer_id is not a valid answer id: "
                f"q_id: {key}, accepted_answer_id: {accepted}, "
                f"answer_count: {entry.answer_count}, found_answers: {found_ids}"
            )
            print(accepted_warning)
        

# Validate the parsed questions: broken invariants raise, softer mismatches are printed as warnings.
data_check(questions)