In [1]:
import pandas as pd
from data_manager import Document
import pickle
from bs4 import BeautifulSoup

# Dataset

The dataset, which needs only slight preprocessing before use, was downloaded from https://www.kaggle.com/datasets/stackoverflow/pythonquestions. It is a collection of Stack Overflow questions and answers carrying the Python tag, posted between 2008 and 2016. The initial dataset contains over 600,000 examples.


In [3]:
def _read_latin1_csv(path: str) -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame, decoding it as latin-1."""
    return pd.read_csv(path, encoding="latin-1")


# The three tables of the dataset: questions, their answers, and their tags.
questions_df = _read_latin1_csv("data/Questions.csv")
answers_df = _read_latin1_csv("data/Answers.csv")
tags_df = _read_latin1_csv("data/Tags.csv")

In [4]:
# Group the answer rows by their 'ParentId' value:
#   answers[parent_id] -> list of answer rows (pandas Series), in file order.
answers = {}
for _, answer_row in answers_df.iterrows():
    # setdefault creates the list the first time a ParentId is seen.
    answers.setdefault(answer_row['ParentId'], []).append(answer_row)

In [5]:
# Group the tag rows by their 'Id' value:
#   tags[id] -> list of tag rows (pandas Series), in file order.
tags = {}
for _, tag_row in tags_df.iterrows():
    # setdefault creates the list the first time an Id is seen.
    tags.setdefault(tag_row['Id'], []).append(tag_row)

In [7]:
def _html_to_text(html: str) -> str:
    """Return the text content of an HTML fragment with all markup removed."""
    return BeautifulSoup(html, features='lxml').text


# Build one Document per question. The document text is made of, in order:
# the title, the question body, all answer bodies and the question's tags,
# with HTML markup stripped from the bodies.
docs: list[Document] = []

for _, row in questions_df.iterrows():
    p_id = row['Id']
    sections = [row['Title'], _html_to_text(row['Body'])]

    question_answers = answers.get(p_id)
    if question_answers:
        # Concatenate the answers first so the HTML parser runs once per question.
        answers_html = "\n".join(ans['Body'] for ans in question_answers)
        sections.append(_html_to_text(answers_html))

    question_tags = tags.get(p_id)
    if question_tags:
        # str() keeps the join safe if a 'Tag' value is not a string
        # (presumably a missing tag read as NaN -- verify against the data).
        sections.append("\n".join(str(tag['Tag']) for tag in question_tags))

    # Join the sections with an explicit separator so that the last word of one
    # section is never fused with the first word of the next (previously the
    # answers/tags were appended directly to the question body).
    txt = " ".join(sections)
    docs.append(Document(row['Title'], p_id, txt))


In [8]:
# Persist the documents as a stream of consecutive pickle records (one
# pickle.dump call per document) rather than as a single pickled list, so a
# reader has to call pickle.load repeatedly until the end of the file.
with open("data/RawDocuments.pickle", "wb") as pickle_file:
    for document in docs:
        pickle.dump(document, pickle_file)