## StackExchange family

Website: https://archive.org/download/stackexchange/


## Setup


In [23]:
from datasets import Dataset, DatasetDict

from typing import Literal, Iterable
import time
import os
from pathlib import Path

from pydantic import BaseModel

import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup, Tag

from xml.etree.ElementTree import Element

from html2text import HTML2Text


h2t = HTML2Text()
h2t.mark_code = True

In [4]:
def html_to_md(html: str):
    return h2t.handle(html).strip()


POST_TYPE = Literal[
    "question",
    "answer",
    "wiki",
    "tag_wiki_excerpt",
    "tag_wiki",
    "moderator_nomination",
    "wiki_placeholder",
]

POST_TYPE_MAP: dict[int, POST_TYPE] = {
    1: "question",
    2: "answer",
    # 3: "Wiki",
    4: "tag_wiki_excerpt",
    5: "tag_wiki",
    6: "moderator_nomination",
    7: "wiki_placeholder",
}


class Post(BaseModel):
    id: str
    post_type: POST_TYPE
    creation_date: str
    last_edit_date: str | None
    last_activity_date: str
    owner_user_id: str | None
    last_editor_user_id: str | None

    score: int
    comment_count: int
    content_license: str
    body: str


class Question(Post):
    title: str
    accepted_answer_id: str | None
    answer_count: int
    view_count: int
    tags: list[str]
    favorite_count: int


class Answer(Post):
    parent_id: str


class TagWikiExcerpt(Post):
    pass


class TagWiki(Post):
    pass


def parse_common(el: Element) -> Post:
    id = el.attrib["Id"]
    post_type = POST_TYPE_MAP[int(el.attrib["PostTypeId"])]
    creation_date = el.attrib["CreationDate"]
    last_edit_date = el.attrib.get("LastEditDate")
    last_activity_date = el.attrib["LastActivityDate"]
    owner_user_id = el.attrib.get("OwnerUserId")
    last_editor_user_id = el.attrib.get("LastEditorUserId")
    score = int(el.attrib["Score"])
    comment_count = int(el.attrib["CommentCount"])
    content_license = el.attrib["ContentLicense"]
    body = html_to_md(el.attrib["Body"])

    return Post(
        id=id,
        post_type=post_type,
        creation_date=creation_date,
        last_edit_date=last_edit_date,
        last_activity_date=last_activity_date,
        owner_user_id=owner_user_id,
        last_editor_user_id=last_editor_user_id,
        score=score,
        comment_count=comment_count,
        content_license=content_license,
        body=body,
    )


def parse_question(el: Element) -> Question:
    common = parse_common(el)
    title = el.attrib["Title"]
    accepted_answer_id = el.attrib.get("AcceptedAnswerId")
    answer_count = int(el.attrib["AnswerCount"])
    view_count = int(el.attrib["ViewCount"])
    tags = el.attrib["Tags"].strip("<>").split("><")
    favorite_count = int(el.attrib.get("FavoriteCount") or 0)

    return Question(
        **common.model_dump(),
        title=title,
        accepted_answer_id=accepted_answer_id,
        answer_count=answer_count,
        view_count=view_count,
        tags=tags,
        favorite_count=favorite_count,
    )


def parse_answer(el: Element) -> Answer:
    common = parse_common(el)
    parent_id = el.attrib["ParentId"]

    return Answer(
        **common.model_dump(),
        parent_id=parent_id,
    )


def parse_post(el: Element) -> Post | None:
    post_type_id = int(el.attrib["PostTypeId"])

    post_type = POST_TYPE_MAP[post_type_id]

    if post_type == "question":
        return parse_question(el)
    elif post_type == "answer":
        return parse_answer(el)
    elif post_type == "tag_wiki":
        return TagWiki(**parse_common(el).model_dump())
    elif post_type == "tag_wiki_excerpt":
        return TagWikiExcerpt(**parse_common(el).model_dump())
    else:
        return None


def parse_posts(el: Element) -> list[Post]:
    posts = []
    for child in el:
        post = parse_post(child)
        if post:
            posts.append(post)
    return posts

In [5]:
SORT_ORDER = Literal["newest", "active", "votes", "frequent", "bounties"]


class StackExchange(BaseModel):
    directory: str = "./"
    site: str

    def posts(self):
        xml = f"{self.directory}/{self.site}/Posts.xml"
        root = get_xml_root(xml)
        if root.tag != "posts":
            raise Exception(f"Invalid xml: {xml}")
        posts = parse_posts(root)
        return posts


def get_xml_root(path: str):
    return ET.parse(path).getroot()


def soup_to_md(soup: BeautifulSoup | Tag):
    return h2t.handle(
        str(
            soup,
        )
    )


def html_to_md(html: str):
    return h2t.handle(html)


class QA(BaseModel):
    question: Question
    answers: list[Answer]
    accepted_answer: Answer | None = None
    max_score_answer: Answer | None = None


class BestQA(BaseModel):
    id: str
    question: Question
    question_score: int
    answer: Answer
    answer_score: int
    tags: list[str]


def generate_qa_pair(posts: list[Post]) -> list[QA]:
    qa_dict: dict[str, QA] = {}

    questions = [post for post in posts if isinstance(post, Question)]

    for question in questions:
        qa_dict[question.id] = QA(question=question, answers=[])

    answers = [post for post in posts if isinstance(post, Answer)]

    for answer in answers:
        qa_dict[answer.parent_id].answers.append(answer)

    return list(qa_dict.values())


class PostScore(BaseModel):
    score: int = 0
    accepted: bool = False


def generate_row(qa_list: list[BestQA]) -> Iterable[dict]:
    for qa in qa_list:
        yield {
            "id": qa.id,
            "title": qa.question.title,
            "question": qa.question.body,
            "answer": qa.answer.body,
            "question_score": qa.question_score,
            "answer_score": qa.answer_score,
            "tags": qa.tags,
        }

In [1]:
ALL_STACKOVERFLOW = [
    f"{lang}.stackoverflow.com"
    for lang in [
        "es",
        "ja",
        "pt",
        "ru",
    ]
]

In [6]:
ALL_SPECIAL_SITES = [
    "askubuntu.com",
    "mathoverflow.net",
    "serverfault.com",
    "stackapps.com",
    "stackapps.com",
    "superuser.com",
]

In [7]:
ALL_STACKEXCHANGE = [
    f"{name}.stackexchange.com"
    for name in [
        "3dprinting",
        "academia",
        "ai",
        "android",
        "anime",
        "apple",
        "arduino",
        "astronomy",
        "aviation",
        "avp",
        "beer",
        "bicycles",
        "bioacoustics",
        "bioinformatics",
        "biology",
        "bitcoin",
        "blender",
        "boardgames",
        "bricks",
        "buddhism",
        "cardano",
        "chemistry",
        "chess",
        "chinese",
        "christianity" "civicrm",
        "codegolf",
        "codereview",
        "coffee",
        "cogsci",
        "computergraphics",
        "conlang",
        "cooking",
        "craftcms",
        "crafts",
        "crypto",
        "cs",
        "cseducators",
        "cstheory",
        "datascience",
        "dba",
        "devops",
        "diy",
        "drones",
        "drupal",
        "dsp",
        "earthscience",
        "ebooks",
        "ecnomics",
        "electronics",
        "elementaryos",
        "ell",
        "emacs",
        "engineering",
        "english",
        "eosio",
        "esperanto",
        "ethereum",
        "expatriates",
        "expressionengine",
        "fitness",
        "freelacing",
        "french",
        "gamedev",
        "gaming",
        "gardening",
        "genai",
        "genealogy",
        "german",
        "gis",
        "graphicdesign",
        "ham",
        "hardwarerecs",
        "health",
        "hermeneutics",
        "hinduism",
        "history",
        "homebrew",
        "hsm",
        "interpersonal",
        "iot",
        "iota",
        "islam",
        "italian",
        "japanese",
        "joomla",
        "judaism",
        "korean",
        "langdev",
        "languagelearning",
        "latin",
        "law",
        "lifehacks",
        "linguistics",
        "literature",
        "megento",
        "martialarts",
        "materials",
        "math",
        "matheducators",
        "mathematica",
        "mechanics",
        "moderators",
        "monero",
        "money",
        "movies",
        "music",
        "musicfans",
        "mythology",
        "networkengineering",
        "opendata",
        "opensource",
        "or",
        "outdoors",
        "parenting",
        "patents",
        "pets",
        "philosophy",
        "photo",
        "physics",
        "pm",
        "poker",
        "politics",
        "portuguese",
        "proofassistants",
        "puzzling",
        "quant",
        "quantumcomputing",
        "raspberrypi",
        "retrocomputing",
        "reverseengineering",
        "robotics",
        "rpg",
        "rus",
        "russian",
        "salesforce",
        "scicomp",
        "scifi",
        "security",
        "sharepoint",
        "sitecore",
        "skeptics",
        "softwareengineering",
        "softwarerecs",
        "solana",
        "sound",
        "space",
        "spanish",
        "sports",
        "sqa",
        "stats",
        "stellar",
        "substrate",
        "sustainability",
        "tex",
        "tezos",
        "tor",
        "travel",
        "tridion",
        "ukrainian",
        "unix",
        "ux",
        "vegetarianism",
        "vi",
        "webapps",
        "webmasters",
        "windowsphone",
        "woodworking",
        "wordpress",
        "workplace",
        "worldbuilding",
        "writers",
    ]
]

In [46]:
def convert_to_posts_xml_link(domain: str):
    if domain == "stackoverflow.com":
        raise Exception("currentry not supported")
    return f"https://archive.org/download/stackexchange/{domain}.7z/Posts.xml"

## Download archives


In [12]:
# NOTE: Please change below domain as you need
TARGET_DOMAINS = [
    "anime.stackexchange.com",
    "japanese.stackexchange.com",
    "ja.stackoverflow.com",
]

In [16]:
def download_archives(domains: list[str]):
    client = requests.Session()

    urls = [convert_to_posts_xml_link(domain) for domain in domains]

    for url, domain in zip(urls, domains):
        print(f"Downloading {domain}...")

        save_dir = Path("./") / domain
        save_dir.mkdir(exist_ok=True)

        res = client.get(url)

        with open(save_dir / "Posts.xml", "wb") as file:
            file.write(res.content)

In [17]:
download_archives(TARGET_DOMAINS)
print("Done!")

Downloading anime.stackexchange.com...
Downloading japanese.stackexchange.com...
Downloading ja.stackoverflow.com...
Done!


## Load archives


In [18]:
def load_archive(domain: str):
    return StackExchange(site=domain).posts()

In [19]:
def create_ds(posts: list[Post]):
    qa = generate_qa_pair(posts)
    qa_dict = {
        "question": [],
        "answers": [],
    }

    for pair in qa:
        qa_dict["question"].append(pair.question.model_dump())
        qa_dict["answers"].append([answer.model_dump() for answer in pair.answers])

    return Dataset.from_dict(qa_dict)

In [37]:
def format_row(examples):
    examples["id"] = []
    examples["accepted_answer_id"] = []
    examples["popular_answer_id"] = []

    for i, question in enumerate(examples["question"]):
        examples["id"].append(question["id"])
        examples["accepted_answer_id"].append(question["accepted_answer_id"])

        answers = examples["answers"][i]
        popular_answer_id = None
        popular_answer_score = -9999
        for answer in answers:
            if answer["score"] > popular_answer_score:
                popular_answer_id = answer["id"]
                popular_answer_score = answer["score"]
        examples["popular_answer_id"].append(popular_answer_id)

    return examples

In [41]:
def simplify_row(examples):
    new_examples = {
        "id": [],
        "title": [],
        "question_body": [],
        "question_score": [],
        "accepted_answer_body": [],
        "accepted_answer_score": [],
        "popular_answer_body": [],
        "popular_answer_score": [],
        "tags": [],
    }

    for i, question in enumerate(examples["question"]):
        new_examples["id"].append(question["id"])
        new_examples["title"].append(question["title"])
        new_examples["question_body"].append(question["body"])
        new_examples["question_score"].append(question["score"])
        answers = examples["answers"][i]

        accepted_answer_not_found = True
        if examples["accepted_answer_id"][i] is not None:
            for answer in answers:
                if answer["id"] == examples["accepted_answer_id"][i]:
                    new_examples["accepted_answer_body"].append(answer["body"])
                    new_examples["accepted_answer_score"].append(answer["score"])
                    accepted_answer_not_found = False
                    break
        if accepted_answer_not_found:
            new_examples["accepted_answer_body"].append(None)
            new_examples["accepted_answer_score"].append(None)

        popular_answer_not_found = True
        if examples["popular_answer_id"][i] is not None:
            for answer in answers:
                if answer["id"] == examples["popular_answer_id"][i]:
                    new_examples["popular_answer_body"].append(answer["body"])
                    new_examples["popular_answer_score"].append(answer["score"])
                    popular_answer_not_found = False
                    break

        if popular_answer_not_found:
            new_examples["popular_answer_body"].append(None)
            new_examples["popular_answer_score"].append(None)

        new_examples["tags"].append(question["tags"])

    return new_examples


def simplify_ds(ds: Dataset):
    return ds.map(
        simplify_row,
        batched=True,
        batch_size=1000,
        remove_columns=["question", "answers"],
    )

In [34]:
archives = {}
for domain in TARGET_DOMAINS:
    archives[domain] = load_archive(domain)
archives

{'anime.stackexchange.com': [Question(id='1', post_type='question', creation_date='2012-12-11T20:37:08.823', last_edit_date='2015-04-17T19:06:38.957', last_activity_date='2022-05-12T10:37:24.403', owner_user_id='21', last_editor_user_id='1398', score=83, comment_count=0, content_license='CC BY-SA 3.0', body="Assuming the world in the One Piece universe is round, then there is not\nreally a beginning or an end of the Grand Line.\n\nThe Straw Hats started out from the first half and are now sailing across the\nsecond half.\n\nWouldn't it have been quicker to set sail in the opposite direction from where\nthey started?\n\n", title="The treasure in One Piece is at the end of the Grand Line. But isn't that the same as the beginning?", accepted_answer_id='8', answer_count=6, view_count=98252, tags=['one-piece'], favorite_count=0),
  Question(id='2', post_type='question', creation_date='2012-12-11T20:39:40.780', last_edit_date='2013-02-26T17:02:31.570', last_activity_date='2013-06-20T03:31:39

In [38]:
ds = {}
for domain, posts in archives.items():
    ds[domain] = create_ds(posts).map(format_row, batched=True, batch_size=1000)
ds = DatasetDict(ds)
ds

Map:   0%|          | 0/12318 [00:00<?, ? examples/s]

Map:   0%|          | 0/28850 [00:00<?, ? examples/s]

Map:   0%|          | 0/30820 [00:00<?, ? examples/s]

DatasetDict({
    anime.stackexchange.com: Dataset({
        features: ['question', 'answers', 'id', 'accepted_answer_id', 'popular_answer_id'],
        num_rows: 12318
    })
    japanese.stackexchange.com: Dataset({
        features: ['question', 'answers', 'id', 'accepted_answer_id', 'popular_answer_id'],
        num_rows: 28850
    })
    ja.stackoverflow.com: Dataset({
        features: ['question', 'answers', 'id', 'accepted_answer_id', 'popular_answer_id'],
        num_rows: 30820
    })
})

## Create simplified version


In [42]:
simple_ds = ds.copy()
for domain in simple_ds:
    simple_ds[domain] = simplify_ds(simple_ds[domain])
simple_ds

Map:   0%|          | 0/12318 [00:00<?, ? examples/s]

Map:   0%|          | 0/28850 [00:00<?, ? examples/s]

Map:   0%|          | 0/30820 [00:00<?, ? examples/s]

{'anime.stackexchange.com': Dataset({
     features: ['id', 'accepted_answer_id', 'popular_answer_id', 'title', 'question_body', 'question_score', 'accepted_answer_body', 'accepted_answer_score', 'popular_answer_body', 'popular_answer_score', 'tags'],
     num_rows: 12318
 }),
 'japanese.stackexchange.com': Dataset({
     features: ['id', 'accepted_answer_id', 'popular_answer_id', 'title', 'question_body', 'question_score', 'accepted_answer_body', 'accepted_answer_score', 'popular_answer_body', 'popular_answer_score', 'tags'],
     num_rows: 28850
 }),
 'ja.stackoverflow.com': Dataset({
     features: ['id', 'accepted_answer_id', 'popular_answer_id', 'title', 'question_body', 'question_score', 'accepted_answer_body', 'accepted_answer_score', 'popular_answer_body', 'popular_answer_score', 'tags'],
     num_rows: 30820
 })}

## Upload to huggingface


In [47]:
HF_REPO_ID = "stackexchanges"

In [48]:
for domain in ds:
    ds[domain].push_to_hub(
        HF_REPO_ID,
        config_name=domain,
        split="train",
        private=True,
    )
    simple_ds[domain].push_to_hub(
        HF_REPO_ID,
        config_name=f"{domain}_simple",
        split="train",
        private=True,
    )

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.94k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.48k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/31 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/8.01k [00:00<?, ?B/s]