In [None]:
import re

import pandas as pd
from datasets import concatenate_datasets, load_dataset

Reading the raw dataset.

In [None]:
# The revision hash pins the dataset to one exact snapshot.
ds = load_dataset("open-phi/textbooks", revision="292aaae99cbecacad50f692d7327887f05dacaf2")

# Combine everything load_dataset returned into one dataset, then convert it to pandas.
merged = concatenate_datasets(ds.values())
df = merged.to_pandas()
df

Dropping useless columns.

In [None]:
# Columns that are not used anywhere in the remaining preprocessing steps.
unused_columns = ["topic", "model", "concepts", "outline", "subfield", "rag"]
df = df.drop(columns=unused_columns)

Renaming columns.

In [None]:
# Old column name -> new column name.
column_renames = {"markdown": "body"}
df = df.rename(columns=column_renames)

Removing the Markdown heading syntax.

In [None]:
# Strip only Markdown (ATX) heading markers: 1-6 '#' characters at the start of
# a line, followed by spaces/tabs. A '#' inside the running text (e.g. "C#" or
# "#include") is preserved. [ \t] is used instead of \s so that the blank lines
# separating paragraphs are never consumed by the match.
# regex=True is passed explicitly because the default differs between pandas versions.
df["body"] = df["body"].str.replace(
    r"^[ \t]{0,3}#{1,6}[ \t]+", "", regex=True, flags=re.MULTILINE
)

Dropping rows whose field is potentially political (ambiguous).

In [None]:
# Distinct values of the "field" column.
field_values = df["field"].unique()
field_values

In [None]:
# Fields whose rows are removed from the dataset.
ambiguous_fields = [
    "sociology",
    "economics",
    "media_studies",
    "anthropology",
    "history",
    "public_administration",
]

# Boolean mask of the rows to discard; its negation selects the rows to keep.
is_ambiguous = df["field"].isin(ambiguous_fields)
df = df[~is_ambiguous]

Splitting the bodies into paragraphs.

In [None]:
# Number of consecutive paragraphs merged into one output row.
body_paragraph_count = 10

rows = []
# Iterate over just the two needed columns; this avoids the per-row Series that
# iterrows() constructs.
for text, field in zip(df["body"], df["field"]):
    # A missing body (None/NaN) has nothing to split; skip it instead of
    # raising AttributeError on .split().
    if not isinstance(text, str):
        continue
    paragraphs = text.split("\n\n")
    for i in range(0, len(paragraphs), body_paragraph_count):
        # Join the chunk and collapse every whitespace run into a single space.
        body = re.sub(r"\s+", " ", " ".join(paragraphs[i:i + body_paragraph_count])).strip()
        # Empty or whitespace-only chunks carry no text, so no row is emitted for them.
        if body:
            rows.append({"body": body, "field": field})

# Explicit columns keep the schema intact even when no rows were produced.
df = pd.DataFrame(rows, columns=["body", "field"])
df

Dropping rows with missing data.

In [None]:
# Remove every row in which at least one column is missing.
df = df.dropna(axis="index", how="any")

Adding the politicalness label column.

In [None]:
# Fields whose rows are labelled "political"; every other field is "non-political".
political_fields = [
    "political_science",
]

is_political = df["field"].isin(political_fields)
labels = ["political" if flag else "non-political" for flag in is_political]
df["politicalness"] = pd.Categorical(labels)

Dropping useless columns.

In [None]:
# Remove the "field" column from the frame.
df = df.drop(columns="field")

Printing body duplicates.

In [None]:
# keep=False flags every member of a duplicate group, not only the repeats.
duplicated_bodies = df["body"].duplicated(keep=False)
df[duplicated_bodies]

Dropping the body duplicates.

In [None]:
# Keep the first occurrence of each body and discard the later repeats.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting the body length.

In [None]:
# Add the character count and the whitespace-separated word count of each body,
# then order the rows from the shortest body to the longest.
df = df.assign(
    body_length=df["body"].str.len(),
    body_word_count=df["body"].str.split().str.len(),
).sort_values(by="body_length")
df.head()

In [None]:
lengths = df["body_length"]

# Take every 100th position and make sure the last (longest) item is included.
# Positions are selected explicitly rather than concatenating and calling
# drop_duplicates(): on a Series that deduplicates by *value*, which would also
# discard sampled points that merely share the same body length.
positions = list(range(0, len(lengths), 100))
if len(lengths) and positions[-1] != len(lengths) - 1:
    positions.append(len(lengths) - 1)

downsampled = lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Mean of the "body_length" column.
mean_body_length = df["body_length"].mean()
mean_body_length

In [None]:
# Write the preprocessed frame to disk in Parquet format.
output_path = "../preprocessed/textbooks.parquet"
df.to_parquet(output_path)