# Supplementary code for the BERTopic workflow

## 20 news groups preprocessing

We will use the [`20newsgroups` dataset from scikit-learn](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset). The code below fetches the data, cleans it (strips leading and trailing whitespace and removes texts that are empty afterwards), and reformats it into a data frame so that it is easier for BERTopic to work with.

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [None]:
def fetch_and_clean_20newsgroups(categories):
    """Fetch selected 20 newsgroups categories and return them as a cleaned data frame.

    The documents are fetched with ``fetch_20newsgroups`` (headers, footers and
    quoted replies removed), surrounding whitespace is stripped from each text,
    and rows whose stripped text is empty are dropped. The number of empty
    texts and the final row/column counts are printed along the way.

    Args:
        categories: newsgroup names passed straight through to
            ``fetch_20newsgroups(categories=...)``.

    Returns:
        A pandas DataFrame with the columns ``original_text``,
        ``cleaned_text``, ``labels`` (numeric target) and ``labels_text``
        (newsgroup name). The row index is not reset after filtering.
    """
    # sklearn returns a dictionary-like "bunch"; the metadata we don't need
    # (headers, footers, quotes) is removed at fetch time
    newsgroups = fetch_20newsgroups(
        categories=categories,
        remove=("headers", "footers", "quotes"),
    )

    # numeric target -> newsgroup name
    name_by_target = dict(enumerate(newsgroups["target_names"]))

    # one row per document: raw text, numeric label, readable label, stripped text
    df = pd.DataFrame({
        "original_text": newsgroups["data"],
        "labels": newsgroups["target"],
    })
    df["labels_text"] = df["labels"].map(name_by_target)
    df["cleaned_text"] = df["original_text"].str.strip()

    # flag documents that contain nothing once whitespace is stripped
    empty_text_bool = df["cleaned_text"].str.len().eq(0)
    print(f"Number of empty texts: {empty_text_bool.sum()}")

    # keep only the non-empty documents
    df = df.loc[~empty_text_bool]
    print(f"Final dimension of dataset: {df.shape[0]}, {df.shape[1]}")

    # hand the columns back in a fixed, readable order
    return df[["original_text", "cleaned_text", "labels", "labels_text"]]

In [None]:
# the eight newsgroups kept for the workflow
newsgroup_categories = [
    "comp.graphics",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.electronics",
    "sci.med",
    "sci.space",
]

# fetch, clean and reformat the selected newsgroups
df = fetch_and_clean_20newsgroups(newsgroup_categories)

# preview the first rows
df.head()

In [None]:
# save the data to a file
from pathlib import Path

output_path = Path("data/sklearn_20newsgroups_cleaned.csv")

# to_csv does not create missing directories and would raise an OSError,
# so make sure the output folder exists before writing
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file
df.to_csv(output_path, index=False)