In [None]:
import numpy as np
import pandas as pd
from datasets import concatenate_datasets, load_dataset


Reading the raw dataset.

In [None]:
# Load the dataset at a fixed revision hash so reruns read the same snapshot.
ds = load_dataset(
    "pszemraj/goodreads-bookgenres",
    "default",
    revision="c4b00cd5b71cfb62687ddbd0e9c1c9d6a06e8d80",
)
# `concatenate_datasets` is documented to take a list of datasets, so the
# values of `ds` are materialised into a real list instead of passing the
# dict view directly.
df = concatenate_datasets(list(ds.values())).to_pandas()
df


Dropping the `Book` column, which is not used in the rest of the preprocessing.

In [None]:
# Remove the "Book" column; the remaining steps never read it.
df = df.drop("Book", axis="columns")


Renaming columns.

In [None]:
# Switch to the lowercase column names used by the following cells.
df = df.rename({"Description": "body", "Genres": "genres"}, axis="columns")


Stripping surrounding whitespace from the bodies and marking empty bodies as missing.

In [None]:
# Trim surrounding whitespace, then turn bodies that end up empty into NaN
# so they count as missing values.
df = df.assign(body=df["body"].str.strip().replace("", np.nan))


Dropping rows with missing data.

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")


Dropping rows with potentially politically ambiguous genres.

In [None]:
# Genre names in the positional order used to index each row's `genres`
# vector (assumed to match the dataset's label order — TODO confirm).
genres = [
    "History & Politics",
    "Health & Medicine",
    "Mystery & Thriller",
    "Arts & Design",
    "Self-Help & Wellness",
    "Sports & Recreation",
    "Non-Fiction",
    "Science Fiction & Fantasy",
    "Countries & Geography",
    "Other",
    "Nature & Environment",
    "Business & Finance",
    "Romance",
    "Philosophy & Religion",
    "Literature & Fiction",
    "Science & Technology",
    "Children & Young Adult",
    "Food & Cooking",
]

# Positions of the genres treated as politically ambiguous.
ambiguous_genre_indexes = list(map(genres.index, ("History & Politics", "Other")))

# Keep a row only when none of its ambiguous genre flags equals 1.
df = df[
    df["genres"].apply(
        lambda flags: all(flags[position] != 1 for position in ambiguous_genre_indexes)
    )
]


Dropping the `genres` column, which is no longer needed after the filtering.

In [None]:
# The genre vectors were only needed for the filter, so remove them.
df = df.drop("genres", axis="columns")


Printing body duplicates.

In [None]:
# Show every row whose body occurs more than once (all occurrences kept).
df.loc[df.duplicated(subset="body", keep=False)]


Dropping the body duplicates.

In [None]:
# Keep only the first row for each distinct body.
df = df[~df["body"].duplicated(keep="first")]


Adding the politicalness label column.

In [None]:
# Label every remaining row as "non-political" in a categorical column.
df = df.assign(
    politicalness=pd.Categorical(["non-political" for _ in range(len(df))])
)


Inspecting the body length.

In [None]:
# Add character and whitespace-separated word counts for each body, then
# order the rows from shortest to longest body.
df = df.assign(
    body_length=lambda frame: frame["body"].fillna("").str.len(),
    body_word_count=lambda frame: frame["body"].fillna("").str.split().str.len(),
).sort_values("body_length")
df.head()


In [None]:
# Write the preprocessed frame to a Parquet file.
df.to_parquet(path="../preprocessed/goodreads_book_genres.parquet")
