In [11]:
%pip install datasets==3.5.0 ydata-profiling==4.16.1 ipywidgets==8.1.5 --q

Note: you may need to restart the kernel to use updated packages.


In [1]:
from dataclasses import dataclass
from typing import Dict, Literal
from datasets import load_dataset
from json import load
from os import makedirs
from os.path import exists
from pandas import DataFrame, concat, read_csv
from ydata_profiling import ProfileReport

In [2]:
@dataclass
class DatasetPaths:
    """Pair of file locations describing one dataset handled by this notebook."""

    # Path the cleaned CSV is written to by the matching process_* function.
    processed: str
    # Path of the source file the matching process_* function reads from.
    raw: str


# Root of every file this notebook reads or writes, with one sub-folder for
# untouched source files and one for the cleaned CSVs derived from them.
data_folder = "./data"
raw_folder = f"{data_folder}/raw"
processed_folder = f"{data_folder}/processed"

# Create both folders up front so later to_csv() calls never hit a missing directory.
makedirs(processed_folder, exist_ok=True)
makedirs(raw_folder, exist_ok=True)


# Registry of the four datasets, keyed by a short name used by the process_* functions.
dataset_paths: Dict[
    Literal["haha_test", "haha_train", "spanish_jokes", "stupid_stuff"], DatasetPaths
] = {
    "haha_test": DatasetPaths(
        processed=f"{processed_folder}/haha_2019_test.csv",
        raw=f"{raw_folder}/haha_2019_test.csv",
    ),
    "haha_train": DatasetPaths(
        processed=f"{processed_folder}/haha_2019_train.csv",
        raw=f"{raw_folder}/haha_2019_train.csv",
    ),
    "spanish_jokes": DatasetPaths(
        processed=f"{processed_folder}/spanish_jokes.csv",
        raw=f"{raw_folder}/spanish_jokes.csv",
    ),
    # The raw file for this dataset is JSON, not CSV.
    "stupid_stuff": DatasetPaths(
        processed=f"{processed_folder}/stupid_stuff.csv",
        raw=f"{raw_folder}/stupidstuff.json",
    ),
}

In [3]:
def process_haha_test():
    """Reduce the raw HAHA 2019 test CSV to its ``text`` column and save it.

    Reads the file registered under ``dataset_paths["haha_test"].raw``, keeps
    only the ``text`` column, and writes that single-column frame (with a
    header, without the index) to the matching ``processed`` path.

    Returns:
        A ``ProfileReport`` built from the single-column frame.
    """
    paths = dataset_paths["haha_test"]
    raw_frame = read_csv(paths.raw, encoding="utf-8")
    text_only = raw_frame[["text"]]
    text_only.to_csv(paths.processed, header=True, index=False)
    return ProfileReport(text_only)


# Run the step now; the ProfileReport it returns is displayed as this cell's output.
process_haha_test()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  5.48it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
def process_haha_train():
    """Turn the raw HAHA 2019 training CSV into a ``text``/``score`` table.

    Every row starts with a score of 0. For rows whose ``is_humor`` value is
    not 0, the counts in ``votes_1`` .. ``votes_5`` are inspected: if one star
    level holds strictly more than half of those votes, that star level is the
    score; otherwise the score is ``round(funniness_average)``. Scores are
    finally clipped to the range 0..5 and the table is written (without the
    index) to ``dataset_paths["haha_train"].processed``.

    Returns:
        A ``ProfileReport`` built from the scored table.
    """
    paths = dataset_paths["haha_train"]
    raw_frame = read_csv(paths.raw, encoding="utf-8")
    vote_columns = [f"votes_{stars}" for stars in range(1, 6)]

    scored = DataFrame({"text": raw_frame["text"]})
    scored["score"] = 0

    humorous_labels = raw_frame.index[raw_frame["is_humor"] != 0]
    for row_label in humorous_labels:
        star_votes = [int(raw_frame.loc[row_label, column]) for column in vote_columns]
        vote_total = sum(star_votes)
        # At most one star level can exceed half of the votes, so the first hit is the only hit.
        row_score = next(
            (
                stars
                for stars, count in enumerate(star_votes, 1)
                if count / vote_total > 0.5
            ),
            None,
        )
        if row_score is None:
            # No absolute majority: fall back to the rounded average rating.
            row_score = round(raw_frame.loc[row_label, "funniness_average"])
        scored.loc[row_label, "score"] = row_score

    scored["score"] = scored["score"].clip(0, 5)
    scored.to_csv(paths.processed, index=False)
    return ProfileReport(scored)


# Run the step now; the ProfileReport it returns is displayed as this cell's output.
process_haha_train()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  3.23it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
def process_spanish_jokes():
    """Load the Spanish jokes dataset, de-duplicate it, and save its ``text`` column.

    If the raw CSV registered under ``dataset_paths["spanish_jokes"].raw``
    already exists it is read from disk; otherwise the ``train`` split of
    ``mrm8488/CHISTES_spanish_jokes`` is fetched with ``load_dataset`` and
    saved to that path first. Rows with a repeated ``text`` are dropped
    (first occurrence kept) and the remaining ``text`` column is written,
    without the index, to the matching ``processed`` path.

    Returns:
        A ``ProfileReport`` built from the de-duplicated single-column frame.
    """
    paths = dataset_paths["spanish_jokes"]
    if exists(paths.raw):
        dataset = read_csv(paths.raw, encoding="utf-8")
    else:
        # Nothing stored locally yet: fetch the split and keep a local copy at the raw path.
        dataset = DataFrame(load_dataset("mrm8488/CHISTES_spanish_jokes")["train"])
        dataset.to_csv(paths.raw, index=False)

    # Select the column directly instead of rebuilding the frame row by row;
    # this also keeps the "text" header when the dataset is empty.
    result = (
        dataset.drop_duplicates(subset=["text"])[["text"]].reset_index(drop=True)
    )
    result.to_csv(paths.processed, index=False)
    return ProfileReport(result)


# Run the step now; the ProfileReport it returns is displayed as this cell's output.
process_spanish_jokes()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  6.58it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
def process_stupid_stuff():
    """Convert the raw stupidstuff JSON into a de-duplicated ``text``/``score`` CSV.

    Each JSON item contributes its ``body`` as ``text`` and its rounded
    ``rating`` as ``score``. Items with an empty body are discarded, as is
    every copy of a body that appears with two different ratings in a row of
    sightings. Remaining exact repeats are collapsed to their first
    occurrence, and the table is written (without the index) to
    ``dataset_paths["stupid_stuff"].processed``.

    Returns:
        A ``ProfileReport`` built from the resulting table.
    """
    # Pin the encoding so decoding does not depend on the platform's locale
    # default, matching the UTF-8 reads used by the other loaders in this notebook.
    with open(dataset_paths["stupid_stuff"].raw, "r", encoding="utf-8") as file:
        dataset = load(file)

    # Remember the most recent rating seen for each body to spot conflicting labels.
    last_rating = {}
    to_remove = {""}
    for item in dataset:
        body, rating = item["body"], item["rating"]
        if body in last_rating and last_rating[body] != rating:
            to_remove.add(body)
        last_rating[body] = rating

    # Explicit columns keep the frame well-formed (and drop_duplicates usable)
    # even when every item was filtered out.
    result = DataFrame(
        [
            {"text": item["body"], "score": round(item["rating"])}
            for item in dataset
            if item["body"] not in to_remove
        ],
        columns=["text", "score"],
    ).drop_duplicates(subset=["text"], keep="first")
    result.to_csv(dataset_paths["stupid_stuff"].processed, index=False)
    return ProfileReport(result)


# Run the step now; the ProfileReport it returns is displayed as this cell's output.
process_stupid_stuff()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  4.52it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [10]:
def process_train_test():
    """Assemble the final test and train CSVs from the processed datasets.

    The test set combines the processed ``haha_test`` and ``spanish_jokes``
    files; the train set combines ``haha_train`` and ``stupid_stuff``. In each
    combined frame, any ``text`` that occurs more than once is removed
    entirely (``keep=False``) before the frame is written, without the index,
    to ``test.csv`` / ``train.csv`` under ``data_folder``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of the two written frames.
    """

    def combine(source_keys, output_path):
        # Read each processed CSV, stack them, and drop every text that repeats.
        frames = [
            read_csv(dataset_paths[key].processed, encoding="utf-8")
            for key in source_keys
        ]
        combined = concat(frames, ignore_index=True)
        combined = combined.drop_duplicates(subset=["text"], keep=False)
        combined.to_csv(output_path, index=False)
        return combined

    test_dataset = combine(("haha_test", "spanish_jokes"), f"{data_folder}/test.csv")
    train_dataset = combine(("haha_train", "stupid_stuff"), f"{data_folder}/train.csv")
    return train_dataset, test_dataset


# Write train.csv / test.csv and keep both frames for the profiling cells that follow.
train_dataset, test_dataset = process_train_test()

In [11]:
# Profile the combined training split; the report is displayed as this cell's output.
ProfileReport(train_dataset)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:01<00:00,  1.94it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [12]:
# Profile the combined test split; the report is displayed as this cell's output.
ProfileReport(test_dataset)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  3.12it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

