In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

from utils.base_directory import base_directory

In [None]:
# Task switch: True inspects a political-leaning dataset, False a politicalness dataset.
IS_DATASET_ON_POLITICAL_LEANING = True
DATASET_NAME = "commoncrawl_news_articles"

# The task decides both the label column and the dataset subdirectory.
if IS_DATASET_ON_POLITICAL_LEANING:
    label_column_name = "leaning"
    task_directory_name = "political_leaning"
else:
    label_column_name = "politicalness"
    task_directory_name = "politicalness"

dataset_path = (
    base_directory
    / "datasets"
    / task_directory_name
    / "preprocessed"
    / f"{DATASET_NAME}.parquet"
)
df = pd.read_parquet(dataset_path)
df

General statistical information.

In [None]:
# Display the summary statistics table produced by pandas' default `describe()`.
df.describe()

Distribution of the title word count.

In [None]:
# Guard on the column that is actually plotted, so the cell neither raises a
# KeyError nor skips the plot when "title" and "title_word_count" do not coexist.
if "title_word_count" in df.columns:
    max_title_word_count = df["title_word_count"].max()
    # The bin count grows logarithmically with the longest title. It is clamped
    # to at least one bin because log(1) == 0 and log(0) == -inf would otherwise
    # yield an invalid `bins` value.
    if max_title_word_count > 1:
        title_bin_count = max(1, int(np.log(max_title_word_count) * 5))
    else:
        title_bin_count = 1

    # `Series.hist` returns the Axes it drew on; label that Axes explicitly
    # instead of relying on pyplot's implicit "current axes" state.
    title_ax = df["title_word_count"].hist(bins=title_bin_count)
    title_ax.set_xlabel("title word count")
    title_ax.set_ylabel("frequency")

Distribution of the body word count.

In [None]:
# Bar chart of body word counts: a fixed number of equal-width bins that cover
# (at least) the 95th percentile, plus one overflow bar for everything above.

# Number of regular (non-overflow) bins; the single source for all bin math below.
regular_bin_count = 6

body_word_counts = df["body_word_count"]
percentile_95 = np.percentile(body_word_counts, 95)

# Round the bin width up to a "nice" multiple: tens below 1000 words, hundreds otherwise.
rounding_factor = 10 if percentile_95 < 1000 else 100
raw_bin_width = percentile_95 / regular_bin_count
bin_width = int(np.ceil(raw_bin_width / rounding_factor) * rounding_factor)
# A 95th percentile of 0 would produce zero-width bins (all edges 0, labels
# like "1-0"); fall back to a single rounding step in that case. For any
# positive percentile the ceil above already guarantees bin_width >= rounding_factor.
bin_width = max(bin_width, rounding_factor)

bin_edges = [i * bin_width for i in range(regular_bin_count + 1)]
threshold = bin_edges[-1]

bin_labels = []
counts = []
for bin_index, (lower_edge, upper_edge) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    # Bins are closed on both ends: the first starts at 0, every later one
    # starts one above the previous bin's upper edge so no value is counted twice.
    lower_bound = lower_edge if bin_index == 0 else lower_edge + 1
    bin_labels.append(f"{lower_bound}-{upper_edge}")
    # `between` is inclusive on both sides by default.
    counts.append(body_word_counts.between(lower_bound, upper_edge).sum())

# Overflow bar: everything strictly above the last regular edge.
bin_labels.append(f"more than {threshold}")
more_than_count = (body_word_counts > threshold).sum()
counts.append(more_than_count)

fig, ax = plt.subplots(figsize=(7, 6))

bar_positions = range(len(bin_labels))
ax.bar(bar_positions, counts)

ax.set_xticks(bar_positions)
ax.set_xticklabels(bin_labels, rotation=45, ha="right")

ax.set_ylabel("frequency", fontsize=15)
ax.set_xlabel("body word count", fontsize=15)
ax.tick_params(axis="both", labelsize=14)
ax.set_axisbelow(True)
plt.tight_layout(pad=0)

The label class distribution.

In [None]:
# Count rows per label class, then draw each class's share as a pie wedge
# annotated with its percentage. The trailing semicolon suppresses the Axes repr.
label_class_sizes = df.groupby(label_column_name, observed=True).size()
label_class_sizes.plot.pie(autopct="%1.1f %%");