**Tämä notebook suorittaa blurb-sarakkeen analyysin ja käyttää add_blurb_features.py -moduulia, joka lisää datasettiin analyysiä varten blurb-piirteet:**

- Notebook lukee esikäsitellyn datasetin (parquet)  
- Kutsuu add_blurb_features.py-moduulia ja tuottaa tilastollisen yhteenvedon blurb-datasta

In [10]:
import os
import pandas as pd

from preprocessing.add_blurb_features import add_blurb_features


In [11]:
def analyze_blurbs(df: pd.DataFrame) -> None:
    """
    Analyze the blurb column of the dataset: missing/invalid values, length distributions,
    simple text-based features and language shares.
    """

    print("\n=== 1. Checking missing/invalid blurbs ===")

    missing_mask = df["blurb_missing"] == 1
    valid_mask = df["blurb_missing"] == 0

    total = len(df)
    missing = missing_mask.sum()
    valid = valid_mask.sum()

    print(f"Total rows:          {total}")
    print(f"Invalid/missing:     {missing} ({missing / total * 100:.2f}%)")
    print(f"Valid blurbs:        {valid} ({valid / total * 100:.2f}%)")

    print("\nBreakdown of invalid types:")
    blurb_raw = df["blurb"].astype(str).str.strip()

    # Check whether the string contains at least one letter or digit (unicode)
    contains_letter_or_digit = blurb_raw.apply(
        lambda s: any(ch.isalnum() for ch in s)
    )

    invalid_types = {
        "empty_or_nan": (df["blurb"].isna() | (blurb_raw == "")),
        "excel_artifacts": blurb_raw.str.upper().isin(
            {"#NAME", "#NAME?", "??", "?", "N/A", "NA", "NAN"}
        ),
        "only_digits": blurb_raw.str.fullmatch(r"\d+").fillna(False),
        "only_special_chars": ~contains_letter_or_digit & (blurb_raw != ""),
    }

    for name, mask in invalid_types.items():
        # Calculate the proportion only for the rows where blurb_missing == 1
        mask_invalid = mask & missing_mask
        count = mask_invalid.sum()
        print(f"  {name}: {count} ({count / total * 100:.2f}%)")

    print("\n=== 2. Length distributions (valid blurbs only) ===")
    valid_df = df[valid_mask]

    print(f"Average character length: {valid_df['blurb_char_len'].mean():.2f}")
    print(f"Median character length:  {valid_df['blurb_char_len'].median():.2f}")
    print(f"Average word count:       {valid_df['blurb_word_len'].mean():.2f}")
    print(f"Median word count:        {valid_df['blurb_word_len'].median():.2f}")

    print("\n=== 3. Simple blurb features (valid blurbs) ===")
    print(
        "Share containing numbers: "
        f"{valid_df['has_number'].mean() * 100:.2f}%"
    )
    print(
        "Share containing '!':     "
        f"{(valid_df['exclamation_count'] > 0).mean() * 100:.2f}%"
    )

    print("\n=== 4. Language distribution among valid blurbs ===")
    lang_share = (
        valid_df["blurb_lang"]
        .value_counts(normalize=True)
        .sort_values(ascending=False) * 100
    )
    print(lang_share)

    print("\nDone.")

In [12]:
df = pd.read_parquet("../data/processed_data.parquet")
df.head()

Unnamed: 0,blurb,category_parent_id,category_parent_name,country,creator_id,currency,deadline,goal,id,name,state,launched_date,fx_date,fx_daily_mean,usd_goal_fx
0,A Year of Sanderson: Enjoy books and swag boxe...,18,Publishing,US,74501917,USD,2022-03-31,1000000.0,1497949659,Surprise! Four Secret Novels by Brandon Sanderson,successful,2022-03-01,2022-03-01,1.0,1000000.0
1,Color e-paper smartwatch with up to 7 days of ...,7,Design,US,597507018,USD,2015-03-28,500000.0,1799979574,"Pebble Time - Awesome Smartwatch, No Compromises",successful,2015-02-24,2015-02-24,1.0,500000.0
2,Beginning with The Stormlight Archive and expa...,12,Games,US,237961243,USD,2024-08-30,250000.0,7816448,Brandon Sanderson's Cosmere® RPG,successful,2024-08-06,2024-08-06,1.0,250000.0
3,The COOLEST is a portable party disguised as a...,7,Design,US,203090294,USD,2014-08-30,50000.0,342886736,COOLEST COOLER: 21st Century Cooler that's Act...,successful,2014-07-08,2014-07-08,1.0,50000.0
4,Euro-inspired dungeon crawling sequel to the 2...,12,Games,US,1350948450,USD,2020-05-01,500000.0,374744378,Frosthaven,successful,2020-03-31,2020-03-31,1.0,500000.0


In [13]:
print("Generating blurb features...")
df_feat = add_blurb_features(df)

df_feat[[
    "blurb_missing",
    "blurb_char_len",
    "blurb_word_len",
    "exclamation_count",
    "has_number",
    "blurb_lang",
    "blurb_is_english",
]].head()

Generating blurb features...


Unnamed: 0,blurb_missing,blurb_char_len,blurb_word_len,exclamation_count,has_number,blurb_lang,blurb_is_english
0,0,64,11,1,1,en,1
1,0,128,22,0,1,en,1
2,0,132,20,1,0,en,1
3,0,118,20,0,0,en,1
4,0,70,9,0,1,en,1


In [14]:
print("Running blurb analysis...\n")
analyze_blurbs(df_feat)

Running blurb analysis...


=== 1. Checking missing/invalid blurbs ===
Total rows:          596353
Invalid/missing:     268 (0.04%)
Valid blurbs:        596085 (99.96%)

Breakdown of invalid types:
  empty_or_nan: 0 (0.00%)
  excel_artifacts: 97 (0.02%)
  only_digits: 12 (0.00%)
  only_special_chars: 161 (0.03%)

=== 2. Length distributions (valid blurbs only) ===
Average character length: 106.80
Median character length:  119.00
Average word count:       17.83
Median word count:        19.00

=== 3. Simple blurb features (valid blurbs) ===
Share containing numbers: 21.61%
Share containing '!':     22.63%

=== 4. Language distribution among valid blurbs ===
blurb_lang
en         96.248018
es          1.039952
fr          0.684969
de          0.655108
it          0.267747
da          0.137732
sv          0.135383
ca          0.127499
nl          0.103676
no          0.090927
ro          0.085391
af          0.068111
tl          0.061568
pt          0.046973
ja          0.034727
id       