In [1]:
import pandas as pd
import json
from pandas.api.types import is_object_dtype
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# ver 1
df = pd.read_csv(r"D:\coding\CEPP\combined_output\all_combined.csv")
df = df.iloc[:900_000].copy()

original_df_count = len(df)

def parse_json_field(json_str):
    try:
        return json.loads(json_str) if pd.notnull(json_str) else {}
    except json.JSONDecodeError:
        return {}

#CATEGORY
df["category_dict"] = df["category"].apply(parse_json_field)

df["category_name"] = df["category_dict"].apply(lambda x: x.get("name"))
df["category_slug"] = df["category_dict"].apply(lambda x: x.get("slug"))
df["category_url"] = df["category_dict"].apply(lambda x: x.get("urls", {}).get("web", {}).get("discover"))

#CREATOR
df["creator_dict"] = df["creator"].apply(parse_json_field)

df["creator_id"] = df["creator_dict"].apply(lambda x: x.get("id"))
df["creator_name"] = df["creator_dict"].apply(lambda x: x.get("name"))
df["creator_avatar_thumb"] = df["creator_dict"].apply(lambda x: x.get("avatar", {}).get("thumb"))
df["creator_profile_url"] = df["creator_dict"].apply(lambda x: x.get("urls", {}).get("web", {}).get("user"))
df["creator_api_url"] = df["creator_dict"].apply(lambda x: x.get("urls", {}).get("api", {}).get("user"))

#LOCATION
df["location_dict"] = df["location"].apply(parse_json_field)

df["location_id"] = df["location_dict"].apply(lambda x: x.get("id"))
df["location_name"] = df["location_dict"].apply(lambda x: x.get("name"))
df["location_slug"] = df["location_dict"].apply(lambda x: x.get("slug"))
df["location_short_name"] = df["location_dict"].apply(lambda x: x.get("short_name"))
df["location_country"] = df["location_dict"].apply(lambda x: x.get("country"))
df["location_state"] = df["location_dict"].apply(lambda x: x.get("state"))
df["location_url_web_discover"] = df["location_dict"].apply(lambda x: x.get("urls", {}).get("web", {}).get("discover"))
df["location_url_web_location"] = df["location_dict"].apply(lambda x: x.get("urls", {}).get("web", {}).get("location"))

#PROFILE
df["profile_dict"] = df["profile"].apply(parse_json_field)
df["profile_id"] = df["profile_dict"].apply(lambda x: x.get("id"))
df["profile_project_id"] = df["profile_dict"].apply(lambda x: x.get("project_id"))
df["profile_state"] = df["profile_dict"].apply(lambda x: x.get("state"))

#DATE
df["created_at"] = pd.to_datetime(df["created_at"], unit="s")
df["deadline"] = pd.to_datetime(df["deadline"], unit="s")
df["launched_at"] = pd.to_datetime(df["launched_at"], unit="s")
df["state_changed_at"] = pd.to_datetime(df["state_changed_at"], unit="s")


df = df[df["state"].isin(["successful", "failed"])]
df["state"] = (df["state"] == "successful").astype(int)

df.drop(columns=["category", 
                 "category_dict", 
                 "creator", 
                 "creator_id",
                 "creator_dict",
                 "location", 
                 "location_dict", 
                 "profile", 
                 "profile_dict", 
                 "profile_id",
                 "profile_project_id",
                 "profile_state",
                 "urls",
                 "id",
                 "slug",
                 "usd_type",
                 "source_url",
                 "category_url",
                 "creator_avatar_thumb",
                 "creator_profile_url",
                 "creator_api_url",
                 "location_url_web_discover",
                 "location_url_web_location",
                 "location_slug",
                 "location_id",
                 "location_short_name",
                 "currency_symbol",
                 "currency_trailing_code",
                 "disable_communication",
                 "is_starrable",
                 "is_in_post_campaign_pledging_phase",
                 "is_disliked",
                 "is_liked",
                 "source_month",
                 "source_file",
                 "is_launched"
                 ], 
        inplace=True)


df["success_rate"] = df["percent_funded"].round(0).astype(int)
df["has_video"] = df["video"].notna().astype(int)
df["has_photo"] = df["photo"].notna().astype(int)
df["days_diff_created_at_deadline"] = (df["deadline"] - df["created_at"]).dt.days
df["days_diff_state_changed_at_launched_at"] = (df["state_changed_at"] - df["launched_at"]).dt.days
# df["days_diff_launched_at_deadline"] = (df["deadline"] - df["launched_at"]).dt.days.clip(lower=1).astype(np.float32)
df["days_diff_launched_at_deadline"] = (df["deadline"] - df["launched_at"]).dt.days
df["days_diff_launched_at_deadline_log"] = np.log1p(df["days_diff_launched_at_deadline"].clip(lower=1))
df["too_short_or_long"] = ((df["days_diff_launched_at_deadline"] < 30) | (df["days_diff_launched_at_deadline"] > 60)).astype("int8")
df["name_len"] = df["name"].astype(str).str.len()
df["blurb_len"] = df["blurb"].astype(str).str.len()

df["year_created_at"] = df["created_at"].dt.year
df["month_created_at"] = df["created_at"].dt.month
df["day_created_at"] = df["created_at"].dt.day

df["created_at_mon_sin"] = np.sin(2 * np.pi * (df["month_created_at"] - 1) / 12)
df["created_at_mon_cos"] = np.cos(2 * np.pi * (df["month_created_at"] - 1) / 12)

df["created_at_dom_sin"] = np.sin(2 * np.pi * (df["day_created_at"] - 1) / 31)
df["created_at_dom_cos"] = np.cos(2 * np.pi * (df["day_created_at"] - 1) / 31)

df["year_deadline"] = df["deadline"].dt.year
df["month_deadline"] = df["deadline"].dt.month
df["day_deadline"] = df["deadline"].dt.day

df["deadline_mon_sin"] = np.sin(2 * np.pi * (df["month_deadline"] - 1) / 12)
df["deadline_mon_cos"] = np.cos(2 * np.pi * (df["month_deadline"] - 1) / 12)

df["deadline_dom_sin"] = np.sin(2 * np.pi * (df["day_deadline"] - 1) / 31)
df["deadline_dom_cos"] = np.cos(2 * np.pi * (df["day_deadline"] - 1) / 31)

df["year_state_changed_at"] = df["state_changed_at"].dt.year
df["month_state_changed_at"] = df["state_changed_at"].dt.month
df["day_state_changed_at"] = df["state_changed_at"].dt.day

df["state_changed_at_mon_sin"] = np.sin(2 * np.pi * (df["month_state_changed_at"] - 1) / 12)
df["state_changed_at_mon_cos"] = np.cos(2 * np.pi * (df["month_state_changed_at"] - 1) / 12)

df["state_changed_at_dom_sin"] = np.sin(2 * np.pi * (df["day_state_changed_at"] - 1) / 31)
df["state_changed_at_dom_cos"] = np.cos(2 * np.pi * (df["day_state_changed_at"] - 1) / 31)

df["year_launched_at"] = df["launched_at"].dt.year
df["month_launched_at"] = df["launched_at"].dt.month
df["day_launched_at"] = df["launched_at"].dt.day

df["launched_at_mon_sin"] = np.sin(2 * np.pi * (df["month_launched_at"] - 1) / 12)
df["launched_at_mon_cos"] = np.cos(2 * np.pi * (df["month_launched_at"] - 1) / 12)

df["launched_at_dom_sin"] = np.sin(2 * np.pi * (df["day_launched_at"] - 1) / 31)
df["launched_at_dom_cos"] = np.cos(2 * np.pi * (df["day_launched_at"] - 1) / 31)

prep = (df["launched_at"] - df["created_at"]).dt.total_seconds() / 86400.0
df["prep_days"] = prep.clip(lower=0)
df["launch_dow"]    = df["launched_at"].dt.dayofweek 
df["deadline_dow"]    = df["deadline"].dt.dayofweek 

category_mapping = {
    # Art & Design
    "Art": "Art & Design", "Fine Art": "Art & Design", "Digital Art": "Art & Design",
    "Illustration": "Art & Design", "Mixed Media": "Art & Design", "Painting": "Art & Design",
    "Print": "Art & Design", "Printing": "Art & Design", "Graphic Design": "Art & Design",
    "Typography": "Art & Design", "Conceptual Art": "Art & Design", "Installations": "Art & Design",
    "Public Art": "Art & Design", "Design": "Art & Design", "Interactive Design": "Art & Design",
    "Product Design": "Art & Design", "Fashion": "Art & Design", "Jewelry": "Art & Design",
    "Pottery": "Art & Design", "Ceramics": "Art & Design", "Sculpture": "Art & Design",
    "Textile": "Art & Design", "Embroidery": "Art & Design", "Weaving": "Art & Design",
    "Knitting": "Art & Design", "Crochet": "Art & Design", "Stationery": "Art & Design",
    "Letterpress": "Art & Design", "Art Books": "Art & Design", "Crafts": "Art & Design",
    "DIY": "Art & Design", "DIY Electronics": "Art & Design", "Woodworking": "Art & Design",
    "Candle": "Art & Design", "Glass": "Art & Design", "Metal": "Art & Design",
    "Quilt": "Art & Design", "Fabrication Tools": "Art & Design", "Textiles": "Art & Design",
    "Quilts": "Art & Design",

    # Music
    "Music": "Music", "Classical Music": "Music", "Electronic Music": "Music",
    "Hip-Hop": "Music", "Indie Rock": "Music", "Jazz": "Music", "Latin": "Music",
    "Pop": "Music", "Punk": "Music", "R&B": "Music", "Rock": "Music", "Blues": "Music",
    "World Music": "Music", "Sound": "Music", "Music Videos": "Music", "Audio": "Music",
    "Country & Folk": "Music", "Chiptune": "Music", "Dance": "Music",

    # Film, Video & Theater
    "Film & Video": "Film, Video & Theater", "Narrative Film": "Film, Video & Theater",
    "Documentary": "Film, Video & Theater", "Shorts": "Film, Video & Theater",
    "Animation": "Film, Video & Theater", "Video": "Film, Video & Theater",
    "Video Art": "Film, Video & Theater", "Video Games": "Film, Video & Theater",
    "Television": "Film, Video & Theater", "Theater": "Film, Video & Theater",
    "Plays": "Film, Video & Theater", "Performances": "Film, Video & Theater",
    "Performance Art": "Film, Video & Theater", "Musical": "Film, Video & Theater",
    "Drama": "Film, Video & Theater", "Comedy": "Film, Video & Theater",
    "Horror": "Film, Video & Theater", "Thrillers": "Film, Video & Theater",
    "Webseries": "Film, Video & Theater", "Movie Theaters": "Film, Video & Theater",
    "Festivals": "Film, Video & Theater", "Action": "Film, Video & Theater",

    # Books, Writing & Publishing
    "Books": "Books, Writing & Publishing", "Children's Books": "Books, Writing & Publishing",
    "Comic Books": "Books, Writing & Publishing", "Comics": "Books, Writing & Publishing",
    "Graphic Novels": "Books, Writing & Publishing", "Cookbooks": "Books, Writing & Publishing",
    "Poetry": "Books, Writing & Publishing", "Fiction": "Books, Writing & Publishing",
    "Nonfiction": "Books, Writing & Publishing", "Romance": "Books, Writing & Publishing",
    "Science Fiction": "Books, Writing & Publishing", "Fantasy": "Books, Writing & Publishing",
    "Anthologies": "Books, Writing & Publishing", "Zines": "Books, Writing & Publishing",
    "Literary Journals": "Books, Writing & Publishing", "Literary Spaces": "Books, Writing & Publishing",
    "Periodicals": "Books, Writing & Publishing", "Translations": "Books, Writing & Publishing",
    "Publishing": "Books, Writing & Publishing", "Journalism": "Books, Writing & Publishing",
    "Radio & Podcasts": "Books, Writing & Publishing","Calendars": "Books, Writing & Publishing",
    "Young Adult": "Books, Writing & Publishing", "Academic": "Books, Writing & Publishing",

    # Games & Toys
    "Games": "Games & Toys", "Tabletop Games": "Games & Toys", "Live Games": "Games & Toys",
    "Mobile Games": "Games & Toys", "Gaming Hardware": "Games & Toys", "Toys": "Games & Toys",
    "Playing Cards": "Games & Toys", "Puzzles": "Games & Toys",

    # Fashion & Wearables
    "Apparel": "Fashion & Wearables", "Ready-to-wear": "Fashion & Wearables",
    "Childrenswear": "Fashion & Wearables", "Footwear": "Fashion & Wearables",
    "Pet Fashion": "Fashion & Wearables", "Accessories": "Fashion & Wearables",
    "Wearables": "Fashion & Wearables", "Couture": "Fashion & Wearables",

    # Food & Drink
    "Food": "Food & Drink", "Drinks": "Food & Drink", "Food Trucks": "Food & Drink",
    "Vegan": "Food & Drink", "Small Batch": "Food & Drink", "Restaurants": "Food & Drink",
    "Bakeries": "Food & Drink", "Farmers Markets": "Food & Drink", "Farms": "Food & Drink",
    "Bacon": "Food & Drink", "Candles": "Food & Drink", "Farmer's Markets": "Food & Drink",

    # Technology & Software
    "Apps": "Technology & Software", "Software": "Technology & Software", "Web": "Technology & Software",
    "Webcomics": "Technology & Software", "Robots": "Technology & Software", "Gadgets": "Technology & Software",
    "Camera Equipment": "Technology & Software", "Hardware": "Technology & Software",
    "Technology": "Technology & Software", "3D Printing": "Technology & Software",
    "Space Exploration": "Technology & Software",

    # Community & Social Impact
    "Community Gardens": "Community & Social Impact", "Social Practice": "Community & Social Impact",
    "Faith": "Community & Social Impact", "Civic Design": "Community & Social Impact",
    "Family": "Community & Social Impact", "People": "Community & Social Impact",
    "Kids": "Community & Social Impact", "Events": "Community & Social Impact",
    "Residencies": "Community & Social Impact", "Workshops": "Community & Social Impact",
    "Places": "Community & Social Impact", "Spaces": "Community & Social Impact",
    "Makerspaces": "Community & Social Impact",

    # Nature & Miscellaneous
    "Animals": "Nature & Miscellaneous", "Nature": "Nature & Miscellaneous", "Taxidermy": "Nature & Miscellaneous",
    "Flight": "Nature & Miscellaneous", "Immersive": "Nature & Miscellaneous", "Experimental": "Nature & Miscellaneous",
    "Photography": "Nature & Miscellaneous", "Photo": "Nature & Miscellaneous",
    "Photobooks": "Nature & Miscellaneous", "Architecture": "Nature & Miscellaneous",
}
target_categories = [
    "Art & Design",
    "Film, Video & Theater",
    "Books, Writing & Publishing",
    "Music",
    "Technology & Software",
    "Food & Drink",
    "Fashion & Wearables",
    "Games & Toys",
    "Community & Social Impact",
    "Nature & Miscellaneous"
]
df["category_group"] = df["category_name"].map(category_mapping).fillna("Unknown")
df = df[df["category_group"].isin(target_categories)].copy()
n_per_class = df["category_group"].value_counts().min()
df = (
    df.groupby("category_group", group_keys=False)
      .apply(lambda g: g.sample(n=n_per_class, random_state=42))
      .reset_index(drop=True)
)

rate = df["static_usd_rate"].fillna(df["usd_exchange_rate"]).astype(float).replace(0, np.nan)
goal_usd = (df["goal"].astype(float) * rate).replace([np.inf, -np.inf], np.nan)
usd_pledged = df["usd_pledged"].fillna(df.get("converted_pledged_amount")).astype(float)

valid = (goal_usd > 0) & usd_pledged.notna()
sr_usd = pd.Series(np.nan, index=df.index, dtype=float)
sr_usd[valid] = usd_pledged[valid] / goal_usd[valid]
sr_usd = sr_usd.replace([np.inf, -np.inf], np.nan)
df["success_ratio_usd"] = sr_usd  # ‚Üê ‡∏ï‡∏±‡∏ß‡πÄ‡∏î‡∏µ‡∏¢‡∏ß ‡πÉ‡∏ä‡πâ‡πÄ‡∏õ‡πá‡∏ô‡∏ê‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏™‡∏≠‡∏á‡∏´‡∏±‡∏ß

# --- risk_level: tertile ‡∏ï‡πà‡∏≠‡∏´‡∏°‡∏ß‡∏î (‡∏Å‡∏±‡∏ô ties + ‡∏Å‡∏±‡∏ô‡∏´‡∏°‡∏ß‡∏î‡πÄ‡∏•‡πá‡∏Å) ---
def pooled_tertile(vals, groups=None, alpha=50, labels=(0,1,2)):
    """
    vals   : pd.Series ‡∏Ç‡∏≠‡∏á‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏à‡∏±‡∏î‡∏≠‡∏±‡∏ô‡∏î‡∏±‡∏ö (index ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö groups)
    groups : 
        - pd.Series ‡∏Ç‡∏≠‡∏á‡∏Å‡∏•‡∏∏‡πà‡∏° (‡∏¢‡∏≤‡∏ß‡πÄ‡∏ó‡πà‡∏≤ vals), 
        - ‡∏´‡∏£‡∏∑‡∏≠ list/tuple ‡∏Ç‡∏≠‡∏á Series (‡∏à‡∏±‡∏î‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÅ‡∏ö‡∏ö‡∏´‡∏•‡∏≤‡∏¢‡∏°‡∏¥‡∏ï‡∏¥), 
        - ‡∏´‡∏£‡∏∑‡∏≠ None = ‡∏ñ‡∏∑‡∏≠‡∏ß‡πà‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
    alpha  : ‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å pooling ‡∏Å‡∏±‡∏ö global CDF (‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÄ‡∏•‡πá‡∏Å‡∏à‡∏∞‡∏û‡∏∂‡πà‡∏á global ‡∏°‡∏≤‡∏Å‡∏Ç‡∏∂‡πâ‡∏ô)
    """
    import numpy as np
    import pandas as pd

    # ‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡∏Ñ‡πà‡∏≤
    v_all = pd.to_numeric(vals, errors="coerce").replace([np.inf, -np.inf], np.nan)
    out   = pd.Series(pd.NA, index=v_all.index, dtype="Int64")
    if v_all.notna().sum() == 0:
        return out

    # Global CDF
    cdf_global = v_all.rank(method="average", pct=True)

    # ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏ï‡∏±‡∏ß‡πÅ‡∏ö‡πà‡∏á‡∏Å‡∏•‡∏∏‡πà‡∏°
    if groups is None:
        by = pd.Series(0, index=v_all.index)             # ‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÄ‡∏î‡∏µ‡∏¢‡∏ß
    elif isinstance(groups, (list, tuple)):
        # ‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏´‡∏•‡∏≤‡∏¢‡∏°‡∏¥‡∏ï‡∏¥ (‡πÄ‡∏ä‡πà‡∏ô [df['category_group'], df['month_launched_at']])
        by = [g.reindex(v_all.index) if isinstance(g, pd.Series) else pd.Series(g, index=v_all.index)
              for g in groups]
    else:
        # ‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß: ‡∏ï‡πâ‡∏≠‡∏á‡πÄ‡∏õ‡πá‡∏ô Series ‡∏¢‡∏≤‡∏ß‡πÄ‡∏ó‡πà‡∏≤ vals
        by = groups.reindex(v_all.index) if isinstance(groups, pd.Series) else pd.Series(groups, index=v_all.index)

    # ‡∏ß‡∏¥‡πà‡∏á‡∏ó‡∏µ‡∏•‡∏∞‡∏Å‡∏•‡∏∏‡πà‡∏°
    gb = v_all.groupby(by, dropna=False)
    for key, idx in gb.groups.items():
        v = v_all.loc[idx].dropna()
        if v.empty:
            continue
        cdf_g   = v.rank(method="average", pct=True)
        w       = len(v) / (len(v) + alpha)  # ‡∏Å‡∏•‡∏∏‡πà‡∏°‡πÉ‡∏´‡∏ç‡πà‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡πà‡∏≤‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡∏°‡∏≤‡∏Å‡∏Ç‡∏∂‡πâ‡∏ô
        cdf_star = w*cdf_g + (1-w)*cdf_global.loc[v.index]

        out.loc[v.index] = pd.cut(
            cdf_star, bins=[0, 1/3, 2/3, 1], labels=labels, include_lowest=True
        ).astype("Int64")

    return out



# ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏™‡πÄ‡∏Å‡∏•‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ó‡∏±‡πâ‡∏á df (global)
sr = np.log1p(
    df["success_ratio_usd"].clip(
        lower=df["success_ratio_usd"].quantile(0.01),
        upper=df["success_ratio_usd"].quantile(0.99)
    )
)

# ‡∏ó‡∏≥ pooled tertile ‡πÅ‡∏ö‡∏ö‡∏´‡∏•‡∏≤‡∏¢‡∏°‡∏¥‡∏ï‡∏¥: category_group √ó month_launched_at
df["risk_level"] = pooled_tertile(
    sr,
    groups=[df["category_group"], df["month_launched_at"]],
    alpha=50
)



baseline = df.groupby(["category_group"])["usd_pledged"].median().rename("expected_pledge_baseline")
df = df.merge(baseline, on="category_group", how="left")
df["goal_suitability_index"] = (goal_usd / df["expected_pledge_baseline"]).replace([np.inf, -np.inf], np.nan)

bins_goal = [-np.inf, 0.5, 2.0, np.inf]
labels_goal = [0, 1, 2]   # 0=‡∏ï‡πà‡∏≥‡πÑ‡∏õ, 1=‡∏û‡∏≠‡πÄ‡∏´‡∏°‡∏≤‡∏∞, 2=‡∏™‡∏π‡∏á‡πÑ‡∏õ
df["goal_eval"] = pd.cut(df["goal_suitability_index"], bins=bins_goal, labels=labels_goal).astype("Int64")

s = df["days_diff_state_changed_at_launched_at"]
df["duration_class"] = pd.qcut(
    s.rank(method="first"),  # ‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡πÉ‡∏´‡πâ‡∏•‡∏≥‡∏î‡∏±‡∏ö‡πÑ‡∏°‡πà‡∏ä‡∏ô‡∏Å‡∏±‡∏ô
    q=4,
    labels=[0,1,2,3]
).astype(int)

# 0) ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏™‡πÄ‡∏Å‡∏• (‡∏ñ‡πâ‡∏≤ success_rate ‡πÄ‡∏î‡∏¥‡∏°‡πÄ‡∏õ‡πá‡∏ô % 0‚Äì100 ‡πÉ‡∏´‡πâ‡∏´‡∏≤‡∏£ 100)
s = pd.to_numeric(df["success_rate"], errors="coerce").astype(float)
s = s.clip(lower=0)                  # ‡∏Å‡∏±‡∏ô‡∏Ñ‡πà‡∏≤‡∏ï‡∏¥‡∏î‡∏•‡∏ö
s = s / 100.0                        # <-- ‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡πÄ‡∏õ‡∏≠‡∏£‡πå‡πÄ‡∏ã‡πá‡∏ô‡∏ï‡πå
s_log = np.log1p(s)                  # log(1+x) ‡∏Å‡∏±‡∏ô‡∏´‡∏≤‡∏á‡∏Ç‡∏ß‡∏≤‡∏¢‡∏≤‡∏ß

bins  = [-np.inf, 50, 80, 100, 150, 300, np.inf]
labels = [0, 1, 2, 3, 4, 5]
df['success_rate_cls'] = pd.cut(df["percent_funded"], bins=bins, labels=labels).astype('int64')
# labels = [0, 1, 2, 3, 4, 5]
# df['success_rate_cls'] = pd.qcut(
#     df['percent_funded'],
#     q=[0, .15, .30, .50, .70, .85, 1.0],   # ‡∏õ‡∏£‡∏±‡∏ö‡πÄ‡∏õ‡∏≠‡∏£‡πå‡πÄ‡∏ã‡πá‡∏ô‡πÑ‡∏ó‡∏•‡πå‡∏ï‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏ö‡∏≤‡∏•‡∏≤‡∏ô‡∏ã‡πå
#     labels=labels, duplicates='drop'
# ).astype('int64')


df["goal_usd"] = goal_usd
df["goal_usd_log"] = np.log1p(df["goal"] * df["static_usd_rate"])
df["goal_per_day"] = (df["goal_usd"] / df["days_diff_launched_at_deadline"])
df["goal_per_day_log"] = np.log1p(df["goal_per_day"])

grp = df.groupby("category_group")
df["gpd_rank_in_cat"]      = grp["goal_per_day"].rank(pct=True)
df["gpd_vs_cat_median"]    = df["goal_per_day"] / grp["goal_per_day"].transform("median")
df["gpd_dist_cat_median"]  = (np.log(df["gpd_vs_cat_median"].clip(lower=1e-9)) ).abs()  

df["log1p_gpd_vs_cat_med"] = np.log1p(df["gpd_vs_cat_median"])

df["goal_rank_in_cat"]   = df.groupby("category_group")["goal_usd"].rank(pct=True)
df["goal_vs_cat_median"] = df["goal_usd"] / df.groupby("category_group")["goal_usd"].transform("median")
df["goal_vs_country_median"] = df["goal_usd"] / df.groupby("country_displayable_name")["goal_usd"].transform("median")

# --- roundness of goal ---
df["goal_round_100"]  = ((df["goal_usd"] % 100).abs()  < 1e-6).astype("int8")
df["goal_round_1000"] = ((df["goal_usd"] % 1000).abs() < 1e-6).astype("int8")

df["cat_freq"]      = df["category_group"].map(df["category_group"].value_counts())
df["country_freq"]  = df["country_displayable_name"].map(df["country_displayable_name"].value_counts())
cc = df["category_group"].astype(str) + "|" + df["country_displayable_name"].astype(str)
df["cat_x_country_freq"] = cc.map(cc.value_counts())
df["cat_country_share"] = df["cat_x_country_freq"] / df["cat_freq"].replace(0,1)

# ratio = (
#     pd.to_numeric(df["percent_funded"], errors="coerce")/100.0
# ).fillna(
#     pd.to_numeric(df["pledged"], errors="coerce") /
#     pd.to_numeric(df["goal"], errors="coerce").replace(0, np.nan)
# ).fillna(0.0).clip(lower=0.0)

ratio = (pd.to_numeric(df["pledged"], errors="coerce") /
    pd.to_numeric(df["goal"], errors="coerce").replace(0, np.nan)
).fillna(0.0).clip(lower=0.0)
# 1) near_miss_cls: 0=clear_fail, 1=near_miss, 2=success
# df["near_miss_cls"] = np.select(
#     [ratio < 0.90, (ratio >= 0.90) & (ratio < 1.00), ratio >= 1.00],
#     [0, 1, 2],
#     default=0
# ).astype("int8")

# 2) shortfall_severity_cls: 0=no_shortfall, 1=mild, 2=moderate, 3=severe
df["shortfall_severity_cls"] = np.select(
    [ratio >= 1.00, (ratio >= 0.80) & (ratio < 1.00), (ratio >= 0.50) & (ratio < 0.80), ratio < 0.50],
    [0, 1, 2, 3],
    default=3
).astype("int8")

# 3) stretch_potential_cls: 0=no_stretch, 1=light, 2=strong
df["stretch_potential_cls"] = np.select(
    [ratio < 1.00, (ratio >= 1.00) & (ratio < 1.25), ratio >= 1.25],
    [0, 1, 2],
    default=0
).astype("int8")

print(df.isnull().sum())

  .apply(lambda g: g.sample(n=n_per_class, random_state=42))


backers_count               0
blurb                       3
converted_pledged_amount    0
country                     0
country_displayable_name    0
                           ..
country_freq                0
cat_x_country_freq          0
cat_country_share           0
shortfall_severity_cls      0
stretch_potential_cls       0
Length: 99, dtype: int64


In [3]:
from sklearn.model_selection import KFold

def eb_prior_oof(y, group_keys, alpha=50, n_splits=5, random_state=42):
    """
    y           : pd.Series (‡πÄ‡∏ä‡πà‡∏ô log1p(success_ratio_usd))
    group_keys  : list of Series (‡πÄ‡∏ä‡πà‡∏ô [df['category_group'], df['month_launched_at']])
    return      : pd.Series OOF-smoothed prior
    """
    idx = y.index
    gkey = pd.MultiIndex.from_arrays([g.reindex(idx) for g in group_keys])
    prior = pd.Series(index=idx, dtype=float)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, valid_idx in kf.split(idx):
        tr = idx[train_idx]; va = idx[valid_idx]
        y_tr = y.loc[tr]
        g_tr = gkey[train_idx]; g_va = gkey[valid_idx]

        # ‡∏™‡∏ñ‡∏¥‡∏ï‡∏¥‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏à‡∏≤‡∏Å train fold ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô
        cnt = pd.Series(1, index=tr).groupby(g_tr).sum()
        sumy = y_tr.groupby(g_tr).sum()
        mu  = y_tr.mean()

        # EB smoothing
        eb = (sumy + alpha*mu) / (cnt + alpha)

        # map ‡πÑ‡∏õ‡∏ó‡∏µ‡πà‡∏ù‡∏±‡πà‡∏á val
        prior.loc[va] = eb.reindex(g_va).values

    return prior

# y = global target proxy ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö prior
y_sr = np.log1p(
    df["success_ratio_usd"].clip(df["success_ratio_usd"].quantile(0.01),
                                 df["success_ratio_usd"].quantile(0.99))
)

df["prior_cat_mon_sr"] = eb_prior_oof(
    y=y_sr,
    group_keys=[df["category_group"], df["month_launched_at"]],
    alpha=50
)


In [4]:
grp = df.groupby(["category_group","month_launched_at"])
df["cat_mon_n"]           = grp["goal_usd"].transform("size")
df["cat_mon_goal_med"]    = grp["goal_usd"].transform("median")
df["cat_mon_gpd_med"]     = grp["goal_per_day"].transform("median")
df["gpd_rank_in_cat_mon"] = grp["goal_per_day"].rank(pct=True)
df["goal_rank_in_cat_mon"]= grp["goal_usd"].rank(pct=True)


In [5]:
df["goal_vs_cat_country_med"] = df["goal_usd"] / df.groupby(
    ["category_group","country_displayable_name"]
)["goal_usd"].transform("median")

df["gpd_vs_cat_country_med"]  = df["goal_per_day"] / df.groupby(
    ["category_group","country_displayable_name"]
)["goal_per_day"].transform("median")


In [6]:
# ‡∏™‡∏≠‡∏á‡∏´‡∏•‡∏±‡∏Å‡∏ó‡πâ‡∏≤‡∏¢
last2 = (df["goal_usd"].round(0) % 100).fillna(-1).astype(int)
df["goal_last2"]    = last2
df["goal_end_00"]   = (last2 == 0).astype("int8")
df["goal_end_99"]   = (last2 == 99).astype("int8")
df["goal_k_bucket"] = np.floor(np.log10(df["goal_usd"].clip(lower=1))).astype("Int64")  # scale order


In [7]:
# ‡∏ö‡∏¥‡∏ô‡∏£‡∏∞‡∏¢‡∏∞‡∏ß‡∏±‡∏ô‡πÄ‡∏õ‡πá‡∏ô‡∏ö‡∏¥‡∏ô‡∏´‡∏¢‡∏≤‡∏ö ‡πÜ
dur_bin = pd.cut(df["days_diff_launched_at_deadline"],
                 bins=[-np.inf, 29, 45, 60, np.inf],
                 labels=["<30","30-45","46-60",">60"])
df["dur_bin"] = dur_bin.astype("category")

g2 = df.groupby(["category_group","dur_bin"])
df["gpd_rank_cat_dur"]   = g2["goal_per_day"].rank(pct=True)
df["gpd_vs_catdur_med"]  = df["goal_per_day"] / g2["goal_per_day"].transform("median")


  g2 = df.groupby(["category_group","dur_bin"])


In [8]:
# ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏ß‡∏±‡∏ô
df = df.sort_values("launched_at").copy()
df["day"] = df["launched_at"].dt.floor("D")

# ‡∏ô‡∏±‡∏ö‡∏à‡∏≥‡∏ô‡∏ß‡∏ô launch ‡∏ï‡πà‡∏≠‡∏ß‡∏±‡∏ô‡∏ï‡πà‡∏≠‡∏´‡∏°‡∏ß‡∏î (‡πÑ‡∏î‡πâ Series ‡∏ó‡∏µ‡πà‡∏°‡∏µ MultiIndex: (category_group, day))
daily = (df.groupby(["category_group", "day"])
           .size()
           .rename("launches"))

# ‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏•‡∏∞‡∏´‡∏°‡∏ß‡∏î -> ‡∏î‡∏£‡∏≠‡∏õ level ‡∏´‡∏°‡∏ß‡∏î‡πÉ‡∏´‡πâ‡πÄ‡∏´‡∏•‡∏∑‡∏≠ index = day (DatetimeIndex)
# ‡πÄ‡∏ï‡∏¥‡∏°‡∏ß‡∏±‡∏ô‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏≤‡∏¢‡∏ß‡∏±‡∏ô ‡πÅ‡∏•‡πâ‡∏ß rolling 30 ‡∏ß‡∏±‡∏ô
roll_30d = (daily
            .groupby(level=0)                              # ‡∏≠‡∏¢‡πà‡∏≤‡∏ï‡∏±‡πâ‡∏á group_keys=False
            .apply(lambda s: s.droplevel(0)                # <‚Äî ‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç: ‡πÄ‡∏≠‡∏≤ level ‡∏´‡∏°‡∏ß‡∏î‡∏≠‡∏≠‡∏Å‡∏Å‡πà‡∏≠‡∏ô
                              .asfreq("D", fill_value=0)   # ‡πÄ‡∏ï‡∏¥‡∏°‡∏ß‡∏±‡∏ô‡∏ß‡πà‡∏≤‡∏á = 0
                              .rolling(30, min_periods=1).sum())
            .rename("cat_30d_launch_density"))

# map ‡∏Å‡∏•‡∏±‡∏ö‡πÄ‡∏Ç‡πâ‡∏≤ df ‡∏î‡πâ‡∏ß‡∏¢ MultiIndex (category_group, day)
idx = pd.MultiIndex.from_frame(df[["category_group", "day"]])
df["cat_30d_launch_density"] = roll_30d.reindex(idx).to_numpy()

# ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå‡πÄ‡∏™‡∏£‡∏¥‡∏°‡πÉ‡∏´‡πâ‡∏≠‡πà‡∏≤‡∏ô‡∏á‡πà‡∏≤‡∏¢‡∏Ç‡∏∂‡πâ‡∏ô (rank ‡πÅ‡∏•‡∏∞ z-score ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡∏´‡∏°‡∏ß‡∏î)
df["cat_30d_density_rank"] = (
    df.groupby("category_group")["cat_30d_launch_density"].rank(pct=True)
)
mu = df.groupby("category_group")["cat_30d_launch_density"].transform("mean")
sd = df.groupby("category_group")["cat_30d_launch_density"].transform("std").replace(0, 1)
df["cat_30d_density_z"] = (df["cat_30d_launch_density"] - mu) / sd


In [9]:
df["prior_cat_mon_goal"] = eb_prior_oof(
    y=np.log1p(df["goal_usd"]),
    group_keys=[df["category_group"], df["month_launched_at"]],
    alpha=30
)
df["goal_minus_prior_cat_mon"] = np.log1p(df["goal_usd"]) - df["prior_cat_mon_goal"]

df["prior_cat_country_mon_goal"] = eb_prior_oof(
    y=np.log1p(df["goal_usd"]),
    group_keys=[df["category_group"], df["country_displayable_name"], df["month_launched_at"]],
    alpha=50
)
df["goal_minus_prior_cat_country_mon"] = np.log1p(df["goal_usd"]) - df["prior_cat_country_mon_goal"]


In [10]:
df["gpd_vs_catdur_med"].value_counts().sort_index()

gpd_vs_catdur_med
0.000020        1
0.000064        1
0.000067        2
0.000079        1
0.000089        7
               ..
26288.460129    1
28196.999747    3
31416.003791    1
32135.331336    1
56020.870833    2
Name: count, Length: 73955, dtype: int64

In [11]:
uniq_all = df["success_rate_cls"].unique()
print("unique (with NA):", uniq_all)

unique (with NA): [5 4 3 2 0 1]


In [12]:
unique_categories = sorted(df["category_name"].dropna().unique())
print(f"üß© ‡∏°‡∏µ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(unique_categories)} ‡∏´‡∏°‡∏ß‡∏î‡∏´‡∏°‡∏π‡πà\n")
for i, cat in enumerate(unique_categories, 1):
    print(f"{i:>2}. {cat}")


üß© ‡∏°‡∏µ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 161 ‡∏´‡∏°‡∏ß‡∏î‡∏´‡∏°‡∏π‡πà

 1. 3D Printing
 2. Academic
 3. Accessories
 4. Action
 5. Animals
 6. Animation
 7. Anthologies
 8. Apparel
 9. Apps
10. Architecture
11. Art
12. Art Books
13. Audio
14. Bacon
15. Blues
16. Calendars
17. Camera Equipment
18. Candles
19. Ceramics
20. Children's Books
21. Childrenswear
22. Chiptune
23. Civic Design
24. Classical Music
25. Comedy
26. Comic Books
27. Comics
28. Community Gardens
29. Conceptual Art
30. Cookbooks
31. Country & Folk
32. Couture
33. Crafts
34. Crochet
35. DIY
36. DIY Electronics
37. Dance
38. Design
39. Digital Art
40. Documentary
41. Drama
42. Drinks
43. Electronic Music
44. Embroidery
45. Events
46. Experimental
47. Fabrication Tools
48. Faith
49. Family
50. Fantasy
51. Farmer's Markets
52. Farms
53. Fashion
54. Festivals
55. Fiction
56. Film & Video
57. Fine Art
58. Flight
59. Food
60. Food Trucks
61. Footwear
62. Gadgets
63. Games
64. Gaming Hardware
65. Glass
66. Graphic Design
67. Graphic N

In [13]:
df["category_group"].value_counts()


category_group
Technology & Software          31495
Art & Design                   31495
Books, Writing & Publishing    31495
Music                          31495
Film, Video & Theater          31495
Games & Toys                   31495
Nature & Miscellaneous         31495
Community & Social Impact      31495
Fashion & Wearables            31495
Food & Drink                   31495
Name: count, dtype: int64

In [14]:
'''

### üé® **Art & Design**

* Art
* Fine Art
* Digital Art
* Illustration
* Mixed Media
* Painting
* Print
* Printing
* Graphic Design
* Typography
* Conceptual Art
* Installations
* Public Art
* Design
* Interactive Design
* Product Design
* Fashion
* Jewelry
* Pottery
* Ceramics
* Sculpture
* Textile
* Embroidery
* Weaving
* Knitting
* Crochet
* Stationery
* Letterpress
* Art Books
* Crafts
* DIY
* DIY Electronics
* Woodworking
* Candle
* Glass
* Metal
* Quilt
* Fabrication Tools

---

### üéµ **Music**

* Music
* Classical Music
* Electronic Music
* Hip-Hop
* Indie Rock
* Jazz
* Latin
* Pop
* Punk
* R\&B
* Rock
* Blues
* World Music
* Sound
* Music Videos
* Audio

---

### üé¨ **Film, Video & Theater**

* Film & Video
* Narrative Film
* Documentary
* Shorts
* Animation
* Video
* Video Art
* Video Games
* Television
* Theater
* Plays
* Performances
* Performance Art
* Musical
* Drama
* Comedy
* Horror
* Thrillers
* Webseries

---

### üìö **Books, Writing & Publishing**

* Books
* Children's Books
* Comic Books
* Comics
* Graphic Novels
* Cookbooks
* Poetry
* Fiction
* Nonfiction
* Romance
* Science Fiction
* Fantasy
* Anthologies
* Zines
* Literary Journals
* Literary Spaces
* Periodicals
* Translations
* Publishing
* Journalism

---

### üïπÔ∏è **Games & Toys**

* Games
* Tabletop Games
* Live Games
* Mobile Games
* Gaming Hardware
* Toys
* Playing Cards
* Puzzles

---

### üëó **Fashion & Wearables**

* Apparel
* Ready-to-wear
* Childrenswear
* Footwear
* Pet Fashion
* Accessories
* Wearables
* Couture

---

### üçΩÔ∏è **Food & Drink**

* Food
* Drinks
* Food Trucks
* Vegan
* Small Batch
* Restaurants
* Bakeries
* Farmers Markets
* Farms
* Bacon
* Candles

---

### üß† **Technology & Software**

* Apps
* Software
* Web
* Webcomics
* Webseries
* Robots
* Gadgets
* Camera Equipment
* Hardware
* Technology
* 3D Printing
* Space Exploration

---

### üßë‚Äçü§ù‚Äçüßë **Community & Social Impact**

* Community Gardens
* Social Practice
* Faith
* Civic Design
* Family
* People
* Kids
* Events
* Residencies
* Workshops
* Places
* Spaces
* Makerspaces

---

### üêæ **Nature & Miscellaneous**

* Animals
* Nature
* Taxidermy
* Flight
* Immersive
* Experimental
* Photography
* Photo
* Photobooks
* Architecture

---

'''


"\n\n### üé® **Art & Design**\n\n* Art\n* Fine Art\n* Digital Art\n* Illustration\n* Mixed Media\n* Painting\n* Print\n* Printing\n* Graphic Design\n* Typography\n* Conceptual Art\n* Installations\n* Public Art\n* Design\n* Interactive Design\n* Product Design\n* Fashion\n* Jewelry\n* Pottery\n* Ceramics\n* Sculpture\n* Textile\n* Embroidery\n* Weaving\n* Knitting\n* Crochet\n* Stationery\n* Letterpress\n* Art Books\n* Crafts\n* DIY\n* DIY Electronics\n* Woodworking\n* Candle\n* Glass\n* Metal\n* Quilt\n* Fabrication Tools\n\n---\n\n### üéµ **Music**\n\n* Music\n* Classical Music\n* Electronic Music\n* Hip-Hop\n* Indie Rock\n* Jazz\n* Latin\n* Pop\n* Punk\n* R\\&B\n* Rock\n* Blues\n* World Music\n* Sound\n* Music Videos\n* Audio\n\n---\n\n### üé¨ **Film, Video & Theater**\n\n* Film & Video\n* Narrative Film\n* Documentary\n* Shorts\n* Animation\n* Video\n* Video Art\n* Video Games\n* Television\n* Theater\n* Plays\n* Performances\n* Performance Art\n* Musical\n* Drama\n* Comedy\n* H

In [15]:
df.to_csv("convert_json_data.csv", index=False)

In [16]:
df.info()
print(len(df))

<class 'pandas.core.frame.DataFrame'>
Index: 314950 entries, 296885 to 32261
Columns: 122 entries, backers_count to goal_minus_prior_cat_country_mon
dtypes: Int64(3), bool(3), category(1), datetime64[ns](5), float64(57), int32(20), int64(11), int8(7), object(15)
memory usage: 257.4+ MB
314950
