In [2]:
"""
Basic Exploratory Data Analysis (EDA)
for Party Aggregator project.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from collections import Counter, defaultdict
from stop_words import get_stop_words


# Load the processed party table and ask pandas to parse "timestamp" as dates.
party_df = pd.read_csv(
    "../data/processed/df.csv",
    parse_dates=["timestamp"],
)

def contains_hebrew(text):
    """Return True if ``text`` has at least one character in the U+0590-U+05FF range."""
    hebrew_match = re.search(r'[\u0590-\u05FF]', text)
    return hebrew_match is not None

def reverse_if_hebrew(text):
    """Return ``text`` reversed character by character when it contains Hebrew, else unchanged."""
    if contains_hebrew(text):
        return text[::-1]
    return text

  party_df = pd.read_csv("../data/processed/df.csv", parse_dates=["timestamp"])


In [3]:
# print(f"df shape: {party_df.shape}")
# print(f"df columns: {party_df.columns}")
# print(f"df dtypes: {party_df.dtypes}")

# 1. Remove duplicate rows
party_df = party_df.drop_duplicates()

# 2. Convert timestamp to datetime
party_df["timestamp"] = pd.to_datetime(party_df["timestamp"], errors="coerce")  # coerce bad formats to NaT

# 3. Normalize string columns
string_cols = ["arrived_marker", "file_id", "folder_name", "full_name", "special_requests", "song_requests"]

for col in string_cols:
    # astype(str) turns missing values into the literal text "nan", which then
    # looks like real data (a guest called "nan", a request "nan", ...).
    # Remember which cells actually hold a value and restore missingness after
    # the whitespace normalisation.
    has_value = party_df[col].notna()
    normalized = (
        party_df[col]
        .astype(str)
        .str.strip()  # remove leading/trailing spaces
        .str.replace(r"\s+", " ", regex=True)  # replace multiple spaces with single space
    )
    party_df[col] = normalized.where(has_value)

party_df["arrived"] = party_df["arrived"].astype("boolean")  # Nullable BooleanDtype

# 4. Replace blank strings with NaN in relevant text fields
party_df[["special_requests", "song_requests"]] = party_df[["special_requests", "song_requests"]].replace(r"^\s*$", pd.NA, regex=True)

# 5. Convert folder names to categorical
party_df["folder_name"] = party_df["folder_name"].astype("category")

# print(f"df dtypes: {party_df.dtypes}")

  party_df["timestamp"] = pd.to_datetime(party_df["timestamp"], errors="coerce")  # coerce bad formats to NaT


In [4]:
# party_df[['arrived_marker', 'full_name']].groupby(['arrived_marker']).count()
# party_df

# Song Requests

In [5]:
# Normalise first (trim + lower-case) and only then blank out placeholder
# values. Matching placeholders before normalising misses variants such as
# "N/A", "Nan" or "NA " that differ only by case or surrounding whitespace.
song_requests_normalized = (
    party_df["song_requests"]
    .astype("string")  # nullable string dtype; None/NaN become <NA>
    .str.strip()
    .str.lower()
)
party_df["song_requests_clean"] = song_requests_normalized.mask(
    song_requests_normalized.isin(["", "na", "nan", "n/a"])
)

In [7]:
def plot_request_completion(
    df, request_col="song_requests", folder_col="folder_name", top_n=None, title=None
):
    """
    Plot a stacked bar chart showing percent-filled and percent-empty requests per folder.

    A request counts as "filled" when it is not missing (None / NaN / pd.NA) and
    is not an empty or whitespace-only string. Literal placeholder texts such as
    "nan" are counted as filled, so pass an already cleaned column if those
    should be treated as empty. ``df`` is not modified.

    Args:
        df (pd.DataFrame): The input DataFrame.
        request_col (str): Column with the requests (e.g., 'song_requests').
        folder_col (str): Column indicating folder/party (e.g., 'folder_name').
        top_n (int, optional): Show only the top-N parties ranked by their
            number of non-empty requests.
        title (str, optional): Custom title for the plot.
    """
    # === Aggregation logic ===
    requests = df[request_col]
    is_filled = requests.notna() & requests.astype(str).str.strip().ne("")

    # "size" counts every row of a folder (unlike "count", which skips nulls);
    # observed=True drops unused categories so no folder has a zero total.
    g_df = (
        is_filled.groupby(df[folder_col], observed=True)
        .agg(total_requests="size", non_empty_requests="sum")
        .reset_index()
    )

    # Percent calculations: filled share = non-empty / total
    g_df["percent_filled"] = (g_df["non_empty_requests"] / g_df["total_requests"]) * 100
    g_df["percent_empty"] = 100 - g_df["percent_filled"]
    # astype(str) guarantees the helper receives plain strings
    g_df["folder_name_display"] = g_df[folder_col].astype(str).map(reverse_if_hebrew)

    if top_n:
        g_df = (
            g_df.sort_values("non_empty_requests", ascending=False)
            .head(top_n)
            .reset_index(drop=True)
        )

    # === Plot ===
    plt.figure(figsize=(10, 6))
    filled_bars = plt.bar(
        g_df["folder_name_display"],
        g_df["percent_filled"],
        label="Non-empty (%)",
        color="skyblue",
    )
    plt.bar(
        g_df["folder_name_display"],
        g_df["percent_empty"],
        bottom=g_df["percent_filled"],
        label="Empty (%)",
        color="lightcoral",
    )

    # Dynamic labels inside bars. Pair each bar with its own row positionally;
    # label-based lookups break once the frame has been sorted/truncated.
    for bar, filled, empty in zip(filled_bars, g_df["percent_filled"], g_df["percent_empty"]):
        x = bar.get_x() + bar.get_width() / 2

        # Skip labels on segments too thin to hold readable text
        if filled > 5:
            plt.text(
                x,
                filled / 2,
                f"{filled:.0f}%",
                ha="center",
                va="center",
                fontsize=8,
                color="black",
            )
        if empty > 5:
            plt.text(
                x,
                filled + empty / 2,
                f"{empty:.0f}%",
                ha="center",
                va="center",
                fontsize=8,
                color="white",
            )

    # Final touches
    plt.title(title or f"{request_col.replace('_', ' ').title()} Completion per Party")
    plt.xlabel("Party Name")
    plt.ylabel("Request Completion (%)")
    plt.ylim(0, 110)
    plt.legend()
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.grid(axis="y", linestyle="--", alpha=0.3)
    plt.show()

In [17]:
# plot_request_completion(party_df, request_col="song_requests", title="Song Requests Completion per Party")

### Stop words

In [9]:

# Hebrew stop words come from column 1 of a header-less CSV.
he_stop_words_df = pd.read_csv("../top_3000_most_freq_wiki.csv", header=None)
hebrew_stop_words = {word for word in he_stop_words_df[1].astype(str).str.strip()}

# Merge the English and Hebrew lists into one lookup set.
en_stop_words = set(get_stop_words('en'))
he_stop_words = set(hebrew_stop_words)
all_stop_words = en_stop_words | he_stop_words

def tokenize_and_filter(text, stop_words=None, min_length=3):
    """
    Split ``text`` into word tokens and keep only the informative ones.

    A token is kept when it is at least ``min_length`` characters long, purely
    alphabetic, and not in ``stop_words``. Stop-word matching is exact
    (case-sensitive), so lower-case the text beforehand if the list is lower-case.

    Args:
        text: Raw text. Missing values (None / NaN / pd.NA) give an empty list;
            other non-string values are converted with ``str()`` first.
        stop_words (collection of str, optional): Tokens to discard. Defaults to
            the notebook-level ``all_stop_words`` set.
        min_length (int): Minimum token length to keep. The default of 3 matches
            the previous hard-coded ``len(t) > 2`` rule.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    if not isinstance(text, str):
        # protect against missing/NA values
        if pd.isna(text):
            return []
        text = str(text)

    if stop_words is None:
        stop_words = all_stop_words

    # Tokenize with regex to support Hebrew + English
    tokens = re.findall(r'\b\w+\b', text, flags=re.UNICODE)
    return [
        t for t in tokens
        if len(t) >= min_length and t.isalpha() and t not in stop_words
    ]

# Tokenise every cleaned song request into its own list of tokens.
party_df["song_request_tokens"] = (
    party_df["song_requests_clean"].apply(tokenize_and_filter)
)

# party_df[["song_requests", "song_requests_clean", "song_request_tokens"]].sample(15)

### N-grams Utility Functions (Bag of Words)

In [10]:
# === Step 1: Get global token frequency from flattened list ===
def get_global_token_freq(df: pd.DataFrame, token_col: str = "song_request_tokens", top_n: int = 50) -> pd.DataFrame:
    """
    Count tokens across all rows and return the most frequent ones.

    Args:
        df: Frame whose ``token_col`` holds an iterable of tokens per row.
        token_col: Name of the column with the per-row tokens.
        top_n: Number of most common tokens to return.

    Returns:
        DataFrame with columns ``token`` and ``count``, most frequent first.
    """
    tally = Counter()
    for row_tokens in df[token_col]:
        for tok in row_tokens:
            tally[tok] += 1
    ranked = tally.most_common(top_n)
    return pd.DataFrame(ranked, columns=["token", "count"])

# === Step 2: Plot global token frequency (with RTL support) ===
def plot_top_tokens(freq_df: pd.DataFrame, top_n: int = 10, title: str = "Top Song Request Tokens") -> None:
    """
    Draw a bar chart of the ``top_n`` highest-count tokens in ``freq_df``.

    Token labels go through ``reverse_if_hebrew`` before plotting. The caller's
    frame is not modified; the function displays the figure and returns nothing.

    Args:
        freq_df: Frame with ``token`` and ``count`` columns.
        top_n: Number of bars to draw.
        title: Figure title.
    """
    display_df = freq_df.assign(token=freq_df["token"].apply(reverse_if_hebrew))
    ranked = display_df.nlargest(top_n, "count")

    plt.figure(figsize=(15, 5))
    plt.bar(ranked["token"], ranked["count"], color="orchid")
    plt.title(title)
    plt.xlabel("Token")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45, ha="right")
    plt.grid(True, linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()

# === Step 3: Get per-party token frequency table ===
def get_party_token_freq(df: pd.DataFrame, folder_col: str = "folder_name", token_col: str = "song_request_tokens") -> pd.DataFrame:
    """
    Build a long-format table of token counts per party.

    Args:
        df: Frame with one row per submission.
        folder_col: Column identifying the party of each row.
        token_col: Column holding an iterable of tokens for each row.

    Returns:
        DataFrame with columns ``party``, ``token`` and ``count``, one row per
        (party, token) pair. The three columns are present even when there is
        nothing to count, so downstream sorting/grouping on them still works.
    """
    party_token_counts = defaultdict(Counter)

    # Walk the two columns in lockstep; cheaper than iterrows(), which builds a
    # Series for every row.
    for party, tokens in zip(df[folder_col], df[token_col]):
        party_token_counts[party].update(tokens)

    rows = [
        {"party": party, "token": token, "count": count}
        for party, counts in party_token_counts.items()
        for token, count in counts.items()
    ]

    # Explicit columns keep the schema stable for empty results.
    return pd.DataFrame(rows, columns=["party", "token", "count"])

# === Step 4: Get top N tokens per party (optional) ===
def get_top_tokens_by_party(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """
    Keep the ``top_n`` highest-count rows of every party.

    Args:
        df: Frame with ``party`` and ``count`` columns.
        top_n: Maximum number of rows to keep per party.

    Returns:
        Rows ordered by party (ascending) then count (descending), with a fresh
        0..n-1 index.
    """
    ordered = df.sort_values(by=["party", "count"], ascending=[True, False])
    leaders = ordered.groupby("party").head(top_n)
    return leaders.reset_index(drop=True)
    

### Song Request Unigram

In [18]:
# 50 most frequent song-request tokens across all parties.
global_freq_df = get_global_token_freq(
    party_df,
    token_col="song_request_tokens",
    top_n=50,
)
# plot_top_tokens(global_freq_df, top_n=30, title="Top 30 Global Song Request Tokens")

### Song Request Bigrams

In [19]:
from nltk.util import ngrams

def generate_ngrams(tokens, n=2):
    """Return the n-grams of ``tokens`` as strings, each joining its tokens with a single space."""
    return list(map(' '.join, ngrams(tokens, n)))

# Derive bigram and trigram columns from the song-request tokens.
for ngram_col, ngram_size in (("bigrams", 2), ("trigrams", 3)):
    party_df[ngram_col] = party_df["song_request_tokens"].apply(generate_ngrams, n=ngram_size)

# Get global top bigrams
global_bigram_df = get_global_token_freq(
    party_df,
    token_col="bigrams",
    top_n=30,
)

# Plot
# plot_top_tokens(global_bigram_df, top_n=20, title="Top 20 Global Bigrams in Song Requests")

# Special Requests

In [13]:
# Normalise first (trim + lower-case) and only then blank out placeholder
# values. Matching placeholders before normalising misses variants such as
# "N/A", "Nan" or "NA " that differ only by case or surrounding whitespace.
special_requests_normalized = (
    party_df["special_requests"]
    .astype("string")  # nullable string dtype; None/NaN become <NA>
    .str.strip()
    .str.lower()
)
party_df["special_requests_clean"] = special_requests_normalized.mask(
    special_requests_normalized.isin(["", "na", "nan", "n/a"])
)

# Tokenise every cleaned special request into its own list of tokens.
party_df["special_request_tokens"] = party_df["special_requests_clean"].apply(tokenize_and_filter)

### Special requests Unigram

In [20]:
# 50 most frequent special-request tokens across all parties.
special_unigram_df = get_global_token_freq(
    party_df,
    token_col="special_request_tokens",
    top_n=50,
)
# plot_top_tokens(special_unigram_df, top_n=30, title="Top Special Request Tokens")

### Special requests Bigram

In [21]:
# Bigrams of the special-request tokens, then their 30 most frequent values.
party_df["special_bigrams"] = party_df["special_request_tokens"].apply(generate_ngrams, n=2)
special_bigram_df = get_global_token_freq(
    party_df,
    token_col="special_bigrams",
    top_n=30,
)
# plot_top_tokens(special_bigram_df, top_n=20, title="Top Special Request Bigrams")

In [22]:
# plot_request_completion(party_df, request_col="special_requests", title="Special Request Completion per Party")