In [3]:
"""
Basic Exploratory Data Analysis (EDA) functions
for Party Aggregator project.
"""

import pandas as pd

df = pd.DataFrame(data='../arrived_pivot.csv', sep=',')
df.head()
def top_n_participants(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """
    Return the top N participants by number of arrived True records.

    Parameters:
    - df: pd.DataFrame - DataFrame containing at least 'full_name' and 'arrived' columns.
    - n: int - Number of top participants to return.

    Returns:
    - pd.DataFrame - Top N participants sorted by number of arrivals.
    """
    if 'full_name' not in df.columns or 'arrived' not in df.columns:
        raise ValueError("DataFrame must contain 'full_name' and 'arrived' columns.")

    arrived_df = df[df['arrived'] == True]
    
    top_arrivals = (
        arrived_df['full_name']
        .value_counts()
        .reset_index()
        .rename(columns={'index': 'full_name', 'full_name': 'arrival_count'})
        .sort_values(by='arrival_count', ascending=False)
        .head(n)
    )

    return top_arrivals


def overall_arrival_rate(df: pd.DataFrame) -> float:
    """
    Calculate the overall arrival rate (True arrivals / total records).

    Parameters:
    - df: pd.DataFrame - DataFrame containing 'arrived' column.

    Returns:
    - float - Overall arrival percentage.
    """
    if 'arrived' not in df.columns:
        raise ValueError("DataFrame must contain 'arrived' column.")

    arrival_rate = df['arrived'].mean()
    return round(arrival_rate * 100, 2)


def participant_duplicate_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count how many times each full name appears in the dataset.

    Parameters:
    - df: pd.DataFrame - DataFrame containing 'full_name' column.

    Returns:
    - pd.DataFrame - DataFrame of names and counts.
    """
    if 'full_name' not in df.columns:
        raise ValueError("DataFrame must contain 'full_name' column.")

    duplicates = (
        df['full_name']
        .value_counts()
        .reset_index()
        .rename(columns={'index': 'full_name', 'full_name': 'appearance_count'})
        .sort_values(by='appearance_count', ascending=False)
    )

    return duplicates


def summarize_basic_statistics(df: pd.DataFrame) -> None:
    """
    Print basic summary statistics of the dataset.

    Parameters:
    - df: pd.DataFrame - The full party attendance DataFrame.
    """
    print(f"📝 Total records: {len(df)}")
    print(f"🧍‍♂️ Unique participants: {df['full_name'].nunique()}")
    print(f"📈 Overall arrival rate: {overall_arrival_rate(df)}%")
    print(f"🎯 Top participants:\n{top_n_participants(df)}")

TypeError: DataFrame.__init__() got an unexpected keyword argument 'sep'

In [2]:
# summarize_basic_statistics()

TypeError: summarize_basic_statistics() missing 1 required positional argument: 'df'