In [None]:
import pandas as pd
import numpy as np

# --- Configuration ---
# CSV inputs for the two dataset splits; both are loaded with pd.read_csv
# further down. The paths are relative, so they are resolved against the
# current working directory of the process running this script.
TRAIN_FILE = "data/train_data_SMM4H_2025_Task_1.csv"
DEV_FILE = "data/dev_data_SMM4H_2025_Task_1.csv"

# --- Functions ---
def compute_text_length(df):
    """Add a word-count column to ``df`` and return it.

    The ``text`` column is normalised to strings, with missing values
    (``None``/``NaN``) replaced by the empty string so that they count as
    zero words. ``text_length`` holds the number of whitespace-separated
    tokens of each text.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same DataFrame, with ``text`` cleaned and ``text_length`` added.
    """
    # Record missing entries *before* casting: ``astype(str)`` turns NaN/None
    # into the literal strings 'nan'/'None', which would then be counted as
    # one-word texts and could no longer be caught by ``fillna``.
    missing = df['text'].isna()
    df['text'] = df['text'].astype(str).mask(missing, '')
    df['text_length'] = df['text'].str.split().str.len().fillna(0).astype(int)
    return df

def get_stats(df, dataset_name):
    """Compute descriptive statistics for one dataset split.

    Works on a copy of ``df`` (the caller's frame is not modified), adds a
    word-count column via ``compute_text_length`` and aggregates counts,
    mean text lengths and text-length quartiles.

    Args:
        df: DataFrame with ``text``, ``label``, ``language`` and ``type``
            columns.
        dataset_name: Display name stored under the ``"name"`` key.

    Returns:
        dict with the keys

        * ``name``, ``total``, ``languages``, ``types``, ``labels``
        * ``lang_counts``, ``type_counts``, ``label_counts_overall``:
          value counts of the respective column
        * ``lang_label_counts``: language x label counts (0 when absent)
        * ``avg_len_label_overall``, ``avg_len_lang_overall``: mean length
        * ``avg_len_lang_label``: language x label mean length (NaN when the
          combination has no rows)
        * ``quantile_len_label_overall``, ``quantile_len_lang_overall``:
          frames with ``Q1``/``Median``/``Q3`` columns
        * ``quantile_len_lang_label``: per-language frame whose columns are
          a ``(Quantile, Label)`` MultiIndex
    """
    df = compute_text_length(df.copy())
    labels = sorted(df['label'].unique())
    quantiles_list = [0.25, 0.5, 0.75]
    quantile_names = {0.25: 'Q1', 0.50: 'Median', 0.75: 'Q3'}
    quantile_headers = [quantile_names[q] for q in quantiles_list]

    # Word counts grouped once per key; reused for both means and quantiles.
    len_by_label = df.groupby('label')['text_length']
    len_by_lang = df.groupby('language')['text_length']
    len_by_lang_label = df.groupby(['language', 'label'])['text_length']

    # Counts
    lang_counts = df['language'].value_counts()
    type_counts = df['type'].value_counts()
    label_counts_overall = df['label'].value_counts()
    lang_label_counts = df.groupby(['language', 'label']).size().unstack(fill_value=0)

    # Means. A language/label combination without rows has no mean, so it is
    # left as NaN; filling with 0 would report a bogus "0 words" average.
    avg_len_label_overall = len_by_label.mean()
    avg_len_lang_overall = len_by_lang.mean()
    avg_len_lang_label = len_by_lang_label.mean().unstack()

    # Quartiles: unstack moves the quantile level into the columns, which are
    # then renamed from 0.25/0.5/0.75 to Q1/Median/Q3.
    quantile_len_label_overall = len_by_label.quantile(quantiles_list).unstack()
    quantile_len_label_overall.columns = list(quantile_headers)

    quantile_len_lang_overall = len_by_lang.quantile(quantiles_list).unstack()
    quantile_len_lang_overall.columns = list(quantile_headers)

    # Index is (language, label, quantile). Move label, then quantile, into
    # the columns so that rows are languages and columns are (label, quantile).
    quantile_len_lang_label = (
        len_by_lang_label.quantile(quantiles_list)
        .unstack(level='label')
        .unstack(level=-1)
    )
    if isinstance(quantile_len_lang_label.columns, pd.MultiIndex):
        # Reorder to (quantile, label) ...
        quantile_len_lang_label.columns = quantile_len_lang_label.columns.swaplevel(0, 1)
        # ... and rename the quantile level from number to name (Q1, Median, Q3)
        quantile_len_lang_label.columns = pd.MultiIndex.from_tuples(
            [(quantile_names[col[0]], col[1]) for col in quantile_len_lang_label.columns],
            names=['Quantile', 'Label']
        )
        # Sort by Label, then Quantile
        quantile_len_lang_label = quantile_len_lang_label.sort_index(axis=1, level=[1, 0])

    return {
        "name": dataset_name,
        "total": len(df),
        "languages": df['language'].unique(),
        "types": df['type'].unique(),
        "labels": labels,
        "lang_counts": lang_counts,
        "type_counts": type_counts,
        "label_counts_overall": label_counts_overall,
        "lang_label_counts": lang_label_counts,
        "avg_len_label_overall": avg_len_label_overall,
        "avg_len_lang_label": avg_len_lang_label,
        "avg_len_lang_overall": avg_len_lang_overall,
        "quantile_len_label_overall": quantile_len_label_overall,
        "quantile_len_lang_overall": quantile_len_lang_overall,
        "quantile_len_lang_label": quantile_len_lang_label,
    }

# --- Loading and Calculation ---
train_data = pd.read_csv(TRAIN_FILE)
dev_data = pd.read_csv(DEV_FILE)

train_stats = get_stats(train_data, "Train")
dev_stats = get_stats(dev_data, "Dev")

# Row/column keys shared by all tables. Each is the sorted union over both
# splits, so a value that occurs in only one split still gets its row/column.
all_languages = sorted(set(train_stats['languages']) | set(dev_stats['languages']))
all_types = sorted(set(train_stats['types']) | set(dev_stats['types']))
# Labels are unioned as well: taking them from the train split alone would
# silently drop any label that appears only in the dev split.
all_labels = sorted(set(train_stats['labels']) | set(dev_stats['labels']))
metrics_quantiles = ['Q1', 'Median', 'Q3']
metric_map_quantiles = {'Q1': 'Q1', 'Median': 'Med', 'Q3': 'Q3'} # Short names

# --- Table Construction ---

# Table 1: Overview
# One row per split: its size plus the sorted, comma-separated lists of the
# languages and types it contains.
both_splits = (train_stats, dev_stats)
df_overview = pd.DataFrame(
    {
        "Total Examples": [split['total'] for split in both_splits],
        "Languages": [', '.join(sorted(split['languages'])) for split in both_splits],
        "Unique Types": [', '.join(sorted(split['types'])) for split in both_splits],
    },
    index=[split['name'] for split in both_splits],
)
df_overview = df_overview.rename_axis("Dataset")

# Table 2: Distribution by Language and Label (Counts)
df_lang_label_counts = pd.DataFrame(index=all_languages)
for split_name, stats in (('Train', train_stats), ('Dev', dev_stats)):
    # Per-language total first, then one column per label (0 when the split
    # has no column for that label).
    df_lang_label_counts[(split_name, 'Total')] = stats['lang_counts']
    counts_by_label = stats['lang_label_counts']
    for label in all_labels:
        df_lang_label_counts[(split_name, f'Label {label}')] = counts_by_label.get(label, 0)
# Languages absent from a split are NaN after index alignment; report them as 0.
df_lang_label_counts = df_lang_label_counts.fillna(0).astype(int)
# Reindex to include all languages and set column order
df_lang_label_counts = df_lang_label_counts.reindex(all_languages, fill_value=0)
count_headers = ['Total'] + [f'Label {lbl}' for lbl in all_labels]
train_cols = [('Train', header) for header in count_headers]
dev_cols = [('Dev', header) for header in count_headers]
df_lang_label_counts = df_lang_label_counts[train_cols + dev_cols]
df_lang_label_counts.columns = pd.MultiIndex.from_tuples(
    train_cols + dev_cols, names=['Dataset', 'Metric']
)

# Table 3: Distribution by Type
# Both count series are aligned on the full list of types; a type missing
# from a split is shown with a count of 0.
type_count_columns = {
    'Train Count': train_stats['type_counts'],
    'Dev Count': dev_stats['type_counts'],
}
df_type_counts = pd.DataFrame(type_count_columns, index=all_types)
df_type_counts = df_type_counts.fillna(0).astype(int)
df_type_counts = df_type_counts.rename_axis("Type")

# Table 4: Overall Label Distribution
# Absolute counts per label and their share of the split, in percent
# (rounded to two decimals).
train_label_counts = train_stats['label_counts_overall']
dev_label_counts = dev_stats['label_counts_overall']
train_label_pct = (train_label_counts / train_stats['total'] * 100).round(2)
dev_label_pct = (dev_label_counts / dev_stats['total'] * 100).round(2)

df_label_overall = pd.DataFrame(
    {
        'Train Count': train_label_counts,
        'Train %': train_label_pct,
        'Dev Count': dev_label_counts,
        'Dev %': dev_label_pct,
    },
    index=all_labels,
)
# A label missing from one split shows up as 0 rather than NaN.
df_label_overall = df_label_overall.fillna(0).rename_axis("Label")

# Table 5: Average Text Length
df_avg_len = pd.DataFrame(index=all_languages)
for split_name, stats in (('Train', train_stats), ('Dev', dev_stats)):
    # Per-language mean over all labels, followed by one column per label
    # (NaN when the split has no column for that label).
    df_avg_len[(split_name, 'Overall Avg Len')] = stats['avg_len_lang_overall']
    means_by_label = stats['avg_len_lang_label']
    for label in all_labels:
        df_avg_len[(split_name, f'Avg Len L{label}')] = means_by_label.get(label, np.nan)
# Reindex and ordering
df_avg_len = df_avg_len.reindex(all_languages)
avg_len_headers = ['Overall Avg Len'] + [f'Avg Len L{lbl}' for lbl in all_labels]
train_len_cols = [('Train', header) for header in avg_len_headers]
dev_len_cols = [('Dev', header) for header in avg_len_headers]
df_avg_len = df_avg_len[train_len_cols + dev_len_cols].round(2)
df_avg_len.columns = pd.MultiIndex.from_tuples(
    train_len_cols + dev_len_cols, names=['Dataset', 'Metric']
)

# Table 6: Text Length Quantiles
quantile_sources = (('Train', train_stats), ('Dev', dev_stats))
df_quantile_len = pd.DataFrame(index=all_languages)

# Overall quantiles by language (all labels together)
for metric in metrics_quantiles:
    short_name = metric_map_quantiles[metric]
    for split_name, stats in quantile_sources:
        df_quantile_len[(split_name, f'Overall {short_name}')] = stats['quantile_len_lang_overall'][metric]

# Quantiles by language and label
for label in all_labels:
    for metric in metrics_quantiles:
        short_name = metric_map_quantiles[metric]
        # Source columns are keyed by (Quantile, Label); a split that lacks
        # the pair gets an all-NaN column instead.
        source_key = (metric, label)
        for split_name, stats in quantile_sources:
            per_label_quantiles = stats['quantile_len_lang_label']
            if source_key in per_label_quantiles:
                column_values = per_label_quantiles[source_key]
            else:
                column_values = np.nan
            df_quantile_len[(split_name, f'L{label} {short_name}')] = column_values

# Reindex and ordering
df_quantile_len = df_quantile_len.reindex(all_languages)
quantile_headers = [f'Overall {metric_map_quantiles[m]}' for m in metrics_quantiles]
quantile_headers += [
    f'L{lbl} {metric_map_quantiles[m]}' for lbl in all_labels for m in metrics_quantiles
]
train_quantile_cols = [('Train', header) for header in quantile_headers]
dev_quantile_cols = [('Dev', header) for header in quantile_headers]
# Only index with columns that are actually present in the frame
train_quantile_cols_exist = [col for col in train_quantile_cols if col in df_quantile_len.columns]
dev_quantile_cols_exist = [col for col in dev_quantile_cols if col in df_quantile_len.columns]
df_quantile_len = df_quantile_len[train_quantile_cols_exist + dev_quantile_cols_exist]
# Round quantiles to one decimal
df_quantile_len = df_quantile_len.astype(float).round(1)
df_quantile_len.columns = pd.MultiIndex.from_tuples(df_quantile_len.columns, names=['Dataset', 'Metric'])

# Overall summaries by label (Average and Quantiles)
# Mean text length per label, all languages combined.
avg_len_by_label = {
    'Train Avg Len': train_stats['avg_len_label_overall'],
    'Dev Avg Len': dev_stats['avg_len_label_overall'],
}
df_avg_len_overall_labels = pd.DataFrame(avg_len_by_label, index=all_labels).round(2)
df_avg_len_overall_labels = df_avg_len_overall_labels.rename_axis('Label')

# Q1 / Median / Q3 of the text length per label, all languages combined.
train_label_quantiles = train_stats['quantile_len_label_overall']
dev_label_quantiles = dev_stats['quantile_len_label_overall']
df_quantile_overall_labels = pd.DataFrame(index=all_labels)
for metric in metrics_quantiles:
    df_quantile_overall_labels[f'Train {metric}'] = train_label_quantiles[metric]
    df_quantile_overall_labels[f'Dev {metric}'] = dev_label_quantiles[metric]
df_quantile_overall_labels = df_quantile_overall_labels.astype(float).round(1)
df_quantile_overall_labels = df_quantile_overall_labels.rename_axis('Label')


# --- Display ---
# (heading, table) pairs in print order. Every heading except the first
# starts with "\n" so that consecutive tables are separated by a blank line.
report_sections = [
    ("--- Table 1: Datasets Overview ---", df_overview),
    ("\n--- Table 2: Distribution by Language and Label (Counts) ---", df_lang_label_counts),
    ("\n--- Table 3: Distribution by Type (Counts) ---", df_type_counts),
    ("\n--- Table 4: Overall Label Distribution ---", df_label_overall),
    ("\n--- Table 5: Average Text Length (in words) ---", df_avg_len),
    ("\n--- Overall Average Length by Label (All languages combined) ---", df_avg_len_overall_labels),
    ("\n--- Table 6: Text Length Quantiles (Q1, Median, Q3) ---", df_quantile_len),
    ("\n--- Overall Length Quantiles by Label (All languages combined) ---", df_quantile_overall_labels),
]
for heading, table in report_sections:
    print(heading)
    print(table)

--- Table 1: Datasets Overview ---
         Total Examples       Languages                         Unique Types
Dataset                                                                     
Train             31187  de, en, fr, ru  forum post, review, sentence, tweet
Dev                4625  de, en, fr, ru  forum post, review, sentence, tweet

--- Table 2: Distribution by Language and Label (Counts) ---
Dataset  Train                   Dev                
Metric   Total Label 0 Label 1 Total Label 0 Label 1
de        1482    1399      83   634     599      35
en       17974   16760    1214   902     841      61
fr         977     907      70   419     389      30
ru       10754    9670    1084  2670    2398     272

--- Table 3: Distribution by Type (Counts) ---
            Train Count  Dev Count
Type                              
forum post         2459       1053
review              397         99
sentence           3810        935
tweet             24521       2538

-