# Prolific demographics summary
Collect all exported demographic CSV files, keep only accepted participants, and compute age and gender summaries.

In [69]:
from pathlib import Path
import pandas as pd
import plotly.express as px

data_dir = Path("demographic_data")
csv_files = sorted(data_dir.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir.resolve()}")

raw = pd.concat((pd.read_csv(path) for path in csv_files), ignore_index=True)
raw.head()

Unnamed: 0,Submission id,Participant id,Status,Custom study tncs accepted at,Started at,Completed at,Reviewed at,Archived at,Time taken,Completion code,Total approvals,Age,Sex,Ethnicity simplified,Country of birth,Country of residence,Nationality,Language,Student status,Employment status
0,68d172ee4f15faf3af4b65bb,60d26e7cd9f0761e4d12b9f8,APPROVED,Not Applicable,2025-09-22T16:02:34.414000Z,2025-09-22T16:34:08.982000Z,2025-09-23T04:30:20.403000Z,2025-09-22T16:34:09.540159Z,1895.0,C1KD1A87,396,25,Female,White,Hungary,Hungary,Hungary,Hungarian,No,Unemployed (and job seeking)
1,68d17cc5d023588e9713128a,6108da57e362f96a3ee32a88,APPROVED,Not Applicable,2025-09-22T16:52:05.563000Z,2025-09-22T17:12:47.769000Z,2025-09-23T04:30:22.304000Z,2025-09-22T17:12:48.327157Z,1243.0,C1KD1A87,83,27,Female,White,Romania,Hungary,Hungary,Hungarian,No,Full-Time
2,68d18ff8bfe13459d8e02fc7,5dade76a4860f70017f70ec5,APPROVED,Not Applicable,2025-09-22T18:05:51.705000Z,2025-09-22T18:24:29.813000Z,2025-09-23T04:30:23.966000Z,2025-09-22T18:24:30.846150Z,1119.0,C1KD1A87,677,23,Male,White,Hungary,Hungary,Hungary,Hungarian,Yes,DATA_EXPIRED
3,68d137139fe16f73622b2536,61717173748006894b2b54ff,RETURNED,Not Applicable,,,,,,,1885,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED
4,68d138b03c80804ac7f2915d,5c5b333a2b7de10001b78759,RETURNED,Not Applicable,2025-09-22T11:53:20.281000Z,,,,,,403,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED,CONSENT_REVOKED


In [70]:
participant_to_remove = ["641379405684937e6fad9f1b", "642b35c70771761602e9c3ae", "5f046bf88d2c186cc10c6ad0"]
raw = raw[~raw["Participant id"].isin(participant_to_remove)].reset_index(drop=True)

In [71]:
accepted = (
    raw[raw["Status"].str.upper() == "APPROVED"]
    .assign(Age=lambda df: pd.to_numeric(df["Age"], errors="coerce"))
    .sort_values("Completed at")
    .drop_duplicates(subset="Participant id", keep="last")
    .reset_index(drop=True)
 )
accepted[["Participant id", "Status", "Age", "Sex"]].head()

Unnamed: 0,Participant id,Status,Age,Sex
0,60d26e7cd9f0761e4d12b9f8,APPROVED,25,Female
1,6151b07ac0d164fdd7e53100,APPROVED,24,Male
2,5f338ba6ea047119dbd6e49e,APPROVED,30,Male
3,6154d933a58bf7bcd9e81fed,APPROVED,38,Female
4,5d4fe6a2ffbcf800019d5e54,APPROVED,59,Female


In [72]:
age_summary = accepted["Age"].describe().to_frame().T
gender_summary = (
    accepted["Sex"]
    .str.title()
    .value_counts(dropna=False)
    .rename_axis("Sex")
    .to_frame("count")
 )
gender_summary["percent"] = (gender_summary["count"] / gender_summary["count"].sum() * 100).round(1)

total_participants = len(accepted)
age_summary, gender_summary, total_participants

(     count   mean        std   min    25%   50%    75%   max
 Age   50.0  35.32  10.304427  20.0  26.25  34.0  40.75  61.0,
                    count  percent
 Sex                              
 Female                32     64.0
 Male                  17     34.0
 Prefer Not To Say      1      2.0,
 50)

In [73]:
mean_age = age_summary.loc["Age", "mean"]
std_age = age_summary.loc["Age", "std"]
min_age = age_summary.loc["Age", "min"]
max_age = age_summary.loc["Age", "max"]

gender_parts = ", ".join(
    f"{sex.lower()} {row['percent']:.1f}% (n={row['count']})"
    for sex, row in gender_summary.iterrows()
)

summary_note = (
    f"The analytic sample comprised {total_participants} approved Prolific participants "
    f"(M_age = {mean_age:.1f} years, SD = {std_age:.1f}, range = {min_age:.0f}–{max_age:.0f}). "
    f"Gender composition was {gender_parts}."
)
print(summary_note)

The analytic sample comprised 50 approved Prolific participants (M_age = 35.3 years, SD = 10.3, range = 20–61). Gender composition was female 64.0% (n=32.0), male 34.0% (n=17.0), prefer not to say 2.0% (n=1.0).


## Plotly donut charts
Interactive visuals to quickly scan gender distribution and age-band composition among accepted participants.

In [74]:
gender_fig = px.pie(
    gender_summary.reset_index(),
    names="Sex",
    values="count",
    hole=0.5,
    title=f"Gender distribution (accepted participants, n={total_participants})",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
# show label, then count (just above) then percent
gender_fig.update_traces(
    texttemplate="<b>%{label}</b><br>n=%{value}<br>%{percent}",
    # textposition="inside",
    hovertemplate="%{label}: %{value} participants",
)
# Remove fig padding
gender_fig.update_layout(margin=dict(l=0, r=0, t=40, b=0))

# Save fig to plots
gender_fig.write_image("plots/demographics_gender_pie.png", width=1200, height=400, scale=3)

# show fig
gender_fig

In [75]:
age_bins = pd.cut(
    accepted["Age"],
    bins=[18, 25, 30, 35, 40, 50, 60, 120],
    labels=["18-24", "25-29", "30-34", "35-39", "40-49", "50-59", "60+"],
    right=False,
)

# enforce logical ascending order for plotting
age_order = ["18-24", "25-29", "30-34", "35-39", "40-49", "50-59", "60+"]
age_counts = (
    age_bins.value_counts(sort=False)
    .reindex(age_order, fill_value=0)
    .reset_index()
)
age_counts.columns = ["Age band", "count"]

age_fig = px.pie(
    age_counts,
    names="Age band",
    values="count",
    hole=0.5,
    title=f"Age bands (accepted participants, n={total_participants})",
    color_discrete_sequence=px.colors.qualitative.Set3,
    category_orders={"Age band": age_order},
)
age_fig.update_traces(
    texttemplate="<b>%{label}</b><br>n=%{value}<br>%{percent}",
    # textposition="inside",
    hovertemplate="%{label}: %{value} participants",
)

# remove fig padding
age_fig.update_layout(margin=dict(l=0, r=0, t=40, b=0))

# Save fig to plots
age_fig.write_image("plots/demographics_age_pie.png", width=1200, height=400, scale=3)

# show fig
age_fig