In [1]:
import json
import os
import re
import sys
import warnings
from pathlib import Path
from typing import Any, Dict, Optional, List

import numpy as np
import pandas as pd

# Put the parent directory on sys.path before importing project-local modules
# (presumably where the `scripts` package lives; verify against the repo layout).
sys.path.append(os.path.abspath(".."))

from scripts.get_paths import get_path

# Lift pandas' display limits so frames are shown without truncating
# columns or rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Resolve the project directories, then read the extracted-features table.
paths = get_path()
features_csv = paths.features_2 / "extracted_features.csv"
df = pd.read_csv(features_csv, index_col=False)

### Extract Likert scale scores for the 4 questions

In [3]:
def find_questions_json(folder: Path) -> Optional[Path]:
    """
    Locate a questions file somewhere below `folder`.

    Every `*.json` file under `folder` (searched recursively) whose lowercased
    name ends with '_questions.json' is a candidate. When several candidates
    exist, the smallest path in sort order is returned so the choice is
    deterministic. Returns None when there is no candidate.
    """
    suffix = "_questions.json"
    matches = [
        path
        for path in folder.rglob("*.json")
        if path.is_file() and path.name.lower().endswith(suffix)
    ]

    if not matches:
        return None
    # Smallest path == first element of the sorted candidate list.
    return min(matches)


def extract_a_number_from_filename(path: Path) -> Optional[int]:
    """
    Return the integer that directly precedes '_questions.json' at the end of
    the file name (matched case-insensitively), or None if there is none.

    Examples:
        '15_questions.json'     -> 15
        'abc_15_questions.json' -> 15
        'abc_questions.json'    -> None
    """
    match = re.search(r"(\d+)(?=_questions\.json$)", path.name, flags=re.IGNORECASE)
    if match is None:
        return None
    return int(match.group(1))


def parse_questions(payload: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pull the four question answers out of the payload's 'current' entry.

    A missing or falsy 'current' entry is treated as an empty mapping, so
    every answer that cannot be found comes back as None.
    """
    answers = payload.get("current", {})
    if not answers:
        answers = {}

    keys = ("question_1", "question_2", "question_3", "question_4")
    return {key: answers.get(key) for key in keys}


def build_df(root_path: str) -> pd.DataFrame:
    """
    Collect the question answers of every sub-folder of `root_path`.

    Each direct sub-directory (visited in sorted order) is searched with
    `find_questions_json`; folders without a matching file are skipped.
    The participant id is the number extracted from the file name by
    `extract_a_number_from_filename` (None when the name carries no number).

    Files that cannot be read or decoded, and files whose top-level JSON
    value is not an object, are skipped with a warning rather than silently,
    so participants missing from the result can be traced.

    Parameters
    ----------
    root_path : str or path-like
        Directory containing one sub-folder per participant
        (anything accepted by `pathlib.Path`).

    Returns
    -------
    pd.DataFrame
        Columns: participant_id, question_1 ... question_4; one row per
        successfully parsed questions file.

    Raises
    ------
    FileNotFoundError
        If `root_path` does not exist.
    """
    root = Path(root_path).expanduser().resolve()
    if not root.exists():
        raise FileNotFoundError(f"Root path does not exist: {root}")

    rows: List[Dict[str, Any]] = []

    for sub in sorted(p for p in root.iterdir() if p.is_dir()):
        json_path = find_questions_json(sub)
        if json_path is None:
            # No questions file anywhere below this folder.
            continue

        try:
            payload = json.loads(json_path.read_text(encoding="utf-8"))
        except Exception as exc:
            # Best effort: keep going, but make the dropped file visible.
            warnings.warn(
                f"Skipping unreadable questions file {json_path}: {exc}",
                stacklevel=2,
            )
            continue

        if not isinstance(payload, dict):
            # parse_questions needs a mapping; anything else cannot hold answers.
            warnings.warn(
                f"Skipping {json_path}: top-level JSON value is not an object",
                stacklevel=2,
            )
            continue

        a_number = extract_a_number_from_filename(json_path)
        rows.append({"participant_id": a_number, **parse_questions(payload)})

    return pd.DataFrame(rows, columns=["participant_id", "question_1", "question_2", "question_3", "question_4"])


In [4]:
# One row per participant folder found under the raw-data directory.
df_likert = build_df(root_path=paths.raw)

In [5]:
# Keep the feature rows flagged NT == 1 and list their distinct participant ids.
is_nt = df["NT"] == 1
df_NT = df.loc[is_nt]
nt_participants = df_NT["participant_id"].unique().tolist()

In [6]:
# Restrict the Likert answers to the NT participants; work on a copy so that
# later edits do not touch df_likert.
in_nt_group = df_likert["participant_id"].isin(nt_participants)
df_sub = df_likert.loc[in_nt_group].copy()

In [7]:
question_cols = ["question_1", "question_2", "question_3", "question_4"]


def _strip_checked(value):
    """Drop the 'checked' marker from string answers; pass other values through."""
    return value.replace("checked", "") if isinstance(value, str) else value


# Convert the raw answers to integer scores. Non-string values (already
# converted scores, None/NaN) are left alone by `_strip_checked`, so this cell
# can be re-run safely; the previous `.str.replace` version raised on a second
# run because the columns were no longer strings.
for col in question_cols:
    df_sub[col] = (
        df_sub[col]
        .map(_strip_checked)
        .astype(float)   # use float in case of NaN
        # NOTE(review): missing answers are counted as score 0 below; confirm
        # that 0 is not also a valid answer, otherwise the two are conflated.
        .fillna(0)
        .astype(int)
    )


In [8]:
# Tally, for every question, how many respondents gave each score.
# A score nobody chose for a given question comes out as NaN and is set to 0.
score_counts = df_sub[question_cols].apply(pd.Series.value_counts)
dist_df = score_counts.fillna(0).astype(int).sort_index()

print(dist_df)


   question_1  question_2  question_3  question_4
0           1          14           6           2
1           0           2          11           0
2           3           1           2           2
3           2           2           1           0
4           4           0           1           5
5           4           2           0           6
6           7           0           0           6


In [9]:
# Share of respondents (in %) per score, normalised within each question.
column_totals = dist_df.sum(axis=0)
percent_df = dist_df.div(column_totals, axis="columns").mul(100).round(1)
percent_df

Unnamed: 0,question_1,question_2,question_3,question_4
0,4.8,66.7,28.6,9.5
1,0.0,9.5,52.4,0.0
2,14.3,4.8,9.5,9.5
3,9.5,9.5,4.8,0.0
4,19.0,0.0,4.8,23.8
5,19.0,9.5,0.0,28.6
6,33.3,0.0,0.0,28.6


In [10]:
# Median and IQR
def summary_median_iqr_extremes(
    dist_df: pd.DataFrame,
    scale_min: Optional[int] = None,
    scale_max: Optional[int] = None,
) -> pd.DataFrame:
    """
    Summarise per-question score distributions.

    Parameters
    ----------
    dist_df : pd.DataFrame
        Count table: index = integer score values, one column per question,
        cells = number of responses with that score.
    scale_min, scale_max : int, optional
        Scores to report as the lower / upper extreme. They default to the
        smallest / largest score present in `dist_df.index`; pass the true
        scale ends when an end point may not have been chosen by anyone
        (it would otherwise be absent from the index and misreported).

    Returns
    -------
    pd.DataFrame
        One column per question with rows 'Median', 'IQR' (formatted as
        'q1–q3'), '% at <lower extreme>' and '% at <upper extreme>'.
        A question with no responses gets NaN for every statistic.

    Raises
    ------
    ValueError
        If `dist_df` has no rows and an extreme is left to be inferred
        (min/max of an empty index).
    """
    values = dist_df.index.to_numpy(dtype=int)

    # The extremes do not depend on the column, so resolve them once.
    lower_extreme = values.min() if scale_min is None else scale_min
    upper_extreme = values.max() if scale_max is None else scale_max

    summaries = {}

    for col in dist_df.columns:
        counts = dist_df[col].to_numpy(dtype=int)
        # Rebuild the individual responses from the count table.
        expanded = np.repeat(values, counts)

        if expanded.size == 0:
            # No responses at all: the statistics are undefined.
            median = q1 = q3 = pct_low = pct_high = np.nan
        else:
            median = np.median(expanded)
            q1, q3 = np.percentile(expanded, [25, 75])
            pct_low = (expanded == lower_extreme).mean() * 100
            pct_high = (expanded == upper_extreme).mean() * 100

        summaries[col] = {
            "Median": median,
            "IQR": f"{q1:.1f}–{q3:.1f}",
            f"% at {lower_extreme}": round(pct_low, 1),
            f"% at {upper_extreme}": round(pct_high, 1),
        }

    return pd.DataFrame(summaries)

# Median, IQR and share of responses at the lowest/highest observed score.
summary_df = summary_median_iqr_extremes(dist_df=dist_df)
summary_df


Unnamed: 0,question_1,question_2,question_3,question_4
Median,5.0,0.0,1.0,5.0
IQR,3.0–6.0,0.0–1.0,0.0–1.0,4.0–6.0
% at 0,4.8,66.7,28.6,9.5
% at 6,33.3,0.0,0.0,28.6
