# Survey Data Dictionary Generator

This notebook lets researchers generate a survey data dictionary by running it cell-by-cell in Jupyter.

> Steps:
> 1. Set `JSON_FILE` below to your survey JSON (e.g., `Yale_Fucito_Young_Adult_Alcohol_-_Live_Study_surveys_and_settings.json`).
> 2. Run the cells in order.
> 3. The notebook will create `survey_summary.md` and `survey_summary.pdf` next to your JSON file.

## (Optional) Install dependencies

In [1]:
# If needed, uncomment and run (%pip installs into this kernel's environment):
# %pip install reportlab pandas matplotlib numpy

## Parameters

In [2]:
# SET THIS TO YOUR INPUT JSON FILEPATH
# (a relative path is resolved against the kernel's current working directory)
JSON_FILE = "data/Yale_Fucito_Young_Adult_Alcohol_-_Live_Study_surveys_and_settings.json"

import os
# Write outputs to the JSON's directory
# (abspath is applied first so OUTDIR is absolute even when JSON_FILE is relative)
OUTDIR = os.path.dirname(os.path.abspath(JSON_FILE))
# Echo the resolved settings so they can be checked before running later cells.
print("JSON_FILE:", JSON_FILE)
print("OUTDIR:", OUTDIR)


JSON_FILE: data/Yale_Fucito_Young_Adult_Alcohol_-_Live_Study_surveys_and_settings.json
OUTDIR: /Users/zhusiyao/Documents/Beiwe Projects/json transform/data


## Imports and constants

In [3]:
import json
import os
import argparse
from datetime import datetime, timedelta
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, HRFlowable
)
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
import pandas as pd

# Weekday labels looked up by the integer day index of a survey's weekly
# `timings` (index 0 is Sunday).
DAY_NAMES = [
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]

# Data-stream names reported in the "Device Settings" table; each name is
# also used as a key (and as a "<name>_" key prefix) into `device_settings`.
STREAMS = [
    'accelerometer',
    'gps',
    'calls',
    'texts',
    'wifi',
    'bluetooth',
    'power_state',
    'ambient_audio',
    'proximity',
    'gyro',
    'magnetometer',
    'devicemotion',
    'reachability',
]


## Helper functions

In [4]:
def seconds_to_hhmm_ampm(sec):
    """Format a seconds-after-midnight offset as a 12-hour clock string.

    Args:
        sec: Number of seconds after midnight (int or float). Offsets of a
            day or more wrap around, because only the hour/minute of the
            resulting datetime is used.

    Returns:
        A string such as ``"9:05 AM"`` or ``"12:00 PM"``, with no leading
        zero on the hour.

    Raises:
        TypeError: If ``sec`` is not a number (raised by ``timedelta``).
    """
    dt = datetime(2000, 1, 1) + timedelta(seconds=sec)
    # Compute the 12-hour value by hand instead of using "%-I": that flag is
    # a platform-specific strftime extension, so relying on it (plus an
    # exception fallback) makes the output depend on the host C library.
    hour12 = dt.hour % 12 or 12
    return f"{hour12}:{dt.strftime('%M %p')}"

def parse_display_if(expr, id_to_num, id_to_answers):
    """Render a survey ``display_if`` expression as human-readable text.

    Args:
        expr: The expression to render. A single-key dict is treated as
            ``{operator: operand}``; a list is rendered element by element;
            anything else is converted with ``str``.
        id_to_num: Mapping from question id to its displayed question number.
            Unknown ids are shown as ``?``.
        id_to_answers: Mapping from question id to its list of answer texts.
            An integer comparison value that is a valid index into that list
            is shown as the answer text; otherwise the raw value is shown.

    Returns:
        The rendered condition string.
    """
    def render(node):
        # Recurse with the same lookup tables.
        return parse_display_if(node, id_to_num, id_to_answers)

    if isinstance(expr, list):
        return ', '.join(render(child) for child in expr)
    if not (isinstance(expr, dict) and len(expr) == 1):
        return str(expr)

    (operator, operand), = expr.items()

    if operator in {'==', '!=', '<=', '<', '>=', '>'}:
        question_id, value = operand
        number = id_to_num.get(question_id, '?')
        choices = id_to_answers.get(question_id, [])
        if isinstance(value, int) and 0 <= value < len(choices):
            shown = choices[value]
        else:
            shown = str(value)
        return f"Question {number} {operator} {shown}"

    if operator == 'not':
        return f"NOT ({render(operand)})"

    if operator in ('and', 'or'):
        glue = ' AND ' if operator == 'and' else ' OR '
        return '(' + glue.join(render(child) for child in operand) + ')'

    # Single-key dict with an operator this function does not recognise.
    return str(expr)

def _first_present(mapping, *keys):
    """Return the first non-None value stored under any of ``keys``, else None."""
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None


def extract_schedule(survey):
    """Collect a survey's weekly, absolute and relative schedule entries.

    Args:
        survey: Survey dict. Reads the optional keys ``timings`` (dict or
            sequence mapping a day index to a list of seconds-after-midnight
            values), ``absolute_timings`` and ``relative_timings``. Missing
            or ``None`` values are treated as empty.

    Returns:
        A ``(weekly, absolute, relative)`` tuple of sorted lists:

        * ``weekly``: ``(day_index, time, description)`` tuples.
        * ``absolute``: ``(datetime_or_None, description)`` tuples. Numeric
          entries are read as timestamps in local time; any other entry is
          kept verbatim via ``str`` with ``None`` as its datetime.
        * ``relative``: ``(days, seconds, description)`` tuples. Entries may
          be dicts (keys ``anchor``/``days``/``seconds`` or ``'0'``/``'1'``/
          ``'2'``) or sequences of at least three items; malformed or
          incomplete entries are skipped.
    """
    weekly, absolute, relative = [], [], []

    timings = survey.get('timings') or {}
    items = timings.items() if isinstance(timings, dict) else enumerate(timings)
    for day_idx, times in items:
        for t in times:
            desc = f"{DAY_NAMES[int(day_idx)]} at {seconds_to_hhmm_ampm(t)}"
            dt = datetime(2000, 1, 1) + timedelta(seconds=t)
            weekly.append((int(day_idx), dt.time(), desc))
    weekly.sort(key=lambda x: (x[0], x[1]))

    for t in survey.get('absolute_timings') or []:
        if isinstance(t, (int, float)):
            dt = datetime.fromtimestamp(t)
            # Build the 12-hour value by hand: the "%-I" strftime flag is
            # platform-specific, and the earlier fallback for it was
            # unreachable because the same call also ran outside the try.
            desc = f"{dt.strftime('%Y-%m-%d')} {dt.hour % 12 or 12}:{dt.strftime('%M %p')}"
            absolute.append((dt, desc))
        else:
            # NOTE(review): non-numeric entries are passed through as text;
            # confirm whether the export uses a structured form here.
            absolute.append((None, str(t)))
    # Entries without a datetime sort first.
    absolute.sort(key=lambda x: x[0] or datetime.min)

    raw = survey.get('relative_timings') or {}
    entries = raw.values() if isinstance(raw, dict) else raw
    for e in entries:
        if isinstance(e, dict):
            # Explicit None checks (not `or`) so that a legitimate 0 --
            # zero days after the anchor, or midnight -- is not discarded.
            anchor = _first_present(e, 'anchor', '0')
            days = _first_present(e, 'days', '1')
            secs = _first_present(e, 'seconds', '2')
        elif isinstance(e, (list, tuple)) and len(e) >= 3:
            anchor, days, secs = e[0], e[1], e[2]
        else:
            continue
        if days is None or secs is None:
            # Incomplete entry: skip it like other malformed entries.
            continue
        desc = f"{days} days after {anchor} at {seconds_to_hhmm_ampm(secs)}"
        relative.append((int(days), secs, desc))
    relative.sort(key=lambda x: (x[0], x[1]))

    return weekly, absolute, relative

In [5]:
def _write_schedule(doc, weekly, absolute, relative, baseline, always):
    """Write the "Schedule:" section for one survey; writes nothing if it has no entries."""
    if not (weekly or absolute or relative):
        return
    doc.write("Schedule:\n\n")
    for title, entries in (("Weekly", weekly), ("Absolute", absolute), ("Relative", relative)):
        if not entries:
            continue
        doc.write(f"{title}:\n")
        # The description is the last element of every schedule tuple.
        for i, entry in enumerate(entries, 1):
            doc.write(f"{i}. {entry[-1]}\n")
        doc.write("\n")
    # A deployment count is only written for surveys that neither recur
    # weekly nor are always available.
    if not weekly and not always:
        count = (1 if baseline else 0) + len(absolute) + len(relative)
        doc.write(f"Total Deployments: {count}\n\n")


def generate_markdown(json_path, md_path):
    """Write a text summary of the surveys in a study JSON file.

    The output starts with an "Active Surveys" section (every survey whose
    ``survey_type`` is not ``'audio_survey'``) listing each survey's
    availability flags, schedule and questions, followed by an
    "Audio Surveys" section when audio surveys exist. Surveys with no
    ``content`` are ignored, and surveys that have no schedule and are
    neither baseline nor always available are skipped.

    Args:
        json_path: Path of the study JSON file; its ``surveys`` list is read.
        md_path: Path of the summary file to create or overwrite.

    Returns:
        None. The summary is written to ``md_path``.
    """
    # Context manager so the input handle is closed promptly.
    with open(json_path) as src:
        data = json.load(src)
    surveys   = [s for s in data.get('surveys', []) if s.get('content')]
    non_audio = [s for s in surveys if s.get('survey_type') != 'audio_survey']
    audio     = [s for s in surveys if s.get('survey_type') == 'audio_survey']

    with open(md_path, 'w') as doc:
        doc.write("Active Surveys\n\n")
        # -- non-audio surveys --
        for survey in non_audio:
            weekly, absolute, relative = extract_schedule(survey)
            baseline = survey.get('trigger_on_first_download', False)
            always   = survey.get('always_available', False)
            if not (weekly or absolute or relative or baseline or always):
                continue

            doc.write(f"Survey Name: {survey.get('name','Unnamed Survey')}\n\n")
            if always:
                doc.write("Survey Always Available\n\n")
            if baseline:
                doc.write("Baseline\n\n")
            _write_schedule(doc, weekly, absolute, relative, baseline, always)

            # Questions: build lookups so display conditions can refer to
            # question numbers and answer texts instead of raw ids/indices.
            items = survey['content']
            id_to_num = {
                it['question_id']: idx
                for idx, it in enumerate(items, 1)
                if it.get('question_id')
            }
            id_to_ans = {
                it['question_id']: [a['text'] for a in (it.get('answers') or [])]
                for it in items if it.get('question_id')
            }

            for idx, it in enumerate(items, 1):
                question_text = it.get('question_text', it.get('prompt', ''))
                # Tolerate an explicit null question text instead of crashing.
                question_text = '' if question_text is None else str(question_text)
                doc.write(f"Question {idx}\n{question_text.strip()}\n")
                doc.write(f"- Type: {it.get('question_type','')}\n")
                if it.get('question_type') == 'slider':
                    doc.write(f"- Range: {it.get('min','')} to {it.get('max','')}\n")
                if it.get('answers'):
                    opts = '; '.join(a['text'] for a in it['answers'])
                    doc.write(f"- Options: {opts}\n")
                if it.get('display_if'):
                    cond = parse_display_if(it['display_if'], id_to_num, id_to_ans)
                    doc.write(f"- Display If: {cond}\n")
                doc.write("\n")

            doc.write("---\n\n")

        # -- audio surveys --
        if audio:
            doc.write("Audio Surveys\n\n")
            for survey in audio:
                weekly, absolute, relative = extract_schedule(survey)
                baseline = survey.get('trigger_on_first_download', False)
                always   = survey.get('always_available', False)
                if not (weekly or absolute or relative or baseline or always):
                    continue

                doc.write(f"Survey Name: {survey.get('name','Unnamed Survey')}\n\n")
                # Only the first content item's prompt is reported.
                prompt = survey['content'][0].get('prompt','')
                doc.write(f"Prompt: {prompt}\n\n")
                if always:
                    doc.write("Survey Always Available\n\n")
                if baseline:
                    doc.write("Baseline\n\n")
                _write_schedule(doc, weekly, absolute, relative, baseline, always)

                doc.write("---\n\n")

def generate_pdf(md_path, pdf_path, json_path):
    """Render the survey summary and a device-settings table to a PDF.

    Each line of the summary file is mapped to a flowable by its prefix:
    section titles and survey/question headings become headings, schedule
    group labels and the prompt label are bolded, ``---`` becomes a
    horizontal rule, blank lines become spacers and everything else is body
    text. A "Device Settings" table built from the JSON file's
    ``device_settings`` entry is appended at the end.

    Args:
        md_path: Path of the summary text file to read.
        pdf_path: Path of the PDF to create or overwrite.
        json_path: Path of the study JSON file supplying ``device_settings``.

    Returns:
        None. The PDF is written to ``pdf_path``.
    """
    # Function-scope import so this block does not depend on a change to
    # the notebook's import cell.
    from xml.sax.saxutils import escape

    with open(md_path) as md_file:
        lines = md_file.read().splitlines()
    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()
    styles['Heading1'].fontSize, styles['Heading1'].leading = 14, 16
    styles['Heading2'].fontSize, styles['Heading2'].leading = 12, 14
    styles['Normal'].fontSize, styles['Normal'].leading = 8, 10

    story = []
    for text in lines:
        if not text:
            story.append(Spacer(1, 6))
            continue
        # Paragraph parses its text as XML-like markup, so survey text
        # containing <, > or & (e.g. "Question 2 <= 3") must be escaped.
        safe = escape(text)
        if text in ("Active Surveys", "Audio Surveys"):
            story.append(Paragraph(safe, styles['Heading1']))
        elif text.startswith("Survey Name:"):
            story.append(Paragraph(safe, styles['Heading2']))
        elif text.startswith("Prompt:"):
            # 8 == len("Prompt: "); only the label is bolded.
            story.append(Paragraph(f"<b>Prompt:</b> {escape(text[8:])}", styles['Normal']))
        elif text == "Schedule:":
            story.append(Paragraph("Schedule", styles['Heading2']))
        elif text.startswith(("Weekly:", "Absolute:", "Relative:")):
            story.append(Paragraph(f"<b>{safe}</b>", styles['Normal']))
        elif text.startswith("Question "):
            story.append(Paragraph(safe, styles['Heading2']))
        elif text == "---":
            story.append(Spacer(1, 6))
            story.append(HRFlowable(width="100%", thickness=1, color=colors.grey))
            story.append(Spacer(1, 6))
        else:
            # Availability flags, numbered schedule entries, deployment
            # totals, "- " detail lines and question text are all body text.
            story.append(Paragraph(safe, styles['Normal']))

    # Device settings table
    with open(json_path) as json_file:
        data = json.load(json_file)
    ds = data.get('device_settings', {})
    records = []
    for s in STREAMS:
        rec = {'data_stream': s, 'enabled': ds.get(s, False)}
        # Settings named "<stream>_<option>" become per-stream columns.
        for k, v in ds.items():
            if k.startswith(f"{s}_"):
                rec[k[len(s)+1:]] = v
        records.append(rec)

    df = pd.DataFrame(records)
    cols = list(df.columns)
    # Header row followed by one stringified row per stream.
    table_data = [cols] + df[cols].astype(str).values.tolist()

    table = Table(table_data, repeatRows=1)
    table.setStyle(TableStyle([
        ('FONTNAME',  (0,0), (-1,-1), 'Helvetica'),
        ('FONTSIZE',  (0,0), (-1,-1),  6),
        ('BACKGROUND',(0,0), (-1, 0),  colors.lightgrey),
        ('GRID',      (0,0), (-1,-1), 0.25, colors.black),
        ('PADDING',   (0,0), (-1,-1),  2),
    ]))

    story.append(Spacer(1, 12))
    story.append(Paragraph("Device Settings", styles['Heading1']))
    story.append(Spacer(1, 6))
    story.append(table)

    doc.build(story)


## Run

In [6]:
# Both outputs are written to OUTDIR, the directory of the input JSON
# (set in the Parameters cell).
md_path  = os.path.join(OUTDIR, 'survey_summary.md')
pdf_path = os.path.join(OUTDIR, 'survey_summary.pdf')

# The PDF is rendered from the summary file written by the first call, so
# the order of these two calls matters.
generate_markdown(JSON_FILE, md_path)
generate_pdf(md_path, pdf_path, JSON_FILE)

# Report where the generated files were written.
print(f"Generated Markdown at: {md_path}")
print(f"Generated PDF at: {pdf_path}")

Generated Markdown at: /Users/zhusiyao/Documents/Beiwe Projects/json transform/data/survey_summary.md
Generated PDF at: /Users/zhusiyao/Documents/Beiwe Projects/json transform/data/survey_summary.pdf
