# Generate configuration using the `revisitpy` package

Source: https://revisit.dev/docs/revisitpy/

Also: 
- https://github.com/revisit-studies/revisitpy-examples
- https://github.com/revisit-studies/revisitpy

In [1]:
import revisitpy as rvt
import pandas as pd 
import polars as pl
from datetime import date 
import json
# import revisitpy_server as rvt_server

  class StudyConfig(BaseModel):


In [2]:
# confirm working directory 
# (the relative paths used further down, e.g. 'public/...' and 'data.csv', resolve against it)

import os 
os.getcwd()

'/Users/shenglong/Downloads/study'

A revisit study has the following components in its `config.json`: [original link](https://revisit.dev/docs/typedoc/interfaces/StudyConfig/#importedlibraries)  

- `$schema`: ...
- `studyMetadata`: ...
- `uiConfig`: ...
- `importedLibraries`: ...
- `components`: ...
- `sequence`: ...

---

Some other things: 

- [`dataclasses`](https://docs.python.org/3/library/dataclasses.html): Python standard-library module for declaring classes that mainly hold data
- [`dataclasses.asdict`](https://docs.python.org/3/library/dataclasses.html#dataclasses.asdict): converts a dataclass instance to a dict

`revisitpy` dynamically creates a dataclass named `DataRow`, with one field per data header: [link](https://github.com/revisit-studies/revisitpy/blob/51414e51d4c1f9c1f66b3f9c642c3c40a60138fc/src/revisitpy/revisitpy.py#L559)

```python
# Create a data class with attributes based on the headers
        DataRow = make_dataclass("DataRow", [(header, Any) for header in headers])
```

## Metadata 

- Example here: https://revisit.dev/docs/revisitpy/examples/example_jnd_study/ 

In [None]:
# Study metadata: who runs the study, what it is called, and when this config was generated.
metadata_fields = {
    "authors": ["Sheng Long"],
    "organizations": ["Northwestern University"],
    "title": 'Retrieve Value Judgment Study',
    "description": '',
    "date": date.today().isoformat(),  # YYYY-MM-DD
    "version": '1.0',
}
study_metadata = rvt.studyMetadata(**metadata_fields)

# UI configuration: contact details, logo, and layout options for the study pages.
ui_fields = {
    "contactEmail": "shenglong@u.northwestern.edu",
    "logoPath": "assets/revisitLogoSquare.svg",
    "withSidebar": True,
    "withProgressBar": False,
    "nextOnEnter": True,
    "minWidthSize": 800,
    "minHeightSize": 800,
}
ui_config = rvt.uiConfig(**ui_fields)

# Uncomment to inspect the generated objects:
# print(study_metadata)
# print(ui_config)

In [4]:
# Load the external stimulus table and prepend a 1-based `id` column.
ret_df = pl.read_parquet(
    'public/vis-decode-retrieve-value/encqa_v1_ret.parquet'
).with_row_index("id", offset=1)
ret_df.head()  # NOTE(review): not the last expression in the cell, so presumably not displayed

# Print the chart spec stored in the first row.
first_chart_spec = ret_df.head(1).select(pl.col('chart_spec')).item()
print(first_chart_spec)

{"config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": false}, "bar": {"color": "gray"}}, "data": {"name": "data-9a1c73a21da12f2e668e101efeeb1bbd"}, "mark": {"type": "point", "color": "gray", "filled": true, "size": 300}, "encoding": {"x": {"axis": {"labelAngle": 0, "title": null}, "field": "cat", "type": "nominal"}, "y": {"aggregate": "mean", "axis": {"title": "Var"}, "field": "var1", "type": "quantitative"}}, "height": 400, "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v5.17.0.json", "datasets": {"data-9a1c73a21da12f2e668e101efeeb1bbd": [{"cat": "C", "var1": 94.53337893746139, "var2": 45.7573094474521}, {"cat": "E", "var1": 65.92327500197413, "var2": 53.03417406992416}, {"cat": "D", "var1": 52.33420881465709, "var2": 54.67417687172758}, {"cat": "B", "var1": 56.60850125771311, "var2": 43.262914732152254}, {"cat": "A", "var1": 40.84536899383492, "var2": 56.02464150248745}, {"cat": "E", "var1": 49.95558389337083, "var2": 35.4335154793

In [19]:
# Peek at the first row of the "compute_derived_value_exact" task.
exact_task_rows = ret_df.filter(pl.col("task") == "compute_derived_value_exact")
exact_task_rows.select(["chart_spec"]).head(1).item()
exact_task_rows.head(1)

id,image_path,question,true_label,options,task,task_details,encoding,variable_type,answer_type,num_marks,num_categories,chart_spec,image,split,canary_guid
u32,str,str,str,list[str],str,str,str,str,str,i8,i8,str,struct[2],str,str
101,"""synthetic_data/images/compute_…","""What is the average value of V…","""50.13""",,"""compute_derived_value_exact""","""{""subtask"": ""identify""}""","""position""","""quantitative""","""numeric""",5,5,"""{""config"": {""view"": {""continuo…","{b""\x89PNG\x0d\x0a\x1a\x0a\x00\x00\x00\x0dIHDR\x00\x00\x01\xc1\x00\x00\x01\xb0\x08\x02\x00\x00\x00^\xa7I\xdb\x00\x00\x19\x89IDATx\x9c\xed\xdd1P\xdbX\xc2\xc0q\xf1Mf\xec\x99+\x10\x15""…,null}","""synthetic_data""","""826ba6c6-5e2d-4b59-8684-77e497…"


## Generate component related 

In [None]:
# Introduction page: a markdown component backed by the study's introduction.md asset.
introduction = rvt.component(
    type='markdown',
    path='vis-decode-retrieve-value/assets/introduction.md',
    component_name__='introduction',
)
print(introduction)

# Fixed-order sequence containing only the introduction.
intro_seq = rvt.sequence(order='fixed', components=[introduction])
print(intro_seq)

In [None]:
# # rvt.sequence(order = 'random').from_data([(1, 2), (2, 3)])
# # asdict([1, 23])
# new_df = pd.DataFrame({"id": [i + 1 for i in range(10)]}, {"val": [i + 1 for i in range(10)]})
# print(new_df)
# new_df.to_csv('data.csv', index=False)
# print(rvt.data("data.csv"))

In [None]:
# Required numeric answer field placed below the stimulus; reused by every trial component.
response = rvt.response(
    id="retrieve_value",
    prompt='Your selected answer',
    location='belowStimulus',
    type='numerical',
    required=True,
)

In [None]:
n_rows = 25

# One CSV row per trial id (1..n_rows); the file is read back through rvt.data below.
new_df = pd.DataFrame({"id": range(1, n_rows + 1)})
new_df.to_csv('data.csv', index=False)

# Randomly ordered sequence of n_rows samples, populated from the CSV rows.
random_sequence = rvt.sequence(order='random', numSamples=n_rows)
data_sequence = random_sequence.from_data(rvt.data("data.csv"))
print(data_sequence)

In [None]:
def retrieve_value_component_function(id):
    """
    Build the Vega component for one trial of the retrieve value study.

    The row of `ret_df` whose `id` column equals `id` supplies both the chart
    spec (JSON text, parsed into the component config) and the question text
    (shown as the instruction below the stimulus). The shared `response`
    object is attached as the component's only response.

    NOTE(review): the parameter name `id` shadows the builtin; it is kept
    because it presumably has to match the data column name -- confirm
    against the revisitpy `sequence.component` docs before renaming.
    """
    matching_row = ret_df.filter(pl.col('id') == id)
    spec_json = matching_row.select(pl.col('chart_spec')).item()
    prompt_text = matching_row.select(pl.col('question')).item()

    # Keyword arguments are collected first so the component call stays short.
    component_kwargs = dict(
        component_name__=f'retrieve_value_{id}',
        type='vega',
        response=[response],
        config=json.loads(spec_json),
        instruction=f'{prompt_text}',
        instructionLocation='belowStimulus',
        withSidebar=False,
    )
    return rvt.component(**component_kwargs)

In [None]:
# Register the component builder on the data-driven sequence.
# NOTE(review): presumably revisitpy calls it once per data row -- confirm against the revisitpy docs.
data_sequence.component(retrieve_value_component_function)

In [None]:
# Inspect the sequence again now that the component function has been attached.
print(data_sequence)
# print(data_sequence.get_components()[0])

In [None]:
# Full study order: the fixed introduction first, then the data-driven trials.
intro_block = rvt.sequence(order='fixed', components=[introduction])
main_sequence = intro_block + data_sequence

# Assemble the top-level study configuration.
study = rvt.studyConfig(
    schema="https://raw.githubusercontent.com/revisit-studies/study/v2.3.1/src/parser/StudyConfigSchema.json",
    uiConfig=ui_config,
    studyMetadata=study_metadata,
    sequence=main_sequence,
    importedLibraries=['virtual-chinrest'],
)
print(study)

In [None]:
# code if we are to use the rvt_server 

# process = rvt_server.serve()
# process.terminate()
# w = rvt.widget(study, server = True)

In [None]:
# String form of the study; this is the same text that is written to config.json below.
str(study)
# print(study)

## Save study 

In [None]:
# Save the study configuration: the string form of `study` is written verbatim as config.json.
config_path = 'public/vis-decode-retrieve-value/config.json'
with open(config_path, mode='w', encoding='utf-8') as config_file:
    config_file.write(str(study))

# Generating random charts with Altair and Poisson disk sampling? 

Based on _The Weighted Average Illusion:  Biases in Perceived Mean Position in Scatterplots_, the authors used the following: 

> To generate the x- and y-data, we used Poisson disk sampling [50] to produce 30 uniquely distributed point grids, with minimum distance between the boundaries of any two points set at 8 pixels. This methodology is similar to Gleicher et al. [34]. Each dataset always contained 30 marks, with the number of points selected in piloting.

In [133]:
import altair as alt 
import polars as pl
from scipy.stats import qmc # quasi monte carlo submodule 
import numpy as np

Note that the QMC engines only provide an $n \times d$ array of numbers in $[0, 1]$. 
([source](https://docs.scipy.org/doc/scipy/reference/stats.qmc.html))

Let's first set the minimum distance between points (the `radius` argument) to $r = 0.08$: 

In [134]:
# Poisson disk sampling 
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.qmc.PoissonDisk.html

rng = np.random.default_rng()
engine = qmc.PoissonDisk(d=2, radius = 0.08, rng = None)
sample = engine.random(30).round(2) # round this to 2 

In [135]:
# turn sample into df for plotting
# (no column names are given, hence the `column_0` / `column_1` names used in the plots below)
df = pl.DataFrame(sample)
df.head(5)

column_0,column_1
f64,f64
0.11,0.96
0.26,1.0
0.05,0.89
0.13,0.87
0.21,0.89


In [136]:
# Scatter plot of the raw sample, with both axes fixed to the unit interval.
(
    alt.Chart(df)
    .mark_point(filled=True)
    .encode(
        x=alt.X('column_0').scale(domain=(0, 1)),
        y=alt.Y('column_1').scale(domain=(0, 1)),
    )
    .properties()  # default chart size; a 500 x 500 override was tried and left disabled
)

Let us scale things from $[0, 1]$ to $[0, 500]$ by directly manipulating the `sample`: 

In [137]:
# Rescale the sample from [0, 1] to [0, 500] before plotting.
df = pl.DataFrame(sample * 500)

# Filled circles whose `size` is the area of a radius-12 circle, on a 400 x 400 chart
# with both axes fixed to [0, 500].
(
    alt.Chart(df)
    .mark_circle(filled=True, size=np.pi * 12 * 12)
    .encode(
        x=alt.X('column_0').scale(domain=(0, 500)),
        y=alt.Y('column_1').scale(domain=(0, 500)),
    )
    .properties(width=400, height=400)
)

### A note on units

`Vega-altair`'s default unit is **pixels**. And from this [website](https://altair-viz.github.io/altair-viz-v4/user_guide/marks.html#:~:text=%2D%20For%20point%20/%20circle%20/%20square,to%20null%20to%20remove%20stroke.), the default `size` for `mark_point` and `mark_circle` is the pixel area, which is 30. So this means the radius is about 3 pixels ...

In [42]:
# radius of a circle whose area equals the default mark size of 30: r = sqrt(area / pi)
(30 / np.pi) ** (1/2)

3.0901936161855166

We might need to "pad" the generated data samples so that the actual points are not that close to the axes ...? 