# Overview

In this notebook, we load m2c2kit data exported from our MongoDB collection titled `warehouse`, parse it into per-task dataframes, deduplicate the trials, and then score, summarize, visualize, and export the results.

## Configure Environment

In [None]:
#!pip install m2c2-datakit
!pip3 install m2c2-datakit

### Import M2C2 DataKit Library (it imports its other dependencies behind the scenes)

In [None]:
# `m2c2` is the alias used for every DataKit call in the rest of this notebook.
import m2c2_datakit as m2c2
# Last expression of the cell, so its return value is shown as the cell output
# (presumably the installed package version, going by the function name).
m2c2.core.utils.get_package_version()

In [None]:
# Timestamp string obtained once per run; it is appended to every exported file name
# in the export cells so the outputs of one run share the same suffix.
ts_fn = m2c2.core.utils.get_filename_timestamp()
print(ts_fn)

## Parameters for Scoring

These parameters should be set separately for each task being scored.

In [None]:
# Group-by keys passed as `grouping=` to the summary step below (can be edited to
# change the aggregation level). Presumably these identify one session of one participant.
grouping_for_aggregation = ["participant_id", "session_uuid", "session_id"]

# Number of trials a complete session is expected to contain. This differs per task:
# the summary cells below pass 20 for Symbol Search and 4 for Grid Memory.
trials_expected = 20

# Lower/upper response-time bounds for outlier filtering — presumably in milliseconds,
# matching the `response_time_duration_ms` column; TODO confirm units.
# NOTE(review): these two values are not referenced by any other cell visible in this notebook.
rt_outlier_low = 500
rt_outlier_high = 10000

## Get and Parse Data

Eventually we will have several data loaders, one per data source:

- mongodb_export - from Nelson (what we have now)
- api_export - from backend API
- metricwire_export - from MW portal
- metricwire_api_export - from MW API

### Load Full JSON file

In [None]:
# Location of the MongoDB export to analyse (relative to this notebook).
mongodb_export_path = '../data/production-mongo-export/data_exported_120424_1010am.json'

# The loader returns four objects: the full dataframe, a collection of dataframes
# looked up by activity name (see the extraction cell below), a validation result,
# and the activity names.
df, grouped_dataframes, validation, activity_names = m2c2.loaders.mongodb.load_mongodb_export(
    mongodb_export_path
)

# Alternative source: a folder of files from a Metricwire Portal export (as of December 2024)
#df, grouped_dataframes, validation, activity_names = m2c2.loaders.metricwire.load_metricwire_export(filepath = "../../data/metricwire/unzipped/*/*/*.json")

print(f"Validation was successful: {validation}")

#### Extract Tasks of Interest

The schema of `df_symbol_search_raw` and `df_grid_memory_raw` will be identical to that of the JSON returned by the new API.

In [14]:
# Pull the per-task dataframes out of the loader's grouping by activity name.
df_symbol_search_raw = grouped_dataframes.get('Symbol Search')
df_grid_memory_raw = grouped_dataframes.get('Grid Memory')

# Fail fast when a task is absent from the export: `.get` returns None for a missing
# activity, which would otherwise only surface later as an obscure error while parsing.
for _task_name, _task_df in (('Symbol Search', df_symbol_search_raw),
                             ('Grid Memory', df_grid_memory_raw)):
    if _task_df is None:
        raise LookupError(
            f"Activity '{_task_name}' was not found in the loaded export; "
            f"available activities: {activity_names}"
        )

#### Drop Duplicates, and Make JSON Data Tabular

In [15]:
# Flatten the nested trial-level records of each task into tabular form, dropping
# duplicates along the way. Both tasks go through the same parser with the same
# options, in the same order as before (Symbol Search first, then Grid Memory).
df_symbol_search_unnested_dedup, df_grid_memory_unnested_dedup = (
    m2c2.core.parse.unnest_trial_level_data(raw_task_df, drop_duplicates=True)
    for raw_task_df in (df_symbol_search_raw, df_grid_memory_raw)
)

### Score Data
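* If the data came from MetricWire, the raw dataframes can be scored directly.
* If the data came from the production server (MongoDB export), run the deduplication and unnesting step above first.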

In [16]:
# Score the deduplicated Symbol Search data with the task-specific scorer.
# The visualization cell below reads `response_time_duration_ms` and `metric_accuracy`
# from this result.
df_symbol_search_scored = m2c2.tasks.symbol_search.score_trials(df_symbol_search_unnested_dedup)

In [17]:
# Score the deduplicated Grid Memory data with the task-specific scorer.
# The visualization cell below reads the `metric_error_distance_*` columns from this result.
df_grid_memory_scored = m2c2.tasks.grid_memory.score_trials(df_grid_memory_unnested_dedup)

In [None]:
# Custom scores (optional; left disabled)
# Template for supplying extra scoring functions as (metric_name, callable) pairs.
# NOTE(review): `my_custom_func_1` / `my_custom_func_2` are placeholders that are not
# defined anywhere in this notebook, and `score_grid_memory_data` differs from the
# `score_trials` entry point used in the scoring cells — confirm the current API name
# before enabling this cell.
# custom_scores = [
#     ("custom_metric_1", my_custom_func_1),
#     ("custom_metric_2", my_custom_func_2),
# ]

# df_grid_memory_scored_custom = m2c2.tasks.grid_memory.score_grid_memory_data(df_grid_memory_unnested_dedup, 
#                                                                              scoring_funcs=custom_scores)

### Summarize

In [None]:
# generic summary function (i.e., group by each participant and calculate summary statistics)
df_symbol_search_summary = m2c2.core.scoring.summarize_data(
                                        # primary arguments
                                        df=df_symbol_search_scored, 
                                        grouping=grouping_for_aggregation, 
                                        summarization_func=m2c2.tasks.symbol_search.summarize, 
                                        
                                        # additional arguments
                                        trials_expected=20)
df_symbol_search_summary

In [None]:
# generic summary function (i.e., group by each participant and calculate summary statistics)
df_grid_memory_summary = m2c2.core.scoring.summarize_data(
                                        # primary arguments
                                        df=df_grid_memory_scored, 
                                        grouping=grouping_for_aggregation, 
                                        summarization_func=m2c2.tasks.grid_memory.summarize, 
                                        
                                        # additional arguments
                                        trials_expected=4)
df_grid_memory_summary

### Visualize

In [None]:
# Symbol Search: plot the distributions of response time and accuracy only.
symbol_search_plot_columns = ['response_time_duration_ms', 'metric_accuracy']
df_symbol_search_filt = df_symbol_search_scored[symbol_search_plot_columns].copy()
# (optional plot arguments left disabled: main_vars=True, exp_vars=False)
m2c2.core.plot.plot_distribution(df_symbol_search_filt)

# Grid Memory: plot the distributions of the three error-distance metrics.
grid_memory_plot_columns = [
    'metric_error_distance_mean',
    'metric_error_distance_sum',
    'metric_error_distance_hausdorff',
]
df_grid_memory_filt = df_grid_memory_scored[grid_memory_plot_columns].copy()
m2c2.core.plot.plot_distribution(df_grid_memory_filt)



## Export Data as CSV

In [None]:
# Export every processing stage of Symbol Search, in pipeline order.
# Each file name carries the stage label and the run timestamp `ts_fn`.
symbol_search_stages = {
    "raw": df_symbol_search_raw,
    "unnested_dedup": df_symbol_search_unnested_dedup,
    "scored": df_symbol_search_scored,
    "summary": df_symbol_search_summary,
}
for stage_label, stage_df in symbol_search_stages.items():
    m2c2.core.export.export_dataframe(
        stage_df,
        f"../output/m2c2_datakit_symbol_search_{stage_label}_{ts_fn}",
    )

In [None]:
# Export dataframes for Grid Memory
m2c2.core.export.export_dataframe(df_grid_memory_raw, 
    f"../output/m2c2_datakit_grid_memory_raw_{ts_fn}")
m2c2.core.export.export_dataframe(df_grid_memory_unnested_dedup, 
    f"../output/m2c2_datakit_grid_memory_unnested_dedup_{ts_fn}")
m2c2.core.export.export_dataframe(df_grid_memory_scored, 
    f"../output/m2c2_datakit_grid_memory_scored_{ts_fn}")
m2c2.core.export.export_dataframe(df_grid_memory_summary, 
    f"../output/m2c2_datakit_grid_memory_summary_{ts_fn}")