# Group Evaluations
This notebook creates a new dataset where each record contains all drawings per subject per evaluation, including the evaluation results. The output is used for the QD-Grouped model.

In [None]:
import sys
from pathlib import Path

from tqdm import tqdm
import pandas as pd

from bananas.utils import images
from bananas.utils.arrays import unique
from bananas.dataset import DataSet, DataType, Feature
from coconuts.learners.convolution import CNNClassifier
from coconuts.learners.transfer_learning import TransferLearningModel
from coconuts.learners.image_classifier import ImageClassifier
from torchvision import models as torchvision_models

# Root path of project relative to this notebook
ROOT = Path('..')

# Put the project's `scripts` directory on the module search path so the
# imports below can be resolved (presumably `datamodels` and `utils` live
# there -- TODO confirm).
sys.path.insert(1, str(ROOT / 'scripts'))
# NOTE(review): wildcard imports hide where names come from; `Box`, used later
# in this notebook, is presumably provided by one of these two modules.
from datamodels import *
from utils import *

### Read subject data from local file

In [None]:
# Load the drawing evaluations table and index it by its `key` column
evaluations_csv = ROOT / 'datasets' / 'drawing_evaluations.csv'
df = pd.read_csv(evaluations_csv).set_index('key')

# Convert non-primitive fields: path columns become Path objects and
# box columns are rebuilt through Box.load
for path_col in ('processed_path', 'image_path', 'template_path'):
    df[path_col] = df[path_col].apply(Path)
for box_col in ('drawing_box', 'template_box'):
    df[box_col] = df[box_col].apply(Box.load)

In [None]:
# Keep only the keys that have at least as many rows as there are expected
# templates. The expected templates are this fixed list; only its length is
# used for the filter below.
tpl_name_list = ['casa', 'circulo', 'cruz', 'cuadrado', 'cubo', 'minimental', 'triangulo']

# Count the rows of every key in one pass instead of slicing the whole frame
# once per key; iterating `df.index.unique()` keeps first-appearance order.
rows_per_key = df.index.value_counts(dropna=False)
min_rows = len(tpl_name_list)
all_keys = [key for key in df.index.unique() if rows_per_key.loc[key] >= min_rows]
df = df.loc[all_keys]

# Remove all unnecessary columns from our dataset
feat_keys = ['processed_path']
group_columns = ['key', 'template_name', 'diagnosis']
df_templates = df.reset_index()[group_columns + feat_keys].groupby('template_name')

# Merge into a single dataset crossing template name and feature:
# one `<feature>_<template>` column per (feature, template) pair, indexed by key
df_features = pd.DataFrame(index=all_keys)
for group_key in df_templates.groups:
    # `indices` holds row *positions*, which is what `.iloc` expects; the
    # reset-index frame and `df` share the same row order, so the positions
    # are valid for `df` as well.
    positions = df_templates.indices[group_key]
    group_feats = df.iloc[positions][feat_keys].add_suffix('_' + str(group_key))
    df_features = df_features.merge(
        group_feats, left_index=True, right_index=True, how='outer')

# Add the diagnosis target feature back to the dataframe, taking the value
# from the first row seen for each key
df_features['diagnosis'] = df.loc[~df.index.duplicated(keep='first'), 'diagnosis']

# Save it to local file. Rows with any missing value are dropped from the
# saved file only; the in-memory `df_features` still contains them.
df_features.sort_index().dropna().to_csv(ROOT / 'datasets' / 'drawing_evaluations_grouped.csv')

### Display summary stats

In [None]:
# Tally the grouped records per diagnosis value, in order of first appearance
diagnoses = df_features['diagnosis']
n_records = len(df_features)
diagnosis_counts = {diag: sum(diagnoses == diag) for diag in diagnoses.unique()}

# 'Percent' is stored as a fraction of all records (count / total), not scaled by 100
summary_stats = [
    {'Diagnosis': diag, 'Count': count, 'Percent': count / n_records}
    for diag, count in diagnosis_counts.items()
]

pd.DataFrame.from_records(summary_stats).set_index('Diagnosis')