In [1]:
%matplotlib inline

import sys
from pathlib import Path

from tqdm import tqdm_notebook as tqdm
import pandas as pd

from bananas.utils import images
from bananas.utils.arrays import unique
from bananas.dataset import DataSet, DataType, Feature
from bulbasaur.learners.convolution import CNNClassifier
from bulbasaur.learners.transfer_learning import TransferLearningModel
from bulbasaur.learners.image_classifier import ImageClassifier
from torchvision import models as torchvision_models

# Root path of project relative to this notebook
ROOT = Path('..')

# Make the project's `scripts` directory importable. Inserting at position 1
# leaves the first sys.path entry ahead of it in the lookup order.
sys.path.insert(1, str(ROOT / 'scripts'))
# NOTE(review): these wildcard imports hide where names come from. `Box`, used
# below when loading the subject data, presumably comes from one of these two
# modules — confirm, and consider importing the needed names explicitly.
from datamodels import *
from utils import *

### Read subject data from local file

In [2]:
# Load the per-drawing table and label its rows by the `key` column
df = pd.read_csv(ROOT / 'datasets' / 'subject_diagnosis.csv').set_index('key')

# Convert non-primitive fields back from their CSV text form:
# path columns become Path objects, box columns go through Box.load
for path_col in ('processed_path', 'image_path', 'template_path'):
    df[path_col] = df[path_col].apply(Path)
for box_col in ('drawing_box', 'template_box'):
    df[box_col] = df[box_col].apply(Box.load)

df.head()

Unnamed: 0_level_0,image_path,template_name,template_path,template_box,drawing_box,processed_path,diagnosis,pathological
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
002_1,drawings/CASA/casaPsic_002Ev1.pdf_pg-18.jpg,casa,templates/casa.png,19384359281,36123592317,processed/casaPsic_002Ev1.pdf_pg-18.jpg,SANO,0
002_1,drawings/CIRCULO/circuloPsic_002Ev1.pdf_pg-17.jpg,circulo,templates/circulo.png,22346331154,1629178186,processed/circuloPsic_002Ev1.pdf_pg-17.jpg,SANO,0
002_1,drawings/MINIMENTAL/minimentalPsic_002Ev1.pdf_...,minimental,templates/minimental.png,1618128110,2150373113,processed/minimentalPsic_002Ev1.pdf_pg-3.jpg,SANO,0
002_1,drawings/PICO/picoPsic_002Ev1.pdf_pg-16.jpg,pico,templates/pico.png,131104427152,45183489255,processed/picoPsic_002Ev1.pdf_pg-16.jpg,SANO,0
002_1,drawings/CRUZ/cruzPsic_002Ev1.pdf_pg-17.jpg,cruz,templates/cruz.png,2134363195,3790591220,processed/cruzPsic_002Ev1.pdf_pg-17.jpg,SANO,0


In [3]:
# Templates used to size the completeness check below. The list is pinned by
# hand; templates present in the data but not listed here still get their own
# feature column and still take part in the NaN filtering further down.
tpl_name_list = ['casa', 'circulo', 'cruz', 'cuadrado', 'cubo', 'minimental', 'triangulo']

# Keep only keys that have at least as many rows as there are pinned templates.
# Rows are counted once per key (groups kept in order of first appearance)
# instead of doing one label lookup over the whole frame for every key.
rows_per_key = df.groupby(level=0, sort=False).size()
all_keys = rows_per_key[rows_per_key >= len(tpl_name_list)].index.tolist()
df = df.loc[all_keys]

# Remove all unnecessary columns from our dataset and group rows by template
feat_keys = ['processed_path']
group_columns = ['key', 'template_name', 'diagnosis']
df_templates = df.reset_index()[group_columns + feat_keys].groupby('template_name')

# Merge into a single dataset crossing template name and feature: rows are
# labelled by key and each template contributes `<feature>_<template>` columns.
# Each group is re-labelled by its own `key` column, so the rows are matched by
# label rather than by their position in `df`.
df_features = pd.DataFrame(index=all_keys)
for tpl_name, tpl_rows in df_templates:
    group_feats = tpl_rows.set_index('key')[feat_keys].add_suffix('_%s' % tpl_name)
    df_features = pd.merge(
        df_features, group_feats, left_index=True, right_index=True, how='outer')

# Normalize all feature columns: drop keys missing any template, then store
# every path as a string prefixed with ROOT
df_features = df_features.dropna()
for col in df_features.columns:
    df_features[col] = df_features[col].map(lambda p: str(ROOT / p))

# Add the diagnosis target feature back to the dataframe, taking the value from
# the first row of each key (assignment aligns on the key index)
df_features['diagnosis'] = df.loc[~df.index.duplicated(keep='first'), 'diagnosis']

# Save it to local file
df_features.sort_index().to_csv(ROOT / 'datasets' / 'subject_drawings_grouped.csv')

df_features.head()

Unnamed: 0,processed_path_casa,processed_path_circulo,processed_path_cruz,processed_path_cuadrado,processed_path_cubo,processed_path_minimental,processed_path_muelle,processed_path_pico,processed_path_triangulo,diagnosis
002_1,../processed/casaPsic_002Ev1.pdf_pg-18.jpg,../processed/circuloPsic_002Ev1.pdf_pg-17.jpg,../processed/cruzPsic_002Ev1.pdf_pg-17.jpg,../processed/cuadradoPsic_002Ev1.pdf_pg-17.jpg,../processed/cuboPsic_002Ev1.pdf_pg-18.jpg,../processed/minimentalPsic_002Ev1.pdf_pg-3.jpg,../processed/muellePsic_002Ev1.pdf_pg-16.jpg,../processed/picoPsic_002Ev1.pdf_pg-16.jpg,../processed/trianguloPsic_002Ev1.pdf_pg-17.jpg,SANO
002_2,../processed/casaPsic_002Ev2.pdf_pg-10.jpg,../processed/circuloPsic_002Ev2.pdf_pg-9.jpg,../processed/cruzPsic_002Ev2.pdf_pg-9.jpg,../processed/cuadradoPsic_002Ev2.pdf_pg-9.jpg,../processed/cuboPsic_002Ev2.pdf_pg-10.jpg,../processed/minimentalPsic_002Ev2.pdf_pg-3.jpg,../processed/muellePsic_002Ev2.pdf_pg-8.jpg,../processed/picoPsic_002Ev2.pdf_pg-8.jpg,../processed/trianguloPsic_002Ev2.pdf_pg-9.jpg,SANO
003_1,../processed/casaPsic_003Ev1.pdf_pg-16.jpg,../processed/circuloPsic_003Ev1.pdf_pg-14.jpg,../processed/cruzPsic_003Ev1.pdf_pg-14.jpg,../processed/cuadradoPsic_003Ev1.pdf_pg-14.jpg,../processed/cuboPsic_003Ev1.pdf_pg-16.jpg,../processed/minimentalPsic_003Ev1.pdf_pg-5.jpg,../processed/muellePsic_003Ev1.pdf_pg-15.jpg,../processed/picoPsic_003Ev1.pdf_pg-15.jpg,../processed/trianguloPsic_003Ev1.pdf_pg-14.jpg,SANO
003_3,../processed/casaPsic_003Ev3.pdf_pg-16.jpg,../processed/circuloPsic_003Ev3.pdf_pg-14.jpg,../processed/cruzPsic_003Ev3.pdf_pg-14.jpg,../processed/cuadradoPsic_003Ev3.pdf_pg-14.jpg,../processed/cuboPsic_003Ev3.pdf_pg-16.jpg,../processed/minimentalPsic_003Ev3.pdf_pg-4.jpg,../processed/muellePsic_003Ev3.pdf_pg-18.jpg,../processed/picoPsic_003Ev3.pdf_pg-18.jpg,../processed/trianguloPsic_003Ev3.pdf_pg-14.jpg,SANO
004_1,../processed/casaPsic_004Ev1.pdf_pg-9.jpg,../processed/circuloPsic_004Ev1.pdf_pg-8.jpg,../processed/cruzPsic_004Ev1.pdf_pg-8.jpg,../processed/cuadradoPsic_004Ev1.pdf_pg-8.jpg,../processed/cuboPsic_004Ev1.pdf_pg-9.jpg,../processed/minimentalPsic_004Ev1.pdf_pg-23.jpg,../processed/muellePsic_004Ev1.pdf_pg-7.jpg,../processed/picoPsic_004Ev1.pdf_pg-7.jpg,../processed/trianguloPsic_004Ev1.pdf_pg-8.jpg,SANO


### Display summary stats

In [4]:
# For every diagnosis label (in order of first appearance) record how many rows
# carry it and which fraction of the feature table that represents
diagnosis_col = df_features['diagnosis']
total_rows = len(df_features)
label_counts = [
    (label, int((diagnosis_col == label).sum()))
    for label in diagnosis_col.unique()
]

summary_stats = pd.DataFrame.from_records([
    {'Diagnosis': label, 'Count': n_rows, 'Percent': n_rows / total_rows}
    for label, n_rows in label_counts
]).set_index('Diagnosis')
summary_stats

Unnamed: 0_level_0,Count,Percent
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
SANO,198,0.533693
DCLNA,92,0.247978
DCLM,74,0.199461
DCLA,3,0.008086
BAJA,3,0.008086
BAJA EA,1,0.002695
