# Extract Evaluations
This notebook reads the TSV file that encodes each patient's diagnosis. The `<patient, evaluation>` key used to match records is derived from the image filename. The notebook also reads the output of the `Process Drawing` notebook and appends the patient diagnosis data to it as additional columns.

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# Root path of project relative to this notebook
ROOT = Path('..')

# Make the modules in <ROOT>/scripts importable. Inserting at index 1 places
# the directory right after the first entry of sys.path.
sys.path.insert(1, str(ROOT / 'scripts'))
# NOTE(review): names used by later cells without an explicit import (e.g.
# Box, PatientRecord) presumably come from these wildcard imports - confirm.
from datamodels import *
from utils import *

### Load image drawing metadata

In [None]:
# Load the processed drawing table and index it by image path
df = pd.read_csv(ROOT / 'datasets' / 'image_processed.csv')
df = df.set_index('image_path')

# Rebuild the non-primitive box fields by passing each stored cell through Box.load
df['drawing_box'] = df['drawing_box'].apply(Box.load)
df['template_box'] = df['template_box'].apply(Box.load)

### Read diagnosis data from local file

In [None]:
# Load the tab-separated evaluation table
ev = pd.read_csv(ROOT / 'datasets' / 'evaluations.tsv', sep='\t')

# Normalise the raw DIAG codes (trim whitespace, upper-case) so the membership
# test below does not depend on how the code was typed
ev['diagnosis'] = ev['DIAG'].map(lambda code: code.strip().upper())

# 1 when the normalised code is one of the listed codes, 0 otherwise
ev['pathological'] = ev['diagnosis'].isin(('DCLNA', 'DCLM', 'DCLA')).astype(int)

# Row key: ID zero-padded to three digits, an underscore, then EV as text
ev['key'] = ev['ID'].map(lambda patient_id: '%03d' % patient_id) + '_' + ev['EV'].map(str)
ev = ev[['diagnosis', 'pathological', 'key']].set_index('key')

# key -> True when the row is NOT flagged pathological. The flag column is read
# directly rather than through iterrows(), which builds a Series for every row.
# NOTE(review): this dict is not read by any other visible cell - confirm
# whether it is still needed.
diag = {key: not flag for key, flag in zip(ev.index, ev['pathological'])}

### Merge evaluation and processed dataset

In [None]:
# Turn image_path back into a regular column and compute each row's key from it
df = df.reset_index()
df['key'] = df['image_path'].apply(lambda image_path: PatientRecord.build_key(Path(image_path)))

# Inner join on the key index: only rows whose key appears in both tables are kept
df = df.set_index('key').merge(ev, left_index=True, right_index=True, how='inner')

# Write the merged table ordered by key (df itself is left in merge order)
df.sort_values('key').to_csv(ROOT / 'datasets' / 'drawing_evaluations.csv')

### Display summary statistics

In [None]:
def percent_str(val: float) -> str:
    """Format a fraction as a percentage string with two decimal places.

    The value is multiplied by 100 and suffixed with ``%``; for example
    ``0.5`` becomes ``'50.00%'``.
    """
    return '{:.2f}%'.format(val * 100)

In [None]:
# One record per distinct template name: row count and share of all rows in df
template_stats = []
for name in df['template_name'].unique():
    # Sum the boolean mask inside pandas instead of iterating it with builtin sum()
    count = int((df['template_name'] == name).sum())
    template_stats.append({
        'Drawing Category': name,
        'Count': count,
        'Percent': percent_str(count / len(df))})

# Naming the columns explicitly keeps set_index() from raising KeyError when
# df is empty and no records were collected
template_stats = pd.DataFrame(
    template_stats,
    columns=['Drawing Category', 'Count', 'Percent'],
).set_index('Drawing Category')

# Bare final expression so the notebook renders the table
template_stats

In [None]:
# One record per distinct diagnosis: row count and share of all rows in df
diagnosis_stats = []
# The loop variable is deliberately not called `diag`, so it does not rebind
# the `diag` mapping created when the evaluations were loaded
for name in df['diagnosis'].unique():
    # Sum the boolean mask inside pandas instead of iterating it with builtin sum()
    count = int((df['diagnosis'] == name).sum())
    diagnosis_stats.append({
        'Diagnosis': name,
        'Count': count,
        'Percent': percent_str(count / len(df))})

# Naming the columns explicitly keeps set_index() from raising KeyError when
# df is empty; the bare final expression is what the notebook renders
pd.DataFrame(
    diagnosis_stats,
    columns=['Diagnosis', 'Count', 'Percent'],
).set_index('Diagnosis')