# Extract Patient Diagnosis
This notebook reads the TSV file that records each patient's diagnosis per evaluation. The `<patient, evaluation>` key is derived from the image filename. It also takes the output of the `Extract Image Drawing` notebook and appends the diagnosis data to it as extra columns (`diagnosis` and `pathological`).

In [1]:
%matplotlib inline

import sys
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Root path of project relative to this notebook
# (a relative path, so it is resolved against the kernel's working directory)
ROOT = Path('..')

# Put the project's `scripts` directory on the import path so the two local
# modules below can be imported.
sys.path.insert(1, str(ROOT / 'scripts'))
# NOTE(review): star imports hide where names come from. `Box` and
# `PatientRecord`, used in later cells, are presumably provided by these
# modules -- confirm which one and consider importing them explicitly.
from datamodels import *
from utils import *

### Load image drawing metadata

In [2]:
# Load the processed-image metadata, indexed by image path, and rebuild the
# two box columns from their CSV representation with Box.load.
df = (
    pd.read_csv(ROOT / 'datasets' / 'image_processed.csv')
    .set_index('image_path')
    .assign(
        drawing_box=lambda frame: frame['drawing_box'].apply(Box.load),
        template_box=lambda frame: frame['template_box'].apply(Box.load),
    )
)

df.head()

Unnamed: 0_level_0,template_name,template_path,template_box,drawing_box,processed_path
image_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
drawings/CASA/casaPsic_025Ev4.pdf_pg-12.jpg,casa,templates/casa.png,25467420264,368284302,processed/casaPsic_025Ev4.pdf_pg-12.jpg
drawings/CASA/casaPsic_135Ev2.pdf_pg-10.jpg,casa,templates/casa.png,25088416285,329251323,processed/casaPsic_135Ev2.pdf_pg-10.jpg
drawings/CASA/casaPsic_004Ev4.pdf_pg-13.jpg,casa,templates/casa.png,2488414205,142187390444,processed/casaPsic_004Ev4.pdf_pg-13.jpg
drawings/CASA/casaPsic_220Ev3.pdf_pg-8.jpg,casa,templates/casa.png,19790363287,31738565332,processed/casaPsic_220Ev3.pdf_pg-8.jpg
drawings/CASA/casaPsic_029Ev3.pdf_pg-8.jpg,casa,templates/casa.png,264114430311,7450322344,processed/casaPsic_029Ev3.pdf_pg-8.jpg


### Read diagnosis data from local file

In [3]:
# Load the evaluation table (tab-separated); the cell reads its ID, EV and
# DIAG columns.
ev = pd.read_csv(ROOT / 'drawings' / 'evaluations.tsv', sep='\t')

# Normalise the diagnosis labels: trim surrounding whitespace and upper-case.
# The vectorised `.str` accessor leaves missing values as NaN instead of
# raising AttributeError on them.
ev['diagnosis'] = ev['DIAG'].str.strip().str.upper()

# Binary label: 1 for the three diagnoses listed here, 0 for any other label.
ev['pathological'] = ev['diagnosis'].isin(('DCLNA', 'DCLM', 'DCLA')).astype(int)

# Join key '<ID zero-padded to 3 digits>_<EV>', e.g. '002_1'.
ev['key'] = ev['ID'].map(lambda x: '%03d' % x) + '_' + ev['EV'].map(str)

# Keep only the derived columns, indexed by the join key.
ev = ev[['diagnosis', 'pathological', 'key']].set_index('key')
ev.head()

Unnamed: 0_level_0,diagnosis,pathological
key,Unnamed: 1_level_1,Unnamed: 2_level_1
002_1,SANO,0
002_2,SANO,0
002_3,SANO,0
003_1,SANO,0
003_2,SANO,0


### Merge evaluation and processed dataset

In [4]:
# Flatten the index so `image_path` is an ordinary column. Any `key` or
# evaluation columns left over from a previous run of this cell are dropped
# first, so re-running it does not merge `ev` twice (which would produce
# `_x` / `_y` suffixed duplicates). On a first run nothing is dropped.
df = df.reset_index().drop(columns=['key', *ev.columns], errors='ignore')

# Derive the join key from each image's file path.
df['key'] = df['image_path'].apply(lambda x: PatientRecord.build_key(Path(x)))

# Inner join on the key: images whose key has no row in `ev` are dropped.
df = pd.merge(df.set_index('key'), ev, left_index=True, right_index=True, how='inner')

# Export sorted by key. A stable sort keeps rows that share a key in their
# merged order, so the written file is deterministic between runs.
df.sort_index(kind='mergesort').to_csv(ROOT / 'datasets' / 'subject_diagnosis.csv')

df.head()

Unnamed: 0_level_0,image_path,template_name,template_path,template_box,drawing_box,processed_path,diagnosis,pathological
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
002_1,drawings/CASA/casaPsic_002Ev1.pdf_pg-18.jpg,casa,templates/casa.png,19384359281,36123592317,processed/casaPsic_002Ev1.pdf_pg-18.jpg,SANO,0
002_1,drawings/CIRCULO/circuloPsic_002Ev1.pdf_pg-17.jpg,circulo,templates/circulo.png,22346331154,1629178186,processed/circuloPsic_002Ev1.pdf_pg-17.jpg,SANO,0
002_1,drawings/MINIMENTAL/minimentalPsic_002Ev1.pdf_...,minimental,templates/minimental.png,1618128110,2150373113,processed/minimentalPsic_002Ev1.pdf_pg-3.jpg,SANO,0
002_1,drawings/PICO/picoPsic_002Ev1.pdf_pg-16.jpg,pico,templates/pico.png,131104427152,45183489255,processed/picoPsic_002Ev1.pdf_pg-16.jpg,SANO,0
002_1,drawings/CRUZ/cruzPsic_002Ev1.pdf_pg-17.jpg,cruz,templates/cruz.png,2134363195,3790591220,processed/cruzPsic_002Ev1.pdf_pg-17.jpg,SANO,0


### Display summary stats

In [5]:
# Number of rows in `df` per diagnosis label, in order of first appearance,
# computed in a single vectorised pass. `df` can hold several rows with the
# same key, so these are row counts rather than counts of distinct keys.
# 'Percent' holds the fraction of all rows (0-1), not a value scaled to 100.
summary_stats = (
    df.groupby('diagnosis', sort=False)
    .size()
    .to_frame('Count')
    .assign(Percent=lambda counts: counts['Count'] / len(df))
    .rename_axis('Diagnosis')
)
summary_stats

Unnamed: 0_level_0,Count,Percent
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
SANO,1984,0.50101
DCLNA,1017,0.256818
DCLM,851,0.214899
DCLA,56,0.014141
BAJA,33,0.008333
BAJA EA,9,0.002273
NO EXISTE,10,0.002525
