In [2]:
import numpy as np
import collections
import csv
import pandas as pd 
import textwrap
from dateutil.parser import parse
from datetime import datetime

import os,sys,inspect
# Directory of the file the current frame was compiled from (as reported by
# inspect), made absolute.
# NOTE(review): inside a notebook this presumably resolves to the working
# directory -- confirm.
currentdir = os.path.dirname(
    os.path.abspath(
        inspect.getfile(inspect.currentframe())
    )
)
# One level up from currentdir.
parentdir = os.path.dirname(currentdir)
# Put the parent directory first on the module search path.
sys.path.insert(0, parentdir)

# Find Diagnosis File Reversions

This section identifies patients who reverted, i.e. patients who were diagnosed with AD and at a later visit were diagnosed with MCI or CN, and whose most recent (chronologically last) diagnosis is not AD.

In [4]:
# Lookup tables that turn the coded DXCURREN / DXCHANGE fields of the diagnosis
# file into readable labels. The empty string maps to itself so a missing
# value can be looked up without raising a KeyError.
DX_CURREN = {'1': 'NL', "2": 'MCI', "3": 'AD', "": ""}
DX_CHANGE = {
    '1': "Stable:NL to NL",
    '2': "Stable:MCI to MCI",
    '3': "Stable:AD to AD",
    '4': "Conv:NL to MCI",
    '5': "Conv:MCI to AD",
    '6': "Conv:NL to AD",
    '7': "Rev:MCI to NL",
    '8': "Rev:AD to MCI",
    # AD -> NL is a reversion, like codes 7 and 8. It was previously labelled
    # "Conv:AD to NL", which the explicit-reversion check (it looks for
    # 'Rev:AD to NL') could never match.
    '9': "Rev:AD to NL",
    "": "",
}

# Build {RID: [[exam date, diagnosis label], ...]} from the diagnosis file.
# Columns are read by position: 2 = RID, 8 = exam date (MM/DD/YYYY),
# 9 = DXCHANGE code, 10 = DXCURREN code.
patient_diagnosis_dict = {}
with open('../../Assessments/DXSUM_PDXCONV_ADNIALL.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader) #skip header
    for row in reader:
        RID = int(row[2])
        EXAMDATE = datetime.strptime(row[8], '%m/%d/%Y')
        Dx_curren = row[10]
        Dx_change = row[9]

        #use the DXCURREN or DXCHANGE, depending on which is present
        if Dx_curren != "" and Dx_change == "":
            diagnosis = DX_CURREN[Dx_curren]
        elif Dx_change != "" and Dx_curren == "":
            diagnosis = DX_CHANGE[Dx_change]
        else:
            # Exactly one of the two fields must be filled in. Fail loudly and
            # say which row is malformed instead of a bare failed assertion.
            raise ValueError(
                "RID %d, exam date %s: expected exactly one of DXCURREN/DXCHANGE "
                "to be set, got DXCURREN=%r, DXCHANGE=%r"
                % (RID, row[8], Dx_curren, Dx_change)
            )

        patient_diagnosis_dict.setdefault(RID, []).append([EXAMDATE, diagnosis])

# Assign every patient in patient_diagnosis_dict to exactly one of five sets,
# based on the chronologically ordered list of diagnosis labels:
#   reverts    - showed a reversion from AD and the last diagnosis is not AD
#   always_CN  - every diagnosis is a non-AD label (NL or MCI, see normal_opts)
#   always_AD  - every diagnosis is 'AD' / 'Stable:AD to AD'
#   always_MCI - see the NOTE on that branch below
#   AD         - everything else
always_CN = set()
always_AD = set()
AD = set()
always_MCI = set()
reverts = set()

# The lookup tables below do not depend on the patient, so they are built once
# rather than on every loop iteration.
normal_opts = ['NL', 'MCI', 'Stable:NL to NL',  'Stable:MCI to MCI', 'Conv:NL to MCI', 'Rev:MCI to NL']

# For each AD label: the labels that may legitimately appear at any later visit.
valid_transitions = {'AD':["Stable:AD to AD","Conv:MCI to AD","AD"], #note, allow MCI-->AD because still end up at AD (allow for coding error)
                     "Stable:AD to AD":["Stable:AD to AD","Conv:MCI to AD","AD"],
                     "Conv:MCI to AD":["Stable:AD to AD","Conv:MCI to AD",'AD'],
                     "Conv:NL to AD":["Stable:AD to AD","Conv:NL to AD",'AD']}

# Labels that count as "currently AD" when looking at the final diagnosis.
ad_labels = ['AD', "Stable:AD to AD", "Conv:MCI to AD", "Conv:NL to AD"]

# Labels that state a reversion away from AD outright. 'Conv:AD to NL' is
# accepted next to 'Rev:AD to NL' in case the label table spells the AD -> NL
# change that way; otherwise such records would never be recognised here.
explicit_reversions = ['Rev:AD to MCI', 'Rev:AD to NL', 'Conv:AD to NL']

count = 0
for patient in patient_diagnosis_dict:
    count += 1
    # Each exam is [date, label]; sorting orders the exams by date.
    exams = sorted(patient_diagnosis_dict[patient])
    diagnoses = [exam[1] for exam in exams]

    reverter = 0
    #diagnosis file explicit reverts
    if any(label in explicit_reversions for label in diagnoses):
        reverter = 1

    #If have AD, make sure no later visit reverts
    for position, earlier in enumerate(diagnoses):
        allowed_later = valid_transitions.get(earlier)
        if allowed_later is None:
            continue  # not an AD label, nothing to check
        if any(later not in allowed_later for later in diagnoses[position + 1:]):
            reverter = 1
            break

    #Only count as reversion patient if the ultimate diagnosis is not AD
    if reverter == 1 and diagnoses[-1] not in ad_labels:
        reverts.add(patient)

    #if only normal (CN or MCI)
    elif all(label in normal_opts for label in diagnoses):
        always_CN.add(patient)

    elif all(label == 'AD' or label == 'Stable:AD to AD' for label in diagnoses):
        always_AD.add(patient)

    # NOTE(review): 'MCI' and 'Stable:MCI to MCI' are both in normal_opts, so
    # MCI-only patients are already captured by the always_CN branch above and
    # this branch never adds anyone. Kept so set membership stays unchanged.
    elif all(label == 'MCI' or label == 'Stable:MCI to MCI' for label in diagnoses):
        always_MCI.add(patient)

    else:
        AD.add(patient)


#make sure each patient is in one set
assert (len(always_AD) + len(reverts) + len(always_CN)
        + len(always_MCI) + len(AD)) == count, \
    "patient sets do not add up to the number of patients processed"

# Reverted Patients

In [8]:
# Report how many patients were classified as reverters, followed by their
# RIDs in ascending order.
print("%d Patients with reversion in Diagnosis File: %s" % (len(reverts), sorted(reverts)))

14 Patients with reversion in Diagnosis File: [167, 429, 555, 1226, 2210, 2367, 4005, 4114, 4426, 4434, 4641, 4706, 4746, 4899]


# Check Labels

Check the labels in the LongitudinalDataAnalysis file against the classifications made above. Any data row that is printed indicates a mismatch between its label and the patient's classification. A "Patient reverted" line is printed whenever a reversion patient is present in the file (i.e., when reversion patients were not excluded); such a line does not by itself mean that anything is incorrect.

In [12]:
# Open the longitudinal file and compare the label in the last column of each
# row with the diagnosis-based sets built above. The RID is taken from the
# first column. A printed row is a mismatch.
with open('../../Code/LongitudinalDataAnalysis.csv') as csvfile:
    label_rows = csv.reader(csvfile, delimiter=',')
    next(label_rows)  # skip header
    for row in label_rows:
        RID = int(float(row[0]))
        label = row[-1]

        # Informational only: the patient belongs to the reversion set.
        if RID in reverts:
            print("%d Patient reverted" % RID)

        # Label should be 1 but is 0.
        expected_one = RID in always_AD or RID in AD
        if expected_one and label in ('0', 0):
            print(row)

        # Label should be 0 but is 1.
        expected_zero = RID in always_CN or RID in always_MCI
        if expected_zero and label in ('1', 1):
            print(row)

167 Patient reverted
429 Patient reverted
555 Patient reverted
1226 Patient reverted
2210 Patient reverted
2367 Patient reverted
4005 Patient reverted
4114 Patient reverted
4426 Patient reverted
4434 Patient reverted
4641 Patient reverted
4706 Patient reverted
4746 Patient reverted
4899 Patient reverted
