# Reports Fields Extractor and stats displayer
----------------
## DESCRIPTION
From a folder of pdf and doc/docx files, this notebook extracts all pertinent information and builds a CSV database. The information is mainly extracted from the conclusion, but some fields, such as sedation, can be extracted from the rest of the document body.

This notebook is specifically tailored to work on Coma Science Group's patient reports, but it can provide a good basis for a custom-tailored report extractor by adapting some of the code (in particular the "Conclusion" section detection) or by adapting your reports to fit the template.

Part of this notebook was reused to make the python module easy_textract (available on pypi).

By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
2017-2019
Creation date: 2017-01-29
License: MIT
v1.7.1

## INSTALL NOTE
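You need to pip install textract and pandas, and install tesseract for your platform, before launching this script. Reading .doc files additionally requires antiword (expected at C:\antiword\antiword.exe or ~/antiword/antiword), and OCR of scanned PDFs requires pdftoppm (from poppler).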
Tested on Python 2.7.11

## TODO
* Provide a reports template.
* check that the exclude patterns reject all false detections of coma (caused by "coma science group" or "nociception/nociceptive coma scale")
* fix sedation NA if possible
* clinical and other modalities: use finditer and accept multiple positions (assign diagnosis to the closest modality on the left, not on the right)
* accident_date use in priority the "xx years xx months xx days post..." instead of date because more reliable (or we can test, if it is > acquisition_date_end then the accident_date is wrong! ex DeVaal)
* accident_etiology use finditer and append with '/' separator all etiologies found
* non deterministic script, some rows in csv change after running the same code! try to find why -> Should be fixed, was probably because of unidecode not supporting relative import (so sometimes it used the fallback lib unicodedata)

In [None]:
# Forcefully autoreload all python modules
%load_ext autoreload
%autoreload 2

In [None]:
path_to_folder_of_reports = r'G:\Topreproc\Reports\reports_all'

## Reports Fields Extractor

In [None]:
import os, sys
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode, because it does not support relative paths (yet? they need to use __import__(path, globals(), level=2))

import re
import six
import shutil
import textract

import csg_fileutil_libs.dateutil.parser as dateutil_parser
import csg_fileutil_libs.langdetect as langdetect

from collections import OrderedDict
from tempfile import mkdtemp, mkstemp
from textract.parsers.utils import ShellParser

from csg_fileutil_libs.aux_funcs import disambiguate_names, _unidecode, _tqdm, recwalk


##### Auxiliary functions #####

def replace_buggy_accents(s, encoding=None):
    """Fix weird encodings that even ftfy cannot fix.

    Replaces a fixed table of garbled (mojibake) sequences found in ``s``
    with a plain equivalent.

    Args:
        s: Text to clean. It must be of the same string type as the patterns
            after the optional decoding step below.
        encoding: Optional codec name. When given, patterns and replacements
            that are byte strings (the Python 2 literals) are decoded with it
            before substitution; text literals (Python 3) are used as-is.

    Returns:
        ``s`` with every occurrence of each table key replaced by its value.
    """
    # todo enhance speed? or is it the new regex on name?
    # NOTE(review): some keys below look identical ('â', 'Ă', 'A¨'); in a dict
    # literal the last value silently wins. Possibly non-printable trailing
    # characters were lost from these keys at some point - verify.
    dic_replace = {
        '\xc4\x82\xc2\xa8': 'e',
        'ĂŠ': 'e',
        'Ăť': 'u',
        'â': 'a',
        'Ă´': 'o',
        'Â°': '°',
        'â': "'",
        'ĂŞ': 'e',
        'ÂŤ': '«',
        'Âť': '»',
        'Ă': 'a',
        'AŠ': 'e',
        'AŞ': 'e',
        'A¨': 'e',
        'A¨': 'e',
        'Ă': 'E',
        'â˘': '*',
        'č': 'e',
        '’': '\'',
    }
    for pat, rep in dic_replace.items():
        if encoding:
            # Only byte strings can be decoded. On Python 3 the literals are
            # already text, and calling .decode() on them raises AttributeError.
            if isinstance(pat, bytes):
                pat = pat.decode(encoding)
            if isinstance(rep, bytes):
                rep = rep.decode(encoding)
        s = s.replace(pat, rep)
    return s

def date_fr2en(s):
    """Convert french month names into english so that dateutil.parse works.

    The input is lowercased, then every word starting with a known french
    month stem is replaced by an english month name (or abbreviation).

    Args:
        s: Date string, expected to be already stripped of accents
            (eg, "fevrier", "aout", "decembre").

    Returns:
        The lowercased string with french month names translated.
    """
    s = s.lower()
    # A month stem must not be glued to a preceding letter, to avoid rewriting
    # the middle of unrelated words.
    not_after_letter = r'(?<![^\W\d_])'
    # Letters only: digits must not be swallowed, else "1janvier2017" loses its year.
    letter = r'[^\W\d_]'
    # (stem, quantifier for the trailing letters, english name).
    # "mai" and "juin" are complete month names, so zero trailing letters must
    # be accepted for them; the other stems are only prefixes and need at
    # least one more letter. A tuple keeps the substitution order deterministic.
    rep = (
        ('jan', '+', 'jan'),
        ('fe', '+', 'feb'),
        ('mar', '+', 'march'),
        ('av', '+', 'april'),
        ('mai', '*', 'may'),
        ('juin', '*', 'june'),
        ('juil', '+', 'july'),
        ('ao', '+', 'august'),
        ('se', '+', 'september'),
        ('oc', '+', 'october'),
        ('no', '+', 'november'),
        ('de', '+', 'december'),
    )
    for stem, quantifier, english in rep:
        s = re.sub(not_after_letter + stem + letter + quantifier, english, s)
    return s

def calculate_age(birthdate, acquisitiondate):
    """Return the age in whole years at ``acquisitiondate``.

    Both arguments must expose ``year``, ``month`` and ``day`` attributes.
    """
    # Difference of calendar years, minus one when the birthday has not yet
    # been reached in the acquisition year.
    # from http://stackoverflow.com/a/765862/1121352
    year_span = acquisitiondate.year - birthdate.year
    acquisition_md = (acquisitiondate.month, acquisitiondate.day)
    birth_md = (birthdate.month, birthdate.day)
    birthday_pending = acquisition_md < birth_md
    return year_span - birthday_pending


##### Custom text filtering classes #####

class TextDiagnosticFilter(object):
    '''Test if a diagnosis should be rejected or excluded given text patterns.

    On construction the text is scanned once and the following are stored:
    * pos_excludes: match objects of every exclude pattern,
    * pos_rejects / pos_confirms: start positions of reject / confirm terms,
    * pos_dots: start positions of runs of dots (they stop a reject term).

    In every pattern, a literal space is replaced by the regex \\s+.
    '''
    def __init__(self, text, exclude_patterns, reject_patterns, confirm_patterns):
        '''Scan text and record the positions of all patterns.

        text: the text to scan (not stored).
        exclude_patterns, reject_patterns, confirm_patterns: lists of regex strings (may be empty).
        '''
        #self.text = text
        self.exclude_patterns = exclude_patterns
        self.reject_patterns = reject_patterns
        self.confirm_patterns = confirm_patterns
        # Construct list of exclusion sentences matches
        self.pos_excludes = []
        for excl_pat in exclude_patterns:
            self.pos_excludes.extend(re.finditer(excl_pat.replace(' ', r'\s+'), text))
        # Find all reject and confirmation terms (text patterns that reject or accept the diagnosis)
        self.pos_rejects = self._find_positions(reject_patterns, text)
        self.pos_confirms = self._find_positions(confirm_patterns, text)
        # Find positions of all dots (to stop reject terms)
        self.pos_dots = [match.start() for match in re.finditer(r'(\.+)', text)]
        # Until reject_filter() is called, the "filtered" view is the whole text,
        # so that reject_test(wholetext=False) does not fail on missing attributes
        self.pos_rejects_filt = list(self.pos_rejects)
        self.pos_confirms_filt = list(self.pos_confirms)
        self.pos_dots_filt = list(self.pos_dots)

    @staticmethod
    def _find_positions(patterns, text):
        '''Return the start positions in text of any of the given patterns.'''
        # An empty alternation would compile to '()', which matches at every
        # position of the text: no pattern must mean no position
        if not patterns:
            return []
        joined = '(' + '|'.join(patterns).replace(' ', r'\s+') + ')'
        return [match.start() for match in re.finditer(joined, text)]

    def reject_filter(self, start_pos, pos_end):
        '''Limit reject terms to only a specified start and end boundary (both inclusive)'''
        # Filter reject/confirm terms that are outside of boundaries.
        # Real lists are stored (not lazy filter objects) because reject_test
        # walks them on every call; a Python 3 filter object would be empty
        # after the first call.
        self.pos_rejects_filt = [x for x in self.pos_rejects if start_pos <= x <= pos_end]
        self.pos_confirms_filt = [x for x in self.pos_confirms if start_pos <= x <= pos_end]
        self.pos_dots_filt = [x for x in self.pos_dots if start_pos <= x <= pos_end]

    def reject_test(self, state_pos, wholetext=False, bidirectional=False):
        '''Test if the given position is AFTER one of the reject terms (and that there is no confirm term nor dot in-between).

        state_pos: position of the diagnosis in the text.
        wholetext: if True use all recorded terms, else only those kept by reject_filter().
        bidirectional: if True, a reject term AFTER the position also rejects it.
        Returns True if the diagnosis is rejected, False otherwise.
        '''
        if not self.pos_rejects:
            return False
        if wholetext:
            pos_confirms = self.pos_confirms
            pos_rejects = self.pos_rejects
            pos_dots = self.pos_dots
        else:  # account only for reject and confirm terms in a specified start and end position
            pos_confirms = self.pos_confirms_filt
            pos_rejects = self.pos_rejects_filt
            pos_dots = self.pos_dots_filt

        def nothing_between(low, high):
            # No confirm term and no dot strictly between the two positions
            return (not any(low < pos < high for pos in pos_confirms)
                    and not any(low < pos < high for pos in pos_dots))

        for pos_reject in pos_rejects:
            # Reject term found before the diagnosis and nothing in-between: reject this diagnosis
            if pos_reject < state_pos and nothing_between(pos_reject, state_pos):
                return True
            # Same if reject term found after diagnosis and nothing in-between
            if bidirectional and pos_reject > state_pos and nothing_between(state_pos, pos_reject):
                return True
        return False

    def exclude_test(self, pos_diag):
        '''Test if the given position is INSIDE one of the matched excludes (boundaries included)'''
        return any(excl.start() <= pos_diag <= excl.end() for excl in self.pos_excludes)


##### Custom extraction classes for textract #####

class MyDocParser(ShellParser):
    r"""Extract text from doc files using antiword (need to be placed in C:\antiword\antiword.exe or ~/antiword/antiword)."""

    def extract(self, filename, procpath=None, **kwargs):
        """Run antiword on ``filename`` and return what it wrote on stdout.

        Args:
            filename: Path of the .doc file to convert.
            procpath: Path of the antiword executable. Defaults to
                C:/antiword/antiword.exe on Windows, ~/antiword/antiword elsewhere.
            **kwargs: Ignored here.

        Returns:
            The standard output returned by ``self.run``.
        """
        if procpath is None:
            if os.name == 'nt':
                procpath = 'C:/antiword/antiword.exe'
            else:
                procpath = '~/antiword/antiword'
        # "~" is only expanded by a shell; expand it here since the command is
        # given as an argument list.
        # NOTE(review): assumes ShellParser.run spawns the process without a shell - confirm.
        procpath = os.path.expanduser(procpath)
        stdout, _ = self.run([procpath, filename])
        return stdout

class MyOCRParser(ShellParser):
    """Extract text from various image file formats or pdf containing scan images using tesseract-ocr (compatible with tesseract v3.02.02, only version currently available on Windows)"""

    def extract(self, filename, **kwargs):
        """Dispatch to the pdf or image extractor depending on the file extension."""
        # Case-insensitive check, so that "SCAN.PDF" is not handled as an image
        if filename.lower().endswith('.pdf'):
            return self.extract_pdf(filename, **kwargs)
        else:
            return self.extract_image(filename, **kwargs)

    def extract_image(self, filename, **kwargs):
        """Extract text from various image file formats using tesseract-ocr (compatible with tesseract v3.02.02, only version currently available on Windows)

        Args:
            filename: Path of the image to OCR.
            **kwargs: If a 'language' key is present, its value is passed to
                tesseract with the -l option.

        Returns:
            The raw bytes of the text file written by tesseract.
        """
        # TODO: if proportion of image wrong, resize automatically to fit A4 proportions using PILLOW! if width > percentage_threshold, downsize width, else if width <, then downsize height.
        filename = os.path.abspath(filename)  # tesseract need absolute paths!
        # Create a temporary output txt file for tesseract
        tempfilefh, tempfilepath = mkstemp(suffix='.txt')  # tesseract < 3.03 do not support "stdout" argument, so need to save into a file
        os.close(tempfilefh)  # close to allow writing to tesseract
        try:
            out_base = tempfilepath[:-4]  # remove suffix to supply as argument to tesseract, because tesseract always append '.txt'
            args = ['tesseract', filename, out_base]
            # if language given as argument, specify language for tesseract to use
            if 'language' in kwargs:
                args.extend(['-l', kwargs['language']])
            self.run(args)
            # Read the results of extraction
            with open(tempfilepath, 'rb') as f:
                return f.read()
        finally:
            # Remove temporary output file, also when tesseract or the read failed
            if os.path.exists(tempfilepath):
                os.remove(tempfilepath)

    def extract_pdf(self, filename, **kwargs):
        """Extract text from pdfs using tesseract (per-page OCR).

        The pdf is first split into page images with pdftoppm inside a
        temporary directory, each image is OCRed with extract_image, and the
        results are concatenated in sorted filename order. The temporary
        directory is always removed.
        """
        temp_dir = mkdtemp()
        base = os.path.join(temp_dir, 'conv')
        contents = []
        try:
            self.run(['pdftoppm', filename, base])  # from poppler, http://poppler.freedesktop.org

            for page in sorted(os.listdir(temp_dir)):
                page_path = os.path.join(temp_dir, page)
                contents.append(self.extract_image(page_path, **kwargs))
            return six.b('').join(contents)
        finally:
            shutil.rmtree(temp_dir)


##### Main reports fields extraction functions #####

def extract_report_fields(report_path, root_path=None, ocr=False, return_text=False):
    """Extract patient's fields from a PDF report"""
    patient_fields = {}
    # List of accepted languages (to exclude gibberish pdf)
    langs_ok = ['fr', 'en', 'nl']
    langs_ok_prob = 0.9  # probability of confidence necessary to not reject the lang
    # Extract text from document and remove accentuated characters and strip blank spaces
    if report_path.endswith('.doc'):  # .doc filetype needs antiword (not docx, use textract directly!)
        docparser = MyDocParser()
        report_text = _unidecode(replace_buggy_accents(docparser.process(report_path, 'utf8').decode('utf8'), 'utf8')).lower()
    else:  # other filetypes should be supported as-is
        try:
            report_text = _unidecode(replace_buggy_accents(textract.process(report_path).decode('utf8'), 'utf8')).lower().strip()
            # Failed to decode anything from document (maybe pdf contains only image and no text? Can try to use tesseract with textract but lot of work for not much...)
            if not report_text:
                raise ValueError('No text extractable from the specified file.')
            else:
                lang_check = langdetect.detect_langs(report_text)[0]
                if lang_check.lang not in langs_ok or lang_check.prob < langs_ok_prob:
                    raise ValueError('No text extractable or language unrecognized from the specified file.')
        except Exception as exc:
            # Try to decode using OCR
            if ocr:
                ocrparser = MyOCRParser()
                #report_text = _unidecode(replace_buggy_accents(textract.process(report_path, method='tesseract', language='fra').decode('utf8'), 'utf8')).lower().strip()  # Should work, but does not on Windows because you need tesseract v3.03 with support for "stdout", which is currently unavailable on Windows...
                report_text = _unidecode(replace_buggy_accents(ocrparser.process(report_path, 'utf8').decode('utf8'), 'utf8')).lower().strip()
            if not ocr or not report_text:  # Failed again, raise the exception!
                raise
    report_text = re.sub('[ \t\f\v]+', ' ', report_text)  # replace abusive spaces
    report_text = re.sub('[\n\r]+', '\n', report_text)  # replace abusive line breaks
    report_text = re.sub('(\r?\s?\n\r?\s?)+', '\n', report_text)  # replace abusive line breaks
    # Failed to decode anything from document, raise exception
    if not report_text:
        raise ValueError('No text extractable from the specified file.')
    # Detect language
    try:
        lang = langdetect.detect(report_text)
    except Exception as exc:
        lang = 'fr'

    # Mandatory fields, if missing, we need to fail loudly
    patient_fields['report_lang'] = lang
    patient_fields['report_path'] = report_path if not root_path else os.path.relpath(report_path, root_path)
    # Get name, the most important field
    try:
        pts_name = re.search('(concern(e|ing|s)?|betreft|patient\s*:\s*)\s*(:\s+)?(rapport.+?|etude.+?de\s+)?((mrs?\.|monsieur|madame|mademoiselle)\s*)?((\s*[a-zA-Z\-]+(,?|\s+|\-+|/)){2,6}?)([,.\r\n]|\s+nee?\s+|\s*patient|\s+agee?\s+|\s*[\(]|\s*\d)', report_text).group(7)
        if len(pts_name) < 4:
            raise AttributeError
    except AttributeError as exc:
        #pts_name = re.search('\s+(([a-zA-Z\-]+(,?\s|\-+|/)){2,4}?)\W{,5}(\s+nee?\s+|patient|agee?\s+|\s*[\(]).{,15}\d+', report_text, re.S).group(1)
        pts_name = re.search('\s+(([a-zA-Z\-]+(,?\s|\-+|/)){2,4}?)(\W{,5}|[0-9\.\-/]+)(\s+(nee?|born)\s+|patient|agee?\s+|\s*[\(])(.{,15}\d+)?', report_text, re.S).group(1)
    # Clean up the name (cannot perfectly detect, but if at least we match the name, we can delete the rest more easily)
    pts_name = re.sub('(^|\s+)(geboren(\s+op|\W+)?|born(\s+on|\W+|$)|(nas|ny|nee?)\s+le|(chef\s+)?de\s+service|pet\s*\-+\s*scan\s+\-+\s+|(coma\s+)?science\s+group|patiente?s?)', '', pts_name)  # remove "born on" and other wrongly matched sentences
    patient_fields['name'] = re.sub('\-+', '-', re.sub('\s+', ' ', pts_name)).strip().replace('\r', '').replace('\n', '').replace('\t', '').replace(',', ' ').replace('  ', ' ').strip()  # clean up spaces, punctuation and double dashes in name

    if lang == 'fr':
        try:
            patient_fields['birthdate'] = re.search(pts_name+'.+?(((nee?\s*(le|en|12)?|ans\W+?)\s*).*?)?[\(\s]?((?!<[^\d])\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|\d{4})', report_text, re.S).group(5)
            # if the birthdate is only a single number (eg, 100) and not a year (eg, 1974), then we matched the wrong number because this is not a date format. Retry without "ne le"
            if re.match('^\d+$', patient_fields['birthdate']) and not re.match('^(19|20)\d{2}$', patient_fields['birthdate']):
                raise AttributeError()
        except AttributeError as exc:
            # Optional middle part ("nee le ...")
            patient_fields['birthdate'] = re.search(pts_name+'.+?((nee?\s*(le|en|12)?|ans\W+?|[\(])\s*)?[\(\s]?((\d+[-/.]\d+[-/.])?\d{2,4})', report_text, re.S).group(4)
    elif lang == 'nl':
        patient_fields['birthdate'] = re.search('betreft.+?(geboren(\s+op)\s*)(\d+([-/.]\d+[-/.]\d+)?)', report_text, re.S).group(3)
    else:  # lang == 'en'
        bd1 = re.search('(years?(\s+|-)?old|born).+?(\d+[-/.]\d+[-/.]\d+)', report_text, re.S)
        bd2 = re.search('(\d+[-/.]\d+[-/.]\d+)\s+(born|years)', report_text, re.S)
        if bd2 and bd2.span(1)[0] < bd1.span(3)[0]:
            patient_fields['birthdate'] = bd2.group(1)
        else:
            patient_fields['birthdate'] = bd1.group(3)

    # Optional fields, can be fetched from DICOM for most of them
    # Find gender
    gender = 'NA'
    if lang == 'fr':
        if 'patiente' in report_text: # and report_text.find('patiente') <= report_text.find('patient'):  # sometimes there are mistakes in subsequent pages, but first page is usually safe
            gender = 'F'
        elif 'patient' in report_text:
            gender = 'M'
    elif lang == 'en':
        if 'female' in report_text or 'woman' in report_text or ' she ' in report_text or ' her ' in report_text:
            gender = 'F'
        elif ' male ' in report_text or ' man ' in report_text or ' his ' in report_text or ' he ' in report_text:
            gender = 'M'
    elif lang == 'nl':
        if 'vrouw' in report_text:
            gender = 'F'
        elif ' man ' in report_text:
            gender = 'M'
    patient_fields['gender'] = gender

    # Age
    try:
        patient_fields['age'] = re.search('(\d+)(\s+|-)(ans|years?(\s+|-)old)', report_text).group(1)
    except AttributeError as exc:
        try:
            # try without any space but with parentheses
            patient_fields['age'] = re.search('[\(]\s*(\d+)(\s|-)?(ans|years?(\s+|-)old)\s*[\)]', report_text).group(1)
        except AttributeError as exc:
            patient_fields['age'] = 'NA'
    # Accident's date and etiology ("post" something)
    try:
        #match_etiology = re.search(r'((accident|trauma|arrest|arret\s+cardiaque|(months|years)\s+(?!old)|post[\-\s]+(?!hospital))([\(\)a-zA-Z0-9\<\>\-]+\s+){1,11}?).*?(le|on)?(\d+(/|-)\s*\d+(/|-)\s*\d+|\d+\s*\w+\s*\d+)', report_text, re.S)
        match_etiology = re.search(r'(((accident|trauma|cardiac\s+arrest|arrest|arret\s+cardiaque|(months|years)\s+(?!old)(?=[\(\)a-zA-Z0-9\<\>\-])|post[\-\s]+(?!hospital))([\(\)a-zA-Z0-9\<\>\-]+\s+){,11}?)[^\.]{,100}?)(\s+(le|on)\s+)?(\d+(/|-)\s*\d+(/|-)\s*\d+|\d+\s*\w+\s*\d+)', report_text, re.S)
        patient_fields['accident_date'] = match_etiology.group(8)
        patient_fields['accident_etiology'] = match_etiology.group(1).replace('\n', ' ').strip()
    except AttributeError as exc:
        patient_fields['accident_date'] = 'NA'
        patient_fields['accident_etiology'] = 'NA'

    # Get report date if available (in filename)
    report_filename = os.path.basename(os.path.normpath(report_path))
    try:
        patient_fields['report_date'] = re.findall('(\d{0,4}(19|20)\d{2})', report_filename)[-1][0]
    except IndexError as exc:
        # Get report date in header "liege, le xx month xxxx"
        try:
            patient_fields['report_date'] = re.search('liege\s*,?\s*le\s+(\d+\s+[a-zA-Z0-9]+\s+\d+)', report_text).group(1)
        except AttributeError as exc:
            patient_fields['report_date'] = 'NA'

    # Get acquisition date
    try:
        # TODO: Get from DICOM! Problem: there are multiple sessions for some patients, so it's not bulletproof...
        patient_acquisition_dates = re.search(r'((du|from)\s+(\w+\s+){,2})?((\d+[\-/\.\s]+)?(\d+[\-/\.\s]+)?\d+)\s+(au|to)\s+((\d+[\-/\.\s]+)?(\d+[\-/\.\s]+|\w+\s+)\d+)', report_text)
        patient_acquisition_start = patient_acquisition_dates.group(4)
        patient_acquisition_end = patient_acquisition_dates.group(8)
        patient_fields['acquisition_date'] = '%s - %s' % (patient_acquisition_start, patient_acquisition_end)
        patient_fields['acquisition_date_end'] = '%s' % (patient_acquisition_end)
        # TODO: compute age from acquisition if missing age: acquisition_date - birth_date
    except AttributeError as exc:
        try:
            patient_acquisition_dates = re.search(r'((evalu\w+)(\s+\w+){,10}\s+le\s+|(perform\w+)(\s+\w+){,10}\s+on\s+(the\s+)?|rmn.{,30}?)((\d+[th\-/\.\s]+)?(\d+[\-/\.\s]+|\w+\s+)\d+)', report_text)
            patient_acquisition_end = patient_acquisition_dates.group(7)
            patient_fields['acquisition_date'] = '%s - %s' % (patient_acquisition_end, patient_acquisition_end)
            patient_fields['acquisition_date_end'] = '%s' % (patient_acquisition_end)
        except AttributeError as exc:
            patient_fields['acquisition_date'] = 'NA'
            patient_fields['acquisition_date_end'] = 'NA'

    # Get sedation usage
    mrisedation = 'NA'
    if lang == 'fr':
        if re.search('(irm|imagerie)\s+fonctionnelle.+?(avec|sous)\s+sedation', report_text):
            mrisedation = 'yes'
        elif re.search('(irm|imagerie)\s+fonctionnelle.+?(sans|pas\s+de|aucune)\s+sedation', report_text):
            mrisedation = 'no'
        elif re.search('(sous|avec)\s+(sedation|anesthesi)', report_text):
            mrisedation = 'yes'
        elif re.search('(sans|pas\s+de|aucune)\s+(sedation|anesthesi)', report_text):
            mrisedation = 'no'
        elif re.search('patiente?\s+(sedatee?|anesthesie)', report_text):
            mrisedation = 'yes'
        elif re.search('patiente?\s+non\s+(sedatee?|anesthesie)', report_text):
            mrisedation = 'no'
    elif lang == 'en':
        if re.search('functional\s+(mri|imagery).+?(with|under)\s+sedation', report_text):
            mrisedation = 'yes'
        elif re.search('functional\s+(mri|imagery).+?(without|no)\s+sedation', report_text):
            mrisedation = 'yes'
        elif re.search('((with|under)\s+sedation|\s+sedated)', report_text):
            mrisedation = 'yes'
        elif re.search('((without|no)\s+sedation|unsedated)', report_text):
            mrisedation = 'no'
        elif re.search('\s+(sedated|anesthetized)\s+patient', report_text):
            mrisedation = 'yes'
    elif lang == 'nl':
        if re.search('(niet|zonder(\s+dat)?|geen)\s+(verdoving|sedatie)', report_text):
            mrisedation = 'no'
        elif re.search('\s+(verdoving|sedatie)', report_text):
            mrisedation = 'yes'
    patient_fields['mri_sedation'] = mrisedation

    # Get final diagnosis, should be mandatory but might fail, in this case check manually the reports
    # Extract all "conclusion parts" positions (start and end)
    if lang == 'nl':
        idx_s_all = [m.start() for m in re.finditer('(^|\s+)conclusie[^,.]', report_text)]
        end_of_report = report_text.find('met collegiale groeten')
    else:
        idx_s_all = [m.start() for m in re.finditer('(^|\s+)(conclusion|conlcusion)[^,.]', report_text)]  # avoid "en conclusion, ..."
        end_of_report = report_text.find('confraternellement')
    # If can't find confraternellement, try to find Laureys's signature
    if not end_of_report or end_of_report == -1:
        end_of_report = report_text.find('(sincere|confidentially,)')  # sincerely/sincerement
        if not end_of_report or end_of_report == -1:
            #end_of_report = report_text.rfind('laureys')
            end_of_report = len(report_text)

    # Find the "conclusion" that is the closest to "confraternellement" (because there can be several conclusion parts for each modality, but we are looking for the final conclusion part that is doing a summary of all the results)
    #idx_s = 0 # in case there is no "conclusion" part
    idx_s_id = -1
    idx_s_all.insert(0, 0) # in case there is no "conclusion" part, then search through the whole document
    for i in range(len(idx_s_all)):
        if idx_s_all[i] > end_of_report:
            break
        else:
            #idx_s = idx_s_all[i]
            idx_s_id = i

    # Define blacklist sentences (ie, any diagnosis found in these sentences will be excluded, such as bibliographic citations)
    # Note: spaces will be automatically replaced by '\s+'
    # Note2: must be all lowercase! Else no detection!
    exclude_patterns = ['from\s+unresponsive\s+wakefulness\s+to\s+minimally\s+conscious\s+plus\s+and\s+functional\s+locked-in\s+syndromes',
                'indique\s+l\'emergence\s+de\s+l\'etat\s+de\s+conscience\s+minimale',
                'indique\s+un\s+etat\s+de\s+conscience\s+minimale',
                'ecm+\s*:\s*etat\s+de\s+conscience\s+minimale\s+plus\s+\(presence\s+d’une\s+reponse\s+a\s+la\s+commande\)',
                'ecm-\s*:\s*etat\s+de\s+conscience\s+minimale\s+moins\s+\(pas\s+de\s+reponse\s+a\s+la\s+commande\)',
                'comportements\s+associes\s+au\s+diagnostic\s+d’etat\s+de\s+conscience\s+minimale',
                'ev\s*:\s*etat\s+vegetatif\s+/\s+syndrome\s+d’eveil\s+non\s+repondant',
                'who\s+emerge\s+from\s+the\s+(mcs|minimal(ly)?\s+conscious\s+state)\.',
                'indicates?\s+(the\s+)?emergence\s+(of|from)\s+minimally\s+conscious\s+state',
                'indicates?\s+(a|the)\s+minimally\s+conscious\s+state(\s+\(mcs\))?',
                '(nocicepti(on|ve)\s+coma\s+scale|coma\s+science\s+group)',
                'mcs\s*-: patient shows non-reflexive behavior',
                'mcs+: patient shows command following',
                'mcs+: minimally conscious state plus',
                'uws/vs: unresponsive wakefulness syndrome\s*/\s*vegetative state',
                'denotes emergence (from|of) (mcs|minimally conscious state)',
                'denotes (mcs|minimally conscious state)',
                'vs/uws\s*:\s*vegetative state/unresponsive wakefulness syndrome',
                'functional connectivity in the default network during resting state is preserved in a vegetative but not in a brain dead patient',
                'magnetic resonance spectroscopy and diffusion tensor imaging in coma survivors: promises and pitfalls',
                'centre d\'etude du coma',
                '(\s+|\n|^)coma@chu.ulg.ac.be',
                'www.comascience.org',
                '(consultations?[\s\<"]+post-coma|post-coma[\s\<"]+consultation)',
                'echelle\s+([\-a-zA-Z0-9]+\s+){1,9}coma',
                '\s+coma recovery scale[\-\s]+revised',
                'differences in neuroanatomy of the vegetative state\s*: insights from diffusion tensor imaging and functional implications',
                'visual fixation in the vegetative state\s*: an observational case series pet study',
                'et al [^\n]+',  # try to skip any bibliographic reference
                'protocole? .{1,30} res?ponse .{1,20} (command|demand)',
                'traduisant une reponse a la commande via la modulation de l\'activite neuronale',
                'aucune reponse a la commande',
                '3\s*-\s*poursuite visuelle',
                'diagnostic de sortie\s*:',
                'afin de tenter d\'obtenir une reponse a la commande',
                ]
    # Find all reject and confirmation terms (text patterns that reject or accept the diagnosis)
    # Note: spaces will be automatically replaced by '\s+'
    reject_patterns = ['incompatible', '(not|pas)\s+(compatible|possible)', '(?<!cannot) reject', 'inconsistent', 'inconsistant', 'refut(e|ant)', 'exclu(t|e)?\s', 'atypi(cal|que)', 'que de celui typiquement observe', 'is not', 'in contrast to', 'absence (d|of)', 'pas observe', 'aucun', 'n\'a pas ', 'pas .{,20} evidence', 'failed']
    confirm_patterns = ['(is|est)\s+compatible', 'confirm', 'classiquement\s+observe', 'observed', 'conclude', 'conclue', 'plus proche de', 'but is diagnosed as', 'oriente vers', 'presence (d|of)', 'preserv', 'avons pu observe', 'cannot reject']
    #reject_pattern_regex = '|'.join(['(?<!'+r+')' for r in reject_pattern])
    # Match with one of possible states
    # Note: order matters here: we want to test the most specific first (eg, mcs+) to the least specific (mcs)
    # Note2: must be all lowercase! Else no detection!
    diag_states = OrderedDict((  # be careful with dashes, place them first or escape, eg "[\s-]" is bad, prefer "[-\s]" or "[\-\s]"
            ('(partial\s+locked[-\s]+in|locked[-\s]+in\s+(partiel|.+?incomplet)|partiele\s+locked[-\s]+in|\s+plis(\s+|\.))', 'partial LIS'),
            ('(functional\s+locked[-\s]+in|locked[-\s]+in\s+fonctionnel|\s+flis(\s+|\.))', 'functional LIS'),
            ('(complete\s+locked[-\s]+in|locked[-\s]+in\s+complet|\s+clis(\s+|\.))', 'CLIS'),
            ('(locked[-\s]+in|\s+lis(\s+|\.))', 'LIS'),
            ('(emergence|sortie.{2,9}etat\s+de\s+conscience\s+minim|emerged|\s+emcs?(\s+|\.))', 'EMCS'),
            ('(consci(en|ou)\w+\s+)?(minimale?\s+plus|minimal(ly)?\s+conscious\s+state\s+plus|\s+(mcs|ecm)(\+|\s+plus))', 'MCS+'),  # Note: mcs+ and mcs- pattern must start at the same place as mcs pattern, else mcs might get precedence (because the position of the pattern mcs in the text will precede the pattern mcs+/-)
            ('(res?pon(ses?|d\w+)\s+(\w+\s+){1,7}(command|demand))', 'MCS+'),
            ('(consci(en|ou)\w+\s+)?(minimale?\s+moins|minimal(ly)?\s+conscious\s+state\s+minus|\s+(mcs|ecm)(\-|\s+(minus|moins)))', 'MCS-'),
            ('(visual\s+(\w+\s+){,4}pursuit|poursuite\s+(\w+\s+){,4}visuelle)', 'MCS-'),
            ('(conscience\s+minimale?|minimal(ly)?\s+conscious|\s+mcs(\s+|\.)|ecm(\s+|\.))', 'MCS'),
            ('(non[-\s]+repondant|unresponsive\s+wakefulness|\s+uws(\s+|\.))', 'UWS'),
            ('(not\s+observe\s+any\s+sign\s+of\s+consciousness|aucun\s+signe\s+de\s+conscience)', 'UWS'),
            ('(vegetatif|vegetative|\s+vs(\s+|\.)|aucun\s+signe\s+de\s+conscience)', 'UWS'),
            ('(\s+enr\s+)', 'UWS'),
            ('(?<!nociception)\s+coma(?!\s+science\s+group)', 'coma'),  # match coma only if not followed by science group (using negative lookahead)
            ))

    # Get the best clinical or paraclinical diagnosis (keep the highest consciousness level found, except if it's inside an exclude group/blacklist)
    # This part is critical to do first, because it will allow to scan where we can find at least one diagnosis, and thus define the right "conclusion part" to scan.
    # Backtrack to previous conclusions parts if the current one does not contain any diagnosis (eg, "conclusions internistiques")
    patient_fields['best_clinical_paraclinical_diagnosis'] = 'NA'
    for idx_s in idx_s_all[:idx_s_id+1][::-1]:  # we reverse the list to backtrack from the lowest conclusion to the first in document (cannot be below "confraternellement")
        # Extract the conclusion part (we will retry with other potential conclusion parts - including the whole document - until we find at least one diagnosis - so if the best diagnosis is NA it means there is nothing to find in the whole document)
        report_conclusion = report_text[idx_s:end_of_report]
        #print(repr(report_conclusion))  # debugline
        # Load the diagnostic filter object (and construct the list of exclusion/rejection/confirmation sentences positions)
        diag_filter_conclusion = TextDiagnosticFilter(report_conclusion, exclude_patterns, reject_patterns, confirm_patterns)
        # Match with one of possible states
        patient_diag = 'NA'
        for state_name, state_abbrv in diag_states.items():
            match = re.search(state_name, report_conclusion)
            # Check if there is a match for this diagnosis anywhere in the conclusion
            if match and match.start() >= 0:
                pos_diag = match.start()
                # Check if the match is not in an excluded excerpt (references, CRS-R scale comments, etc.) nor after a rejection term ('absence of', 'is not', etc.)
                if not diag_filter_conclusion.exclude_test(pos_diag) and not diag_filter_conclusion.reject_test(pos_diag, wholetext=True):
                    #print(match.group(0))  # debugline
                    #print(match.start())  # debugline
                    patient_diag = state_abbrv
                    break
        # Found a diagnosis? Store it and cut the loop, the conclusion part is good enough!
        if patient_diag != 'NA':
            # Since we loop from the best diagnosis to the worst one (because we use an OrderedDict), the first diagnosis found is the best (most optimistic) one
            patient_fields['best_clinical_paraclinical_diagnosis'] = patient_diag
            break
        # else we continue to other conclusion parts

    # Get conflicting diagnoses (between clinical and a modality) - search only in the conclusion (not in previous sections)
    patient_diag_clin_pet = []
    patient_diag_clin_mri = []
    try:
        pos_clinical = re.search('clinical|clinique|behaviou?r|comportement|diagnostic\s+evaluation|compte\s+tenu\s+de\s+la\s+presence', report_conclusion).start()  # TODO: use re.finditer and allow for a list of clinical pos, eg for Amakrane
        # TODO: also scan full sections, eg: "evaluations comportementales"
    except AttributeError as exc:
        pos_clinical = -1
        patient_diag_clin_pet.append('NA')
        patient_diag_clin_mri.append('NA')

    try:
        pos_pet = re.search('(\s+|/|-)(pet|tep|fdg)(\s+|/|-)', report_conclusion).start()
    except AttributeError as exc:
        pos_pet = -1

    try:
        pos_mri = re.search('\W+(magnetic|irmf?|f?mri|rmn)\W+', report_conclusion).start()
    except AttributeError as exc:
        pos_mri = -1

    # Construct list of all diagnoses found in the conclusion (with their text position)
    patient_alldiags_conclusion = {}
    for state_name, state_abbrv in diag_states.items():
        #all_matchs = re.finditer('.*'+state_name, report_conclusion)
        all_matchs = re.finditer(state_name, report_conclusion)
        if all_matchs:
            for match in all_matchs:
                if match:
                    try:
                        start_idx = match.span(0)[0]
                    except IndexError as exc:
                        continue
                    # Keep highest diagnosis for this text position (eg, MCS+ and reject MCS) and exclude references
                    if not start_idx in patient_alldiags_conclusion and not diag_filter_conclusion.exclude_test(match.start()):
                        patient_alldiags_conclusion[start_idx] = state_abbrv

    # Assign diagnoses by modality (and reject if there are reject terms and no confirmation term in-between)
    # Init
    pos_modalities = {'clinical': pos_clinical,
                      'pet': pos_pet,
                      'mri': pos_mri
                     }
    patient_diags_per_modality = {}
    for modality in pos_modalities.keys():
        patient_diags_per_modality[modality] = set()
    # Assign (and reject)
    if len(patient_alldiags_conclusion) > 1:
        pos_starts = pos_modalities.values()
        for modality, start_pos in pos_modalities.items():
            if start_pos >= 0:
                # Define boundaries: from the modality marker to the next modality marker (or end of text)
                # here we construct the end of conclusion text about this modality part
                pos_starts_filt = list(pos_starts)
                pos_starts_filt.remove(start_pos)
                try:
                    pos_end = min(filter(lambda x: x >= 0 and x > start_pos, pos_starts_filt))
                # Last item, last position, greatest one, so we just set the total length of the report's conclusion
                except ValueError:
                    pos_end = len(report_conclusion)
                # Filter reject/confirm terms that are outside of boundaries
                diag_filter_conclusion.reject_filter(start_pos, pos_end)
                # Loop through all diagnoses found in the conclusion, and assign to the closest modality (if not rejected)
                for state_pos, state_abbrv in patient_alldiags_conclusion.items():
                    # Assign to this modality if it is between the boundaries of this part (clinical part, pet part, etc.) and not rejected
                    if start_pos <= state_pos <= pos_end and not diag_filter_conclusion.reject_test(state_pos):
                        # TODO: Add only if not a subterm of something we already detected (eg, MCS should not be added if we know MCS+)
                        patient_diags_per_modality[modality].add(state_abbrv)

    patient_fields['diagnoses_clinical'] = '/'.join(patient_diags_per_modality['clinical'])
    patient_fields['diagnoses_pet'] = '/'.join(patient_diags_per_modality['pet'])
    patient_fields['diagnoses_fmri'] = '/'.join(patient_diags_per_modality['mri'])

    # Get the final (clinical) diagnosis = best CRS-R result = the gold standard
    # Note: we cannot scan the CRS-R table, we only scan the conclusion part
    # Algorithm: use the best clinical diagnosis if any found, else extract the first diagnosis found in the conclusion part
    patient_clin_diag = 'NA'
    if patient_fields['diagnoses_clinical']:
        # Get ordered set of possible diagnoses, first being the best (most optimistic)
        diag_states_vals = diag_states.values()
        # Order clinical diagnoses from best to worst
        clinical_diag_sorted = sorted(patient_fields['diagnoses_clinical'].split('/'), key=lambda x: diag_states_vals.index(x))
        # Extract the first = best diagnosis
        patient_clin_diag = clinical_diag_sorted[0]
    # Else, extract the first diagnosis found in the conclusion that is not rejected
    elif patient_alldiags_conclusion:
        for pos_diag in sorted(patient_alldiags_conclusion):
            if not diag_filter_conclusion.reject_test(pos_diag, wholetext=True):
                patient_clin_diag = patient_alldiags_conclusion[pos_diag]
                break
        #patient_clin_diag = patient_alldiags_conclusion[sorted(patient_alldiags_conclusion)[0]]
    patient_fields['final_diagnosis'] = patient_clin_diag

    # OLD Algorithm: extract the first diagnosis found in the conclusion part, and check if it is consistent with what was found for the clinical modality diagnosis (if not NAN)
    #if patient_alldiags_conclusion:
    #    patient_clin_diag = patient_alldiags_conclusion[sorted(patient_alldiags_conclusion)[0]]
    #if patient_fields['diagnoses_clinical'] and patient_clin_diag not in patient_fields['diagnoses_clinical']:
    #    patient_clin_diag = patient_fields['diagnoses_clinical'].split('/')[0]
    #patient_fields['final_diagnosis'] = patient_clin_diag

    # Diagnosis at admission (before our team diagnosed the patient)
    patterns_admission_diag = [
                                'diagnostic du corps medical a l\'admission',
                                'diagnostic (à|a) l\'admission',
                                'diagnostic actuel',
                                'diagnosis (on|at) admission',
                                'diagnose bij aankomst'
                                'diagnos(is|ic|e).{,30}(admission|aankomst)',
                              ]
    admission_diag = 'NA'
    # Load the diagnostic filter object for the whole text
    diag_filter = TextDiagnosticFilter(report_text, exclude_patterns, reject_patterns, confirm_patterns)
    # Find the "admission diagnosis" part
    admission_diag_match = re.search('(' + '|'.join(patterns_admission_diag).replace(' ', '\s+') + ')', report_text)
    if admission_diag_match:
        # Construct list of all diagnoses found in the whole text (with their text position)
        patient_alldiags = {}
        for state_name, state_abbrv in diag_states.items():
            #all_matchs = re.finditer('.*'+state_name, report_conclusion)
            all_matchs = re.finditer(state_name, report_text)
            if all_matchs:
                for match in all_matchs:
                    if match:
                        try:
                            start_idx = match.span(1)[0]
                        except IndexError as exc:
                            continue
                        # Keep highest diagnosis for this text position (eg, MCS+ and reject MCS) and exclude references
                        if not start_idx in patient_alldiags and not diag_filter.exclude_test(match.start()):
                            patient_alldiags[start_idx] = state_abbrv
        # Get the first diagnosis found just after the admission diagnosis marker
        admission_diag_match_start = admission_diag_match.start()
        admission_diag_pos = filter(lambda x: x > admission_diag_match_start, sorted(patient_alldiags))
        if admission_diag_pos:
            admission_diag = patient_alldiags[admission_diag_pos[0]]
    patient_fields['admission_diagnosis'] = admission_diag

    # Atypical pattern?
    if re.search('((pattern|metaboli).{1,50}atypique|atypical.{1,50}(pattern|metaboli)|surprenant|surprising|(pas|non|not)\s+(typique|typical))', report_text):
        patient_fields['atypical_pattern'] = 'True'
    else:
        patient_fields['atypical_pattern'] = 'False'

    # Epileptic?
    if re.search('(epilepsia|epileptic|epileptique|epilepsie|epilepsy)', report_conclusion):
        patient_fields['epileptic'] = 'True'
    else:
        patient_fields['epileptic'] = 'False'

    # Mental disorders?
    mental_disorders = []
    mental_disorders_match = re.finditer('(suicide|depressi(on|ve)|hallucination|schizo|bipolar|aphas[\w\-]+|alzheimer|parkinson|akineto[\s\-]*mutique|anosognos[\w\-]+)', report_text)
    if mental_disorders_match:
        for m in mental_disorders_match:
            mental_disorders.append(m.group(0))
    patient_fields['mental_disorders'] = '/'.join(set(mental_disorders))

    # Zolpidem tested?
    if re.search('zolpidem', report_text):
        patient_fields['zolpidem_mention'] = 'True'
        if report_text.count('zolpidem') > 2:
            # If Zolpidem mentioned only once or twice, probably just anamnese or suggestion of treatment and bibliographic ref, but no test
            patient_fields['zolpidem_tested'] = 'True'
            # Zolpidem respondent? (proceed by elimination, to reduce likelihood of false negatives, since real positives are rare)
            if re.search('zolpidem.{,400}?(aucune\s+amelioration|aucun\s+changement|aucune\s+evolution|pas\s+.{1,15}amelioration|disparition|disappear|no\s+beneficial|not\s+observe|any\s+amelioration|no\s+.{,30}(therapeutic|effect|change|improvement))', report_text, re.S):
                if re.search('zolpidem.{,400}?montre\s+capable', report_text, re.S):
                    # Special case (no change in diagnostic because already EMCS, but still better performances after zolpidem)
                    patient_fields['zolpidem_respondent'] = 'True'
                else:    
                    patient_fields['zolpidem_respondent'] = 'False'
            else:
                patient_fields['zolpidem_respondent'] = 'True'
            # Zolpidem anti-respondent?
            if re.search('zolpidem.{,400}?(disparition|disappear|diminution|decrease|dimin)', report_text, re.S):
                patient_fields['zolpidem_antirespondent'] = 'True'
            else:
                patient_fields['zolpidem_antirespondent'] = 'False'
        else:
            patient_fields['zolpidem_tested'] = 'False'
            patient_fields['zolpidem_respondent'] = 'False'
        mzolp = re.search('zolpidem', report_text)
        patient_fields['zolpidem_context'] = report_text[mzolp.start()-50:mzolp.end()+1200]
    else:
        patient_fields['zolpidem_mention'] = 'False'
        patient_fields['zolpidem_tested'] = 'False'
        patient_fields['zolpidem_context'] = ''
        patient_fields['zolpidem_respondent'] = 'False'
        patient_fields['zolpidem_antirespondent'] = 'False'

    # Had a pet?
    if re.search('(\s+|/|-)(pet|tep|fdg)(\s+|/|-)', report_text):
        patient_fields['had_pet'] = 'True'
    else:
        patient_fields['had_pet'] = 'False'

    # Had a mri?
    if re.search('\W+(magnetic|irmf?|f?mri|rmn)\W+', report_text):
        patient_fields['had_mri'] = 'True'
    else:
        patient_fields['had_mri'] = 'False'

    # Get number of days since accident (ie, between accident and acquisition)
    try:
        # First way: try to get the days directly from the text content if specified
        m_acc_delay = re.search(r'[^\d]((\d+)\s*(ans?|years?))?(\s*(et|and|,)?\s*)((\d+)\s*(months?|mois))?(\s*(et|and|,)?\s*)((\d+)\s*(days?|jours?))?.{,10}?post', report_text, re.S)
        acc_years = m_acc_delay.group(2)
        acc_months = m_acc_delay.group(7)
        acc_days = m_acc_delay.group(12)
        # Calculate number of days from years, months and days
        if acc_days is None:
            acc_days = 0
        if acc_months is None:
            acc_months = 0
        if acc_years is None:
            acc_years = 0
        diff_acq_acc = int(acc_years) * 365 + int(acc_months) * 30 + int(acc_days)
        # If number is 0 then we misdetected probably
        if diff_acq_acc == 0:
            raise AttributeError()
        else:
            patient_fields['acquisition_minus_accident_days'] = '%i' % diff_acq_acc
    except AttributeError as exc:
        # Else, try to compute time between acquisition and accident dates (to detect acute
        try:
            if patient_fields['acquisition_date_end'] == 'NA' or patient_fields['accident_date'] == 'NA':
                raise ValueError()
            # Parse strings into date objects (always consider first integer to be the day if 3 integers date, and also allow fuzzy matching)
            acq_date = dateutil_parser.parse(date_fr2en(patient_fields['acquisition_date_end']), dayfirst=True, fuzzy=True)
            acc_date = dateutil_parser.parse(date_fr2en(patient_fields['accident_date']), dayfirst=True, fuzzy=True)
            # Save the difference number of days
            patient_fields['acquisition_minus_accident_days'] = '%i' % (acq_date - acc_date).days
        except ValueError as exc:
            patient_fields['acquisition_minus_accident_days'] = 'NA'

    # Acute? (less than one month between acquisition and injury)
    if patient_fields['acquisition_minus_accident_days'] != 'NA':
        diff_acq_acc_days = int(patient_fields['acquisition_minus_accident_days'])
        if 0 <= diff_acq_acc_days < 32:
            patient_fields['acute'] = 'True'
        elif diff_acq_acc_days >= 32:
            patient_fields['acute'] = 'False'
        else:  # misdetection bug, number of days is negative
            patient_fields['acute'] = 'NA'
    else:
        patient_fields['acute'] = 'NA'

    # If age is missing, try to calculate it from other fields (acquisition_date - birthdate)
    try:
        if patient_fields['age'] == 'NA' and patient_fields['acquisition_date_end'] != 'NA' and patient_fields['birthdate'] != 'NA':
            # Parse strings into date objects
            acq_date = dateutil_parser.parse(date_fr2en(patient_fields['acquisition_date_end']), dayfirst=True, fuzzy=True)
            bir_date = dateutil_parser.parse(date_fr2en(patient_fields['birthdate']), dayfirst=True, fuzzy=True)
            # Save the difference number of years
            patient_fields['age'] = '%i' % calculate_age(bir_date, acq_date)
    except ValueError as exc:
        pass

    # Ophtalmologic report?
    if re.search('ophtalmo', report_text) or re.search('oogheelk', report_text):
        patient_fields['ophtalmologic_report'] = 'True'
    else:
        patient_fields['ophtalmologic_report'] = 'False'

    # Nociception report?
    if re.search('nociception.{1,40}coma', report_text):
        patient_fields['nociception_report'] = 'True'
    else:
        patient_fields['nociception_report'] = 'False'

    # Debug
    #print(pos_modalities)
    #print('Confirms', diag_filter_conclusion.pos_confirms)
    #print('Rejects', diag_filter_conclusion.pos_rejects)
    #print(patient_alldiags_conclusion)
    #print(patient_diags_per_modality)
    #print(report_conclusion)
    #for pos in patient_alldiags_conclusion.keys():
        #print('----')
        #print(report_conclusion[pos:pos+200])
    #for pos in patient_alldiags.keys():
        #print('----')
        #print(report_text[pos:pos+200])

    # Remove any line break in any field (easier to save as a csv)
    for key in patient_fields.keys():
        patient_fields[key] = patient_fields[key].replace('\n', ' ')

    # All done, return the extracted fields
    if not return_text:
        return patient_fields
    else:
        return patient_fields, report_text, report_conclusion

def extract_report_fields_all(reports_root_dir, filetype=None, ocr=False, tolerant=False, verbose=False):
    """Extract the fields of every report found under a root folder.

    The folder is listed with `recwalk()` and `extract_report_fields()` is called on each file.
    Each result is stored under a patient id built from the extracted name and age. When this id
    is already taken, the report is recorded as a conflict and stored anyway under a longer id
    (name + age + report date, and if that is taken too, with the report path appended).

    Parameters
    ----------
    reports_root_dir : root folder, given to `recwalk()` and to `extract_report_fields()`.
    filetype : forwarded as-is to `recwalk()` (eg, ['.pdf', '.doc', '.docx']).
    ocr : forwarded as-is to `extract_report_fields()`.
    tolerant : if True, an unexpected exception is printed and the file is listed in the errors
        instead of the exception being re-raised.
    verbose : if True, print the file being processed and the warnings.

    Returns
    -------
    (results, conflicts, errors) where:
    * results is a dict mapping a patient id to the dict of extracted fields,
    * conflicts is a list of tuples (path of the new report, path of the report already stored under the same id),
    * errors is a list of the paths for which no fields could be extracted.

    Raises
    ------
    Any exception raised by `extract_report_fields()` that is not one of the known
    "unreadable file" messages, unless `tolerant` is True.
    """
    # Messages of exceptions that just mean the file is unreadable: these are never re-raised
    unreadable_msgs = (
        'Syntax Warning: May not be a PDF file',
        'File is not a zip file',
        'No text extractable',
        'Unsupported image type',
    )
    results = {}
    conflicts = []
    errors = []
    # List the files only once: the listing gives both the total for the progress bar and the files to process
    report_files = list(recwalk(reports_root_dir, folders=False, filetype=filetype))
    for report_dir, report_filename in _tqdm(report_files, total=len(report_files), leave=True, unit='reports'):
        report_path = os.path.join(report_dir, report_filename)
        pts = None
        if verbose:
            print('* Processing file: %s' % report_path)
        try:
            pts = extract_report_fields(report_path, reports_root_dir, ocr=ocr)
        except Exception as exc:
            exc_msg = str(exc)
            if any(msg in exc_msg for msg in unreadable_msgs):
                # Unreadable file: pts stays None, so the file is listed in the errors below
                if verbose:
                    print(exc_msg)
            elif tolerant:
                print(exc_msg)
            else:
                raise
        if pts is None:
            errors.append(report_path)
            if verbose:
                print('* Warning: error reading file %s, might not contain any text or unrecognized format, skipping file.' % report_path)
            continue
        pts_id = '%s_%s' % (pts['name'], pts['age']) # TODO: patient id should be name + scandate (from DICOM!)
        if pts_id in results:  # direct dict membership test (no intermediate list of keys under Python 2)
            conflicts.append( (pts['report_path'], results[pts_id]['report_path']) )
            if verbose:
                print('* Warning: conflict detected, two reports have the same patient name and date: %s and %s. Both will be saved anyway.' % (results[pts_id]['report_path'], pts['report_path']))
            # Save under another id
            pts_id = '%s_%s_%s' % (pts['name'], pts['age'], pts['report_date']) # first just try to append report date
            if pts_id in results: # if it does not work, append the report's path (should be unique)
                pts_id = pts_id + pts['report_path']
        results[pts_id] = pts
    return (results, conflicts, errors)

In [None]:
# Extract fields of only one report (good for debug)
# With return_text=True, extract_report_fields() returns the report text and the conclusion part in addition to the fields
report_path = '../reports_all/some-patients-report.pdf'
patients_fields, report_text, report_conclusion = extract_report_fields(report_path, return_text=True)
# Last expression of the cell, so that the notebook displays the extracted fields
patients_fields

In [None]:
# Debug by accessing the preprocessed report_text directly
#report_path = '../reports_all/specific-file.pdf'
#patient_fields, report_text, report_conclusion = extract_report_fields(report_path, ocr=True, return_text=True)

In [None]:
# Main program
# -- Settings
verbose = False
ocr = True # takes a lot more time with OCR
tolerant = True # skip errors

# -- Extraction over the whole folder of reports
print('== Patients PDF reports fields extractor ==')
print('Extracting patients fields from reports, please wait...')
all_patients_fields, conflicts, errors = extract_report_fields_all(
    path_to_folder_of_reports,
    filetype=['.pdf', '.doc', '.docx'],
    ocr=ocr,
    tolerant=tolerant,
    verbose=verbose
)
print('Total reports processed: %i' % len(all_patients_fields))
#print(all_patients_fields)
# Fake unit test for missing diagnoses
#for pts_key in all_patients_fields.keys():
#    all_patients_fields[pts_key]['best_clinical_paraclinical_diagnosis'] = 'NA'

# -- Report the patients for which no diagnosis at all was found ('NA')
missing_diagnoses = [patient_fields['name'] for patient_fields in all_patients_fields.values() if patient_fields['best_clinical_paraclinical_diagnosis'] == 'NA']
if missing_diagnoses:
    print('Missing diagnoses: %i reports: %s' % (len(missing_diagnoses), ', '.join(sorted(missing_diagnoses))))
else:
    print('No missing diagnosis, congratulations!')

# -- Report the files from which no fields could be extracted
if errors:
    print('Total number of unreadable reports: %i. Here is the detailed list:' % len(errors))
    for unreadable_path in errors:
        print('* %s' % unreadable_path)
# -- Report the conflicts (ie, two reports with similar fields): each entry is (new report, report already stored)
if conflicts:
    print('Total number of conflicting reports: %i. Here is the detailed list:' % len(conflicts))
    for new_path, stored_path in conflicts:
        print('* %s with %s' % (stored_path, new_path))

In [None]:
# Save all extracted fields to a csv file!
from csg_fileutil_libs.aux_funcs import save_dict_as_csv

output_file = 'all_patients_fields.csv'
# NOTE(review): presumably these columns are written first and the other fields after them -- confirm against save_dict_as_csv
fields_order = ['name', 'gender', 'age', 'final_diagnosis', 'mri_sedation']
# NOTE(review): the file is read back with sep=';' in the next section, so save_dict_as_csv presumably writes ';' separated values -- confirm
save_dict_as_csv(all_patients_fields, output_file, fields_order, csv_order_by='name', verbose=True)
print('All results saved to csv file: %s' % output_file)

------------------------
## Disambiguate names

In [None]:
import pandas as pd
# Load the csv database (';' separated). Missing values are replaced by empty strings, which is what the
# cells below rely on (substring tests such as 'MCS' in c['diagnoses_clinical'] and comparisons with '')
cf = pd.read_csv('all_patients_fields.csv', sep=';').fillna('')

In [None]:
# Disambiguate names
# NOTE(review): dist_threshold is presumably the maximum distance under which two names are considered to be the same patient -- confirm against disambiguate_names()
dist_threshold = 0.2
# The returned dataframe replaces cf for all the cells below
cf = disambiguate_names(cf, dist_threshold=dist_threshold)

In [None]:
# Save cleaned up disambiguated names back to the csv
cf.sort_values(['name'], inplace=True)  # reorder by name
# Overwrites the csv file that was loaded above, with the same ';' separator; NaN values (if any) are written as 'NA'
cf.to_csv('all_patients_fields.csv', sep=';', na_rep='NA', index=False)
print('Disambiguated csv correctly saved.')

-----------
## Reading the database and displaying interesting stats

In [None]:
# Overview statistics computed from the dataframe cf loaded from the csv database.
# The diagnoses_* fields are '/'-separated lists of diagnoses (empty string when nothing was found).

print('List of potential hypometabolic MCS (UWS*):')
count = 0
for rowid, c in cf.iterrows():
    # Clinical diagnosis contains MCS (substring test, so EMCS matches too), but a paraclinical diagnosis is UWS
    if ('MCS' in c['diagnoses_clinical'] or 'MCS' in c['final_diagnosis']) and ('UWS' in c['diagnoses_pet'] or 'UWS' in c['diagnoses_fmri']):
        print('* ' + c['name'] + ': ' + c['report_path'])
        count += 1
print('Total: %i' % count)
print('\n')

print('Patients with an atypical pattern:')
atypical = cf[cf['atypical_pattern'] == True]
for rowid, c in atypical.iterrows():
    print('* ' + c['name'] + ': ' + c['report_path'])
print('Total atypical: %i' % len(atypical))
print('\n')

print('Patients with epilepsia:')
epileptic = cf[cf['epileptic'] == True]
for rowid, c in epileptic.iterrows():
    print('* ' + c['name'] + ': ' + c['report_path'])
print('Total epileptic: %i' % len(epileptic))
print('\n')

print('List of surely conflicting diagnoses between clinical and paraclinical:')
countconflicts1 = 0
for rowid, c in cf.iterrows():
    # The final (clinical) diagnosis differs from the best diagnosis found over all modalities
    if c['final_diagnosis'] != c['best_clinical_paraclinical_diagnosis']:
        print('* ' + c['name'] + ': clinical: ' + c['final_diagnosis'] + ' ; paraclinical: ' + c['best_clinical_paraclinical_diagnosis'] + ' ; path: ' + c['report_path'])
        countconflicts1 += 1
print('Total: %i' % countconflicts1)
print('\n')

print('List of potential conflicting diagnoses between clinical and paraclinical:')
countconflicts2 = 0
for rowid, c in cf.iterrows():
    # Split the '/'-separated fields and drop the empty strings produced by empty fields
    clin_diags = [diag for diag in c['diagnoses_clinical'].strip().split('/') if diag]
    # Pool the PET and fMRI diagnoses. Empty entries are dropped from the pooled list: testing only its
    # first entry would skip the patients with no PET diagnosis but with a fMRI diagnosis
    para_diags = [diag for diag in c['diagnoses_pet'].split('/') + c['diagnoses_fmri'].split('/') if diag]
    # Print only patients with a clinical diagnosis and at least one paraclinical diagnosis
    if clin_diags and para_diags:
        # If there is any paraclinical diagnosis not in the clinical diagnoses, we print!
        if any(diag not in clin_diags for diag in para_diags):
            print('* ' + c['name'] + ': clinical: ' + c['diagnoses_clinical'] + ' ; pet: ' + c['diagnoses_pet'] + ' ; fmri: ' + c['diagnoses_fmri'] + ' ; path: ' + c['report_path'])
            countconflicts2 += 1
print('Total: %i' % countconflicts2)
print('\n')

In [None]:
# Patients whose sedation field is empty (missing values are empty strings in cf)
print('Patients with NAN sedation:')
sedated = cf.loc[cf['mri_sedation'] == '']
for pts_name, pts_path in zip(sedated['name'], sedated['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total NAN sedation: %i' % len(sedated))

In [None]:
# Patients whose sedation field is exactly 'no'
print('Patients without sedation:')
notsedated = cf.loc[cf['mri_sedation'] == 'no']
for pts_name, pts_path in zip(notsedated['name'], notsedated['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total without sedation: %i' % len(notsedated))

In [None]:
# Patients with an empty final diagnosis; the count of empty best diagnoses is shown for comparison
print('Patients with missing final diagnosis:')
missing = cf.loc[cf['final_diagnosis'] == '']
missing_best = cf.loc[cf['best_clinical_paraclinical_diagnosis'] == '']
for pts_name, pts_path in zip(missing['name'], missing['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
missing_counts = (len(missing), len(missing_best))
print('Total missing final diagnosis: %i (but for best diagnosis: only %i missing)' % missing_counts)

In [None]:
print('List of potential MCS* (clinical UWS but paraclinical MCS):')
count = 0
for rowid, c in cf.iterrows():
    # Clinical diagnosis is UWS, but at least one paraclinical modality (PET or fMRI) found MCS
    # (substring test, so EMCS matches too). Both modalities are tested for MCS: this is the
    # mirror of the hypometabolic MCS (UWS*) listing, which tests both modalities for UWS.
    if 'UWS' in c['diagnoses_clinical'] and ('MCS' in c['diagnoses_pet'] or 'MCS' in c['diagnoses_fmri']):
        print('* ' + c['name'] + ': ' + c['report_path'])
        count += 1
print('Total: %i' % count)
print('\n')

In [None]:
# Patients whose best diagnosis (over all modalities) is EMCS
print('List of EMCS:')
emcs = cf[cf['best_clinical_paraclinical_diagnosis'] == 'EMCS']
print(len(emcs))
# Label-based selection with .loc (the .ix indexer was removed from pandas).
# Last expression of the cell, so that the notebook displays the table
emcs.loc[:, ['name', 'report_path']]

In [None]:
# Requires emcs (patients whose best diagnosis is EMCS) from the previous cell
print('List of clinically non-EMCS but paraclinically EMCS (these are bugs to correct!):')
emcs_clin = cf[cf['final_diagnosis'] == 'EMCS']
# Blank out the rows of emcs that are also in emcs_clin, then drop these fully blanked rows:
# what remains are the patients with a best diagnosis of EMCS but another final diagnosis
emcs_bugs = emcs[~emcs.isin(emcs_clin)].dropna(how='all')
print(len(emcs_bugs))
# Label-based selection with .loc (the .ix indexer was removed from pandas).
# Last expression of the cell, so that the notebook displays the table
emcs_bugs.loc[:, ['name', 'report_path']]

In [None]:
# Patients flagged as zolpidem respondents by the extractor
print('Patients potentially Zolpidem-respondents:')
zolpidem_resp = cf.loc[cf['zolpidem_respondent'] == True]
for pts_name, pts_path in zip(zolpidem_resp['name'], zolpidem_resp['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total potential zolpidem-respondents: %i' % len(zolpidem_resp))
print('\n')

In [None]:
# Patients flagged as zolpidem anti-respondents by the extractor
print('Patients potentially Zolpidem-antirespondents:')
zolpidem_antiresp = cf.loc[cf['zolpidem_antirespondent'] == True]
for pts_name, pts_path in zip(zolpidem_antiresp['name'], zolpidem_antiresp['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total potential zolpidem-antirespondents: %i' % len(zolpidem_antiresp))
print('\n')

In [None]:
# Patients flagged as tested with zolpidem, but neither as respondent nor as anti-respondent
print('Patients tested with Zolpidem but could not be detected as respondent nor anti-respondents (probably just non-responders):')
was_tested = cf['zolpidem_tested'] == True
not_respondent = cf['zolpidem_respondent'] == False
not_antirespondent = cf['zolpidem_antirespondent'] == False
zolpidem_test = cf.loc[was_tested & not_respondent & not_antirespondent]
for pts_name, pts_path in zip(zolpidem_test['name'], zolpidem_test['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total patients tested with zolpidem but neither respondent nor antirespondent (probably just non-responders): %i' % len(zolpidem_test))
print('\n')

In [None]:
# Patients whose report was flagged as containing an ophtalmologic report
print('Patients with an ophtalmologic report:')
ophtalmo = cf.loc[cf['ophtalmologic_report'] == True]
for pts_name, pts_path in zip(ophtalmo['name'], ophtalmo['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total patients with an ophtalmologic report: %i' % len(ophtalmo))
print('\n')

In [None]:
# Patients whose report was flagged as containing a nociception coma scale report
print('Patients with a nociception coma scale report:')
noci = cf.loc[cf['nociception_report'] == True]
for pts_name, pts_path in zip(noci['name'], noci['report_path']):
    print('* ' + pts_name + ': ' + pts_path)
print('Total patients with a nociception coma scale report: %i' % len(noci))
print('\n')

------------------------------------
## Code tests

In [None]:
# Quick manual check of the OCR through pytesseract
try:
    # Try the top-level Image module first, else fall back to the one packaged in PIL
    import Image
except ImportError:
    from PIL import Image
import pytesseract
# OCR the file at report_path (must be set by a previous cell) with the 'fra' language and print the recognized text
print(pytesseract.image_to_string(Image.open(report_path), lang='fra'))

In [None]:
# Run the OCR parser alone on one file and print the resulting text
report_path = 'reports_bad/some-file.pdf'
#report_path = 'reports_all/test.png'
ocrparser = MyOCRParser()
# The parser output is decoded from utf8, passed through _unidecode, then lowercased
report_text = _unidecode(ocrparser.process(report_path, 'utf8').decode('utf8')).lower()
print(report_text)

In [None]:
# Re-extract the report at report_path with OCR enabled, keeping only the report text and its conclusion part (the fields are discarded)
_, report_text, report_conclusion = extract_report_fields(report_path, return_text=True, ocr=True)

In [None]:
# Display the report text obtained by a previous cell
print(report_text)

In [None]:
# Scratch test of an etiology regex on report_text (re.S: '.' also matches line breaks)
# NOTE: re.search returns None when nothing matches, in which case the lines below raise an AttributeError
match_etiology = re.search(r'((accident|trauma|arrest|arret\s+cardiaque|(months|years)\s+|post[\-\s]+(?!hospital))([\(\)a-zA-Z0-9\<\>\-]+\s+){1,9}).+?(le|on)?(\d+(/|-)\s*\d+(/|-)\s*\d+|\d+\s*\w+\s*\d+)', report_text, re.S)
# Group 1: the etiology keyword followed by 1 to 9 words
match_etiology.group(1)
# Group 6: the date-like expression. Only this last expression is displayed by the notebook
match_etiology.group(6)

In [None]:
# Scratch test of the acquisition dates regex ("du/from <start date> au/to <end date>") on report_text
patient_acquisition_dates = re.search(r'(du|from)\s+((\d+[\-/\.\s]+)?(\d+[\-/\.\s]+)?\d+)\s+(au|to)\s+((\d+[\-/\.\s]+)?(\d+[\-/\.\s]+|\w+\s+)\d+)', report_text)
# Use a fresh scratch dict: no patient_fields dict is defined at notebook level, and under Python 2 the name
# can be a leaked list comprehension variable pointing to a real patient's fields, which must not be modified here
patient_fields = {}
if patient_acquisition_dates:
    patient_acquisition_start = patient_acquisition_dates.group(2)  # start date
    patient_acquisition_end = patient_acquisition_dates.group(6)  # end date
    patient_fields['acquisition_date'] = '%s - %s' % (patient_acquisition_start, patient_acquisition_end)
else:
    # re.search returns None when the report contains no such date range: show 'NA' instead of crashing
    patient_fields['acquisition_date'] = 'NA'
# Last expression of the cell, so that the notebook displays the result
patient_fields['acquisition_date']