### Import modules

In [None]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

### Import data (text files)

In [None]:
## Load the raw notes table from disk
rawtexts = pd.read_csv('filename.csv')

## Keep only the notes whose text begins with the transfusion reaction workup
## header; rows with a missing note_text are dropped (na=False)
name = "UCSF MEDICAL CENTER  DEPARTMENT OF LABORATORY MEDICINE  DIVISION OF TRANSFUSION MEDICINE  TRANSFUSION REACTION WORKUP"
is_workup_note = rawtexts['note_text'].str.startswith(name, na=False)
reports = rawtexts[is_workup_note]

## Quick sanity check: table dimensions and number of distinct Deid_Key values
print(f'shape:{reports.shape}')
print(f'id: {reports.Deid_Key.nunique()}')


### Rule-based NLP: 

#### Step 1-2: Identify Key Mentions and Extract Texts



In [6]:

## Column order of the extraction table (same layout as before)
EXTRACTION_COLUMNS = ['_id', 'note_date', 'note_text', 'Date_Time', 'Date',
                      'Impression', 'ImpressionLong', 'Case definition',
                      'Severity', 'Imputability', 'ProductID',
                      'ProductType', 'ProductInfo']

## Every pattern is compiled once here instead of once per note.
## The pattern strings themselves are unchanged.
DATE_TIME_RE = re.compile('of transfusion reactions:(.[0-9]*.[0-9]*.[0-9]*.[0-9]+)|of transfusion reaction:(.[0-9]*.[0-9]*.[0-9]*.[0-9]+)|Date of transfusion reaction (.[0-9]*.[0-9]*.[0-9]*.[0-9]+)|Issue date/time:(.[0-9]*.[0-9].+) Chief.*|Date of Transfusion Reaction Workup:(.[0-9]*.[0-9].+)|Issue date/time:(.[0-9]*.[0-9].+)', re.IGNORECASE)
DATE_RE = re.compile('([0-9]+.[0-9]*.[0-9]+)')
### IMPRESSION (short: up to 10 tokens after the key mention; long: up to 50)
IMPRESSION_RE = re.compile(r'Impression:\s*(?:\S+\s*){0,10}|Impression\s*(?:\S+\s*){0,10}|Transfusion-associated adverse reaction\*\*\*\*\*:\s*(?:\S+\s*){0,10}|Transfusion-associated adverse reaction\*:\s*(?:\S+\s*){0,10}|Transfusion-associated adverse,reaction\*:\s*(?:\S+\s*){0,10}')
IMPRESSION_LONG_RE = re.compile(r'Impression:\s*(?:\S+\s*){0,50}|Impression\s*(?:\S+\s*){0,50}|Transfusion-associated adverse reaction\*\*\*\*\*:\s*(?:\S+\s*){0,50}|Transfusion-associated adverse reaction\*:\s*(?:\S+\s*){0,50}|Transfusion-associated adverse,reaction\*:\s*(?:\S+\s*){0,50}')
### CASE DEFINITION / SEVERITY / IMPUTABILITY (first token after the label)
CASE_DEFINITION_RE = re.compile(r'Case definition:\s*(\S+)')
SEVERITY_RE = re.compile(r'Severity:\s*(\S+)')
IMPUTABILITY_RE = re.compile(r'Imputability:\s*(\S+)')
### PRODUCT
PRODUCT_ID_RE = re.compile(r' *W\s*(\d+.\d+.\d+.\d+)')
PRODUCT_TYPE_RE = re.compile(r' *W[0-9 ]+\(([^()]*)\)')
PRODUCT_INFO_RE = re.compile(r'Product\s*(?:\S+\s*){0,25}')


def _matches_to_text(matches):
    """Render a findall() result as text by dropping the first and last two
    characters of its list repr (e.g. "['a', 'b']" -> "a', 'b"; "[]" -> "").

    This odd format is kept on purpose: the date standardization step later
    splits the 'Date' column on the "'," separator that it produces.
    """
    return str(matches)[2:-2]


def extract_report_fields(note_text):
    """Run all key-mention patterns over one note and return the extracted
    fields as a dict keyed by extraction column name.

    'ImpressionLong' keeps the raw list of matches; every other field is the
    flattened text produced by _matches_to_text.
    """
    s = str(note_text)
    date_time = _matches_to_text(DATE_TIME_RE.findall(s))
    return {
        'Date_Time': date_time,
        # The date is searched inside the already-extracted date/time text
        'Date': _matches_to_text(DATE_RE.findall(date_time)),
        'Impression': _matches_to_text(IMPRESSION_RE.findall(s)),
        'ImpressionLong': IMPRESSION_LONG_RE.findall(s),
        'Case definition': _matches_to_text(CASE_DEFINITION_RE.findall(s)),
        'Severity': _matches_to_text(SEVERITY_RE.findall(s)),
        'Imputability': _matches_to_text(IMPUTABILITY_RE.findall(s)),
        'ProductID': _matches_to_text(PRODUCT_ID_RE.findall(s)),
        'ProductType': _matches_to_text(PRODUCT_TYPE_RE.findall(s)),
        'ProductInfo': _matches_to_text(PRODUCT_INFO_RE.findall(s)),
    }


## Build one record per report and create the DataFrame once at the end.
## DataFrame.append was removed in pandas 2.0 and growing a frame row by row
## is quadratic in the number of reports.
records = []
failed_rows = []  # (position, error) for notes that could not be processed
for position in tqdm(range(len(reports))):
    # Start from an all-missing record so a failing note still yields one row
    # and output rows stay aligned with `reports`.
    record = dict.fromkeys(EXTRACTION_COLUMNS)
    try:
        report = reports.iloc[position]
        record['_id'] = report['Deid_Key']
        record['note_date'] = report['note_date']
        record['note_text'] = report['note_text']
        record.update(extract_report_fields(report['note_text']))
    except Exception as err:
        # Best effort as before, but the failure is recorded instead of being
        # swallowed silently; fields not yet filled stay missing (None).
        failed_rows.append((position, repr(err)))
    records.append(record)

df = pd.DataFrame(records, columns=EXTRACTION_COLUMNS)

if failed_rows:
    print('extraction failed for ' + str(len(failed_rows)) + ' report(s); first failures: '
          + str(failed_rows[:5]))




  0%|          | 0/4334 [00:00<?, ?it/s]

#### Step 3: Extraction standardization

In [8]:
## Standardizing transfusion reactions

# Ordered standardization rules: (label, case-insensitive patterns, case-sensitive patterns).
# Rules are applied top to bottom and a later rule overwrites an earlier one,
# so the order below reproduces the original statement order.
# Patterns are regular expressions (raw strings, so "\(" etc. are real escapes).
_BTAR_RULES = (
    ('FNHTR', (r'Febrile Non-Hemolytic',
               r'febrile,  non-hemolytic transfusion reaction',
               r'Febrile, non-hemolytic transfusion reaction',
               r'Febrile,Non-Hemolytic Transfusion Reaction \(FNHTR\)',
               r'Febrile-Non Hemolytic Reaction \(TR20, 999.8\)',
               r'Febrile nonhemolytic transfusion reaction',
               r'Febrile,non-hemolytic transfusion reaction',
               r'Febrile Non hemolytic transfusion reaction'), ()),
    ('ATR', (r'Allergic Transfusion',
             r'Allergic reaction',
             r'Allergic,Transfusion Reaction',
             r'allergic transfusion reaction',
             r'Anaphylactic Transfusion Reaction',
             r'Anaphylactoid transfusion reaction',
             r'Anaphylactic,Transfusion Reaction',
             r'Allergic reaction  Cas',
             r' allergic reaction \[TR33\] '), ()),
    ('DSTR', (r'Delayed Serologic Transfusion',
              r'Delayed Serology Transfusion'), ()),
    ('TACO', (r'Transfusion Associated Circulatory',
              r'Transfusion Associated Cardiac Overload',
              r'Transfusion-Associated Circulatory',
              r'Transfusion-associated cardiac overload',
              r'Transfusion,Associated Circulatory Overload'),
     (r'TACO',)),
    ('TAD', (r'Transfusion Associated Dyspnea',
             r'Transfusion-Associated Dyspnea'), ()),
    ('DHTR', (r'Delayed Hemolytic Transfusion',), ()),
    ('HTR', (r'Hypotensive Transfusion',
             r'Hypotension related to transfusion'), ()),
    ('AHTR', (r'Acute Hemolytic Transfusion',), ()),
    ('TTI', (r'Transfusion-Transmitted Infection',), ()),
    ('TRALI', (r'Transfusion Related Acute Lung Injury',
               r'Transfusion-Related Acute Lung Injury'), ()),
    ('PTP', (r'Post-Transfusion Purpura',), ()),
    ('Other/Unknown', (r'Other Transfusion Reaction',
                       r'Other Transfusion,Reaction'), ()),
    ('Other/Unknown', (r'Unknown Transfusion Reaction',
                       r'Unknown,Transfusion Reaction'),
     (r'Transfusion-associated adverse reaction\*: Unknown',
      r'Unknown reaction')),
)


def standardize_btar(cases, colname, newcolname):
    """Map free-text transfusion reaction mentions to standard abbreviations.

    Parameters
    ----------
    cases : pandas.DataFrame
        Table holding the extracted text. It is modified in place.
    colname : str
        Name of the text column to search.
    newcolname : str
        Name of the column that receives the standardized label. It is
        (re)created on every call; rows matching no rule hold NaN.

    Returns
    -------
    pandas.DataFrame
        The same `cases` object, with `newcolname` filled in.

    Notes
    -----
    When a text matches several rules, the rule listed last in `_BTAR_RULES`
    wins. Missing text never matches (na=False).
    """
    text = cases[colname]

    # Start as an object column: writing string labels into a float NaN column
    # is a deprecated incompatible-dtype assignment in recent pandas.
    cases[newcolname] = np.nan
    cases[newcolname] = cases[newcolname].astype(object)

    for label, insensitive_patterns, sensitive_patterns in _BTAR_RULES:
        # Positional boolean mask: True where any pattern of this rule matches
        hit = np.zeros(len(cases), dtype=bool)
        for pattern in insensitive_patterns:
            hit |= text.str.contains(pattern, case=False, na=False).to_numpy(dtype=bool)
        for pattern in sensitive_patterns:
            hit |= text.str.contains(pattern, case=True, na=False).to_numpy(dtype=bool)
        cases.loc[hit, newcolname] = label

    return cases

def _label_by_keyword(values, keyword_labels):
    """Return an object Series (same index as `values`) holding, for each row,
    the label of the LAST (keyword, label) pair whose keyword occurs in the
    text, case-insensitively; rows matching no keyword hold NaN.
    """
    labels = pd.Series(np.nan, index=values.index, dtype=object)
    for keyword, label in keyword_labels:
        hit = values.str.contains(keyword, case=False, na=False).to_numpy(dtype=bool)
        labels[hit] = label
    return labels


## 'Impression' is the reaction text extracted in Step 1-2. The column name
## used here before ('BTAR_extraction') is never created in this notebook.
## NOTE(review): confirm 'Impression' (not 'ImpressionLong') is the intended source.
df = standardize_btar(df, 'Impression', 'BTAR_standardized')

## Standardizing reaction dates
## 'Date' holds matches joined as "<first>', '<second>"; split once on "',".
## reindex guarantees both columns exist even when no row has a second match.
date_parts = df['Date'].str.split("',", n=1, expand=True).reindex(columns=[0, 1])
df['date_'] = date_parts[0]
df['time_'] = date_parts[1]
df['date'] = (df['date_']
              .str.replace(" '", "", regex=False)
              .str.replace("-", "/", regex=False))
## Unparseable dates become NaT instead of raising
df['date'] = pd.to_datetime(df['date'], errors='coerce')

## Standardizing case definition
df['case definition'] = _label_by_keyword(df['Case definition'], [
    ('definit', 'Definitive'),
    ('probab', 'Probable'),
    ('possib', 'Possible'),
])

## Standardizing severity
## 'severe' is applied first because it is a substring of 'non-severe';
## the non-severe rules then overwrite it (the old order relabelled every
## 'Non-Severe' as 'Severe').
df['severity'] = _label_by_keyword(df['Severity'], [
    ('severe', 'Severe'),
    ('non severe', 'Non-Severe'),
    ('non-severe', 'Non-Severe'),
    ('life', 'Life-Threatening'),
    ('death', 'Death'),
    ('not', 'Not Determined'),
])

## Standardizing imputability
## Each keyword now maps to its own category (labels were shifted by one
## before: definit->Probable, probab->Possible, possib->Doubtful).
df['imputability'] = _label_by_keyword(df['Imputability'], [
    ('definit', 'Definite'),
    ('probab', 'Probable'),
    ('possib', 'Possible'),
    ('doubt', 'Doubtful'),
    ('rule', 'Ruled Out'),
    ('not', 'Not Determined'),
])




### Export tabular data

In [None]:
## Write the final table to CSV: header row kept, no index column.
## NOTE(review): this is the same placeholder path passed to read_csv when
## loading the raw notes - confirm the real input and output paths differ.
output_path = "filename.csv"
df.to_csv(output_path, header=True, index=None)

