## Labeling Path Reports 

In [52]:
#import tools
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [53]:
#import data
data = pd.read_csv("PathReport&Labels.csv")
data.columns = ["Path Report", "Label"]
numPatients = data.shape[0]
path_reports = pd.read_csv("Path Reports Complete.csv")
split_reps = pd.read_csv("Path Reports by Specimens for All").drop("Unnamed: 0", axis=1)
split_reps = split_reps.fillna('')

In [54]:
# natural language processing tools

# Negation Processing, applies _not identifier to words following a negation word
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.!:;-"
    result = []
    for token in tokens:
        if any(c == token for c in delims):
            negation = False

        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no", "without", "negative", "treated"]):
            negation = True
            result[-1] = "not_" + result[-1]
            if len(result) > 1:
                result[-2] = "not_" + result[-2]

    return result

## Classify Each Biopsy according to Path Tree

In [59]:
def classify_regex(pathRep):
    fx_path_rep = re.compile(r'(\=|\-)').sub(' ', pathRep)
    pathRep = ' '.join(negate_sequence(nltk.word_tokenize(fx_path_rep.lower())))
    
    obs = re.split(re.compile(r"[0-9][ ]?\. "), pathRep)
    #if len(obs) > 1:
    #    obs = obs[1:]
    obs = [re.compile(r'(\.|\;|\-|\,)').sub(' ', ' ' + ob + ' ') for ob in obs] 
    
    print(obs)
    # pad with spaces to match markers that come at beginning of strings and remove dashes and commas
    classes = []
    for ob in obs:
        classes.append(classify_breast_ob(ob))
    return classes

atyp_markers = ["flat epithelial atypia", 
                "atypical ductal hyperplasia", 
                "atypical lobular hyperplasia"]

fibro_markers = ["fibroadenoma", "phyllodes"]

ben_markers = ["papilloma", "usual ductal hyperplasia", 
                "apocrine metaplasia", "radial scar",
                "sclerosing adenosis", 
                "pseudoangiomatous stromal hyperplasia",
                "cyst", "mastitis", "benign"]

def classify_breast_ob(ob):
    print(ob)
    label = ["na", "na", "na"]

    # Lymphoma
    if re.search(" lymphoma", ob):
        label[0] = "lymphoma"
        return label
    
    

    # Breast Cancer and Metastases
    if re.search(" lcis", ob) or re.search(" lobular carcinoma in situ", ob):
        label = ["breast cancer", "lobular", "in situ"]
        return label
    elif re.search(" dcis", ob) or re.search(" ductal carcinoma in situ", ob):
        label = ["breast cancer", "ductal", "in situ"]
        return label
    elif re.search(" idc", ob) or re.search(" invasive ductal carcinoma", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    elif re.search(" ilc", ob) or re.search(" invasive lobular carcinoma", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    elif re.search(" lobular carcinoma", ob):
        label = ["breast cancer", "lobular", "na"]
        return label
    elif re.search(" ductal carcinoma", ob):
        label = ["breast cancer", "ductal", "na"]
        return label
    elif re.search(" low grade neoplasm", ob):
        label = ["breast cancer", "neoplasm", "na"]
        return label
    
    if re.search(" carcinoma| cancer| malign| adenocarcinoma| Adenocarcinoma", ob):
        label[0] = "breast cancer"
        if re.search(re.compile("[ -]invasive|[ -]infiltrating"), ob):
            label[2] = "invasive"
        elif re.search(re.compile("in[ -]situ"), ob):
            label[2] = "in situ"
        if re.search(" duct(al)?", ob):
            label[1] = "ductal"
        elif re.search("lobular", ob):
            label[1] = "lobular"
        print(label)
        return label
    if re.search(" metastati(c|s)", ob):
            #if organ == "breast":    
        label[2] = "metastasis"
            #else:
            #    label[0] = "metastasis from non-bc"
    
    

    # Atypical
    for marker in atyp_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "atypical"
            label[1] = marker
            return label
    
    # Fibroepithelial
    for marker in fibro_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "fibroepithelial"
            label[1] = marker
            return label
            
    # Benign
    for marker in ben_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "benign"
            label[1] = marker
            return label
            
    return label
            
    #lymph_marker = re.compile(" lymphoma")
    #mets_marker = re.compile(" metastasis")
    #inv_breast_marker = [re.compile(marker) for marker in []]
    

## Merge labels for each specimen into 1 label by taking the worst of each label

In [66]:
def get_single_label(obs_labels):
    label = ["na", "na", "na"]
    first_level = [labels[0] for labels in obs_labels]
    second_level = [labels[1] for labels in obs_labels]
    
    try: 
        if "metastasis" in [labels[2] for labels in obs_labels]:
            label[2] = "metastasis"
        if "lymphoma" in first_level:
            label[0] = "lymphoma"
        #elif "metastasis from non-bc" in first_level:
        #    label[0] = "metastasis from non-bc"
        elif "breast cancer" in first_level:
            label[0] = "breast cancer"
            if "neoplasm" in [labels[1] for labels in obs_labels]:
                label[1] = "neoplasm"
            
            second_level = [labels[2] for labels in obs_labels]
            third_level = [labels[1] for labels in obs_labels]
            if "metastasis" in second_level:
                label[2] = "metastasis"
            else:
                if "invasive" in second_level:
                    label[2] = "invasive"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "invasive"]
                elif "in situ" in second_level:
                    label[2] = "in situ"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "in situ"]

                if "ductal" in third_level:
                    label[1] = "ductal"
                elif "lobular" in third_level:
                    label[1] = "lobular"
        elif "atypical" in first_level:
            label[0] = "atypical"
            for marker in atyp_markers:
                if marker in second_level:
                    label[1] = marker
        elif "fibroepithelial" in first_level:
            label[0] = "atypical"
            for marker in fibro_markers:
                if marker in second_level:
                    label[1] = marker
        elif "benign" in first_level:
            label[0] = "benign"
            for marker in ben_markers:
                if marker in second_level:
                    if marker != 'benign':
                        label[1] = marker
    except:
        print(obs_labels)
                
    return tuple(label)

## Exporting

In [67]:
split_reps["Single Label"] = split_reps["All Labels"].apply(get_single_label)
split_reps.to_csv("k3.csv")

In [68]:
labeled_split_reps = split_reps.drop("Rad Label", axis=1).rename(index=str, 
    columns={"Patient": "Patient ID", 
             "Biopsy Description": "Path Report I",
             "Path Report": "Path Report II",
             "Laterality": "Laterality [Derived]",
             "Biopsy Source": "Organ [Derived]",
             "All Labels": "All Labels [Derived]",
             "Single Label": "Single Label [Derived]"})

In [1]:
#get a single label instead of the 3-part labels which indicated only cancer/atypical/benign/none without detail
def get_binary_label(patient_data):
    laterality = []
    for i, specimen in patient_data.iterrows():
        if specimen["Single Label [Derived]"][0] == "breast cancer" or specimen["Single Label [Derived]"][0] == "lymphoma" or specimen["Single Label [Derived]"][2]=="metastasis":
            laterality.append(specimen["Laterality [Derived]"])
    if "right" in laterality and "left" in laterality:
        return "Bilateral Positive"
    elif "right" in laterality:
        return "Right Positive"
    elif "left" in laterality:
        return "Left Positive"
    elif laterality:
        return "Positive NOS"
    else:
        return "Negative"

def flatten_list(nestedl):
    [item for sublist in nestedl for item in sublist]

patient_labels, binary_labels = [], []
for patID in range(numPatients):
    reps = labeled_split_reps[(labeled_split_reps["Patient ID"] == patID)]
    labels = reps["Single Label [Derived]"].tolist()
    single_label = get_single_label(labels)
    patient_labels.append(single_label)
    binary_labels.append(get_binary_label(reps))
    
    

labeled_data = data
labeled_data["Single Label"] = patient_labels
labeled_data["Binary Label [Derived]"] = binary_labels
labeled_data

NameError: name 'numPatients' is not defined

In [71]:
# Testing
k5 = labeled_data[labeled_data["Label"] != labeled_data["Binary Label [Derived]"]]
k5.to_csv("k6.csv")

In [72]:
#don't use this cell
#k7 = labeled_data[labeled_data["Label"] == labeled_data["Binary Label [Derived]"]]
#k7.to_csv('first_test.csv')


In [73]:
# Testing
ob = labeled_data.iloc[1030]["Path Report"]

classify_breast_ob(ob)
if re.search(" carcinoma| cancer| malign| adenocarcinoma| Adenocarcinoma", ob):
    print('hi')

Left chest near scar, fine needle aspirate: Adenocarcinoma, consistent with breast primary. See comment.
['breast cancer', 'na', 'na']
hi


In [270]:
#exporting
labeled_split_reps.to_csv("Labeled Path Reports (Specimen).csv")
labeled_data.to_csv("Labeled Path Reports (Entire Report).csv")

Analysis of Processed Data Charts

In [271]:
# Testing
labeled_data[labeled_data["Label"] != labeled_data["Binary Label [Derived]"]]

Unnamed: 0,Path Report,Label,Single Label,Binary Label [Derived]
28,"A. Breast, needle-localization excision biopsy...",Right Positive,"(breast cancer, ductal, na)",Positive NOS
344,"Right Breast, 10:00 o'clock, Fine Needle Aspir...",Negative,"(breast cancer, na, na)",Right Positive
348,"A. Left nipple, re-excision: Benign breast tis...",Left Positive,"(benign, na, na)",Negative
669,CONSULT SLIDE FROM WESTERN PATHOLOGY CONSULTAN...,Negative,"(breast cancer, na, invasive)",Left Positive
863,"1. Breast, left, excisional biopsy: Highly aty...",Left Positive,"(na, na, na)",Negative
965,"A. Right axillary lymph nodes, excision: Metas...",Right Positive,"(benign, na, na)",Negative
984,"A. Node, site not further specified, sentinel ...",Negative,"(breast cancer, na, na)",Left Positive
1030,"Left chest near scar, fine needle aspirate: Ad...",Left Positive,"(na, na, na)",Negative
1057,"A. Right Chest Wall, Soft Tissue, Fine Needle ...",Right Positive,"(na, na, na)",Negative
1207,"A. BREAST, LEFT, FINE NEEDLE ASPIRATION: B cel...",Bilateral Positive,"(lymphoma, na, na)",Left Positive


In [419]:
split_reps.groupby("Single Label").agg(['count'])

Unnamed: 0_level_0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels
Unnamed: 0_level_1,count,count,count,count,count,count,count
Single Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
"(atypical, atypical ductal hyperplasia, na)",73,73,73,73,73,73,73
"(atypical, atypical lobular hyperplasia, na)",91,91,91,91,91,91,91
"(atypical, fibroadenoma, na)",316,316,316,316,316,316,316
"(atypical, flat epithelial atypia, na)",35,35,35,35,35,35,35
"(atypical, phyllodes, na)",10,10,10,10,10,10,10
"(benign, apocrine metaplasia, na)",165,165,165,165,165,165,165
"(benign, cyst, na)",208,208,208,208,208,208,208
"(benign, mastitis, na)",4,4,4,4,4,4,4
"(benign, na, na)",1144,1144,1144,1144,1144,1144,1144
"(benign, papilloma, na)",50,50,50,50,50,50,50


In [420]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    display(x)
    pd.reset_option('display.max_rows')
split_reps[split_reps["Single Label"] == ("benign", "cyst", "na")]#["Path Report"][4023]

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels,Single Label
109,70,"Skin, right breast, 1st incision, excision",Skin with scar and epidermal inclusion cyst. ...,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
141,96,"Left Breast, 6","00, 3 cm from nipple, Fine Needle Aspiration. ...",Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
165,113,"Left breast #1, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
166,113,"Left breast #2, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
167,113,"Left breast #3, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
168,113,"Left breast #4, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
232,161,"Right Breast, Fine Needle Aspiration",Simple cyst and fibrocystic change; see comment.,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
246,172,"Right Breast #2, Fine Needle Aspiration",Benign simple cyst.,Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
274,193,"Left breast, fine needle aspiration",Benign cyst fluid; see comment.,Negative,left,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
320,222,"BREAST, RIGHT 10","00, FINE NEEDLE ASPIRATION. Benign cyst with ...",Negative,right,breast,"[[benign, cyst, na]]","(benign, cyst, na)"
