In [65]:
#import statements
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [66]:
#import data

##### PUT THE FILE NAME OF THE INPUT FILE HERE (below) #####

data = pd.read_csv("new_reports.csv", encoding='latin-1')

##### PUT THE FILE NAME OF THE INPUT FILE HERE (above) #####

data = data.replace(to_replace='bresat', value='breast', regex=True)
data = data.replace(to_replace='cancinoma', value='carcinoma', regex=True)
data = data.replace(to_replace='å', value='', regex=True)
data = data.replace(to_replace='Ê', value='', regex=True)
data.columns = ["Patient ID", "Path Report"]
numPatients = data.shape[0]

In [67]:
letters = "A[).:]| B[).:]| C[).:]| D[).:]| E[).:]| F[).:]| G[).:]| H[).:]| I[).:]| J[).:]| K[).:]"
#split by specimen
for i in range(0, len(data)):
    if data['Path Report'].iloc[i][0:2] not in ['A.', 'A)', 'A:']:
        data['Path Report'].iloc[i] = 'A. ' + data['Path Report'].iloc[i]
splitReps = []
for pathRep in data["Path Report"]:
    splitRep = re.split(letters, pathRep)
    if len(splitRep) > 1:
        splitRep = splitRep[1:]
    splitReps.append(splitRep)
data['splitReps'] = splitReps

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [68]:
#setup DataFrame for biopsy data, extract information and place in appropriate columns
biopData = pd.DataFrame(columns = ["Patient", "Biopsy Description", "Path Report"])

patIds = range(data.shape[0])
bioType, pathRep, patients = [], [], []

for patId in patIds:
    patient = data.iloc[patId]
    for rep in patient[2]:
        #split report into biopsy description and path report
        splitRep = re.split(re.compile(r"\:[^0-9]|\. "), rep)
        if len(splitRep[0]) < 2:
            splitRep = re.split(re.compile(r"\:[^0-9]"), rep)
        
        if len(splitRep) > 1:
            bioType.append(splitRep[0])
            pathRep.append('. '.join(splitRep[1:]))
            patients.append(data.iloc[patId]['Patient ID'])
            
biopData["Patient"] = patients
biopData["Biopsy Description"] = bioType
biopData["Path Report"] = pathRep

In [82]:
# Extracting laterality, biopsy source, and labels
lats, organs = [], []
for biop in biopData["Biopsy Description"]:
    biop = nltk.word_tokenize(biop.lower())
    
    if len(dl.get_close_matches("left", biop)) > 0:
        lats.append("left")
    elif len(dl.get_close_matches("right", biop)) > 0:
        lats.append("right")
    else:
        lats.append("na")

    if len(dl.get_close_matches("breast", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("nipple", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("mastectomy", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("lymph", biop)) > 0:
        organs.append("lymph node")
    elif len(dl.get_close_matches("skin", biop)) > 0:
        organs.append("skin")
    elif len([word for wordList in [dl.get_close_matches(br, biop) 
            for br in ["axilla", "uterus", "fallopian", "ovary", "adnexa"]] 
              for word in wordList]) > 0:
        organs.append("uterus")
    else:
        organs.append("na")
biopData["Laterality"] = lats
biopData["Biopsy Source"] = organs

numSamples = biopData.shape[0]

biopData
#keeping only breast specimens per research specification 
biopData2 = biopData[biopData["Biopsy Source"] == 'breast']
#fix indexing issue due to dropped specimens
k = biopData2.reset_index()
split_reps = k.drop(labels='index', axis=1)



In [96]:
# natural language processing tools

# Negation Processing, applies _not identifier to words following a negation word
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.!:;-"
    result = []
    for token in tokens:
        if any(c == token for c in delims):
            negation = False

        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no", "without", "negative", "treated"]):
            negation = True
            result[-1] = "not_" + result[-1]
            if len(result) > 1:
                result[-2] = "not_" + result[-2]

    return result

def classify_regex(pathRep):
    fx_path_rep = re.compile(r'(\=|\-)').sub(' ', pathRep)
    pathRep = ' '.join(negate_sequence(nltk.word_tokenize(fx_path_rep.lower())))
    
    obs = re.split(re.compile(r"[0-9][ ]?\. "), pathRep)
    #if len(obs) > 1:
    #    obs = obs[1:]
    obs = [re.compile(r'(\.|\;|\-|\,)').sub(' ', ' ' + ob + ' ') for ob in obs] 
    
    # pad with spaces to match markers that come at beginning of strings and remove dashes and commas
    classes = []
    for ob in obs:
        classes.append(classify_breast_ob(ob))
    return classes

atyp_markers = ["flat epithelial atypia", 
                "atypical ductal hyperplasia", 
                "atypical lobular hyperplasia"]

fibro_markers = ["fibroadenoma", "phyllodes"]

ben_markers = ["papilloma", "usual ductal hyperplasia", 
                "apocrine metaplasia", "radial scar",
                "sclerosing adenosis", 
                "pseudoangiomatous stromal hyperplasia",
                "cyst", "mastitis", "benign"]

def classify_breast_ob(ob):
    label = ["na", "na", "na"]

    # Lymphoma
    if re.search(" lymphoma", ob):
        label[0] = "lymphoma"
        return label
    
    

    # Breast Cancer and Metastases
    if re.search(" lcis", ob) or re.search(" lobular carcinoma in situ", ob):
        label = ["breast cancer", "lobular", "in situ"]
        return label
    elif re.search(" dcis", ob) or re.search(" ductal carcinoma in situ", ob):
        label = ["breast cancer", "ductal", "in situ"]
        return label
    elif re.search(" idc", ob) or re.search(" invasive ductal carcinoma", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    elif re.search(" ilc", ob) or re.search(" invasive lobular carcinoma", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    elif re.search(" lobular carcinoma", ob):
        label = ["breast cancer", "lobular", "na"]
        return label
    elif re.search(" ductal carcinoma", ob):
        label = ["breast cancer", "ductal", "na"]
        return label
    elif re.search(" low grade neoplasm", ob):
        label = ["breast cancer", "neoplasm", "na"]
        return label
    
    if re.search(" carcinoma| cancer| malign| adenocarcinoma| Adenocarcinoma", ob):
        label[0] = "breast cancer"
        if re.search(re.compile("[ -]invasive|[ -]infiltrating"), ob):
            label[2] = "invasive"
        elif re.search(re.compile("in[ -]situ"), ob):
            label[2] = "in situ"
        if re.search(" duct(al)?", ob):
            label[1] = "ductal"
        elif re.search("lobular", ob):
            label[1] = "lobular"
        return label
    if re.search(" metastati(c|s)", ob):
            #if organ == "breast":    
        label[2] = "metastasis"
            #else:
            #    label[0] = "metastasis from non-bc"
    
    

    # Atypical
    for marker in atyp_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "atypical"
            label[1] = marker
            return label
    
    # Fibroepithelial
    for marker in fibro_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "fibroepithelial"
            label[1] = marker
            return label
            
    # Benign
    for marker in ben_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "benign"
            label[1] = marker
            return label
            
    return label
            
    #lymph_marker = re.compile(" lymphoma")
    #mets_marker = re.compile(" metastasis")
    #inv_breast_marker = [re.compile(marker) for marker in []]
    
def get_single_label(obs_labels):
    label = ["na", "na", "na"]
    first_level = [labels[0] for labels in obs_labels]
    second_level = [labels[1] for labels in obs_labels]
    
    try: 
        if "metastasis" in [labels[2] for labels in obs_labels]:
            label[2] = "metastasis"
        if "lymphoma" in first_level:
            label[0] = "lymphoma"
        #elif "metastasis from non-bc" in first_level:
        #    label[0] = "metastasis from non-bc"
        elif "breast cancer" in first_level:
            label[0] = "breast cancer"
            if "neoplasm" in [labels[1] for labels in obs_labels]:
                label[1] = "neoplasm"
            
            second_level = [labels[2] for labels in obs_labels]
            third_level = [labels[1] for labels in obs_labels]
            if "metastasis" in second_level:
                label[2] = "metastasis"
            else:
                if "invasive" in second_level:
                    label[2] = "invasive"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "invasive"]
                elif "in situ" in second_level:
                    label[2] = "in situ"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "in situ"]

                if "ductal" in third_level:
                    label[1] = "ductal"
                elif "lobular" in third_level:
                    label[1] = "lobular"
        elif "atypical" in first_level:
            label[0] = "atypical"
            for marker in atyp_markers:
                if marker in second_level:
                    label[1] = marker
        elif "fibroepithelial" in first_level:
            label[0] = "atypical"
            for marker in fibro_markers:
                if marker in second_level:
                    label[1] = marker
        elif "benign" in first_level:
            label[0] = "benign"
            for marker in ben_markers:
                if marker in second_level:
                    if marker != 'benign':
                        label[1] = marker
    except:
        print(obs_labels)
                
    return tuple(label)

def fix_label(t):
    t = str(t)
    t = re.sub(r"\('", "", t)
    t = re.sub(r"'\)", "", t)
    t = re.sub(r"', '", "_", t)
    return t

In [95]:
type(str(labeled_split_reps['Single Label [Derived]'].values[0]))

str

In [99]:
split_reps['All Labels'] = split_reps['Path Report'].apply(classify_regex)
split_reps["Single Label"] = split_reps["All Labels"].apply(get_single_label)
patient_labels, binary_labels = [], []
labeled_split_reps = split_reps.rename(index=str, 
    columns={"Patient": "Patient ID", 
             "Biopsy Description": "Path Report I",
             "Path Report": "Path Report II",
             "Laterality": "Laterality [Derived]",
             "Biopsy Source": "Organ [Derived]",
             "All Labels": "All Labels [Derived]",
             "Single Label": "Single Label [Derived]"})
labeled_split_reps = labeled_split_reps.drop('All Labels [Derived]', axis=1)
labeled_split_reps['Single Label [Derived]'] = labeled_split_reps['Single Label [Derived]'].apply(fix_label)

In [101]:
##### PUT THE FILE NAME OF THE OUTPUT FILE HERE (below) #####

labeled_split_reps.to_csv('new_output_from_preprocess_clean.csv')

##### PUT THE FILE NAME OF THE OUTPUT FILE HERE (above) #####