## Splitting Data from Path Reports

In [2]:
#import statements
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [3]:
#import data
# Read the raw report/label table and correct two recurring misspellings
# ("bresat", "cancinoma") wherever they occur in any cell of the frame.
data = (
    pd.read_csv("PathReport&Labels.csv")
    .replace(to_replace='bresat', value='breast', regex=True)
    .replace(to_replace='cancinoma', value='carcinoma', regex=True)
)
# The columns are renamed positionally, so the CSV must have exactly two columns.
data.columns = ["Path Report", "Label"]
numPatients = len(data)

In [4]:
# Regex alternation of specimen markers: a letter A-K followed by ")", "." or ":".
# Every marker except the first ("A") must be preceded by a space.
letters = "| ".join(letter + "[).:]" for letter in "ABCDEFGHIJK")

In [6]:
#split by specimen
# Make sure every report begins with an "A" specimen marker, so that the split
# below starts with an (empty) piece before the first marker that can be dropped.
# The write goes through a single .loc call: the previous chained form
# data['Path Report'].iloc[i] = ... assigns into a temporary Series, which
# raises SettingWithCopyWarning and does not update `data` under Copy-on-Write.
missingMarker = ~data["Path Report"].str[:2].isin(["A.", "A)", "A:"])
data.loc[missingMarker, "Path Report"] = "A. " + data.loc[missingMarker, "Path Report"]

# Compile the marker alternation once instead of once per report.
specimenMarker = re.compile(letters)

# splitReps[i] is the list of per-specimen text pieces for report i.
splitReps = []
for pathRep in data["Path Report"]:
    splitRep = specimenMarker.split(pathRep)
    # Discard whatever precedes the first marker when at least one marker matched.
    if len(splitRep) > 1:
        splitRep = splitRep[1:]
    splitReps.append(splitRep)

In [7]:
#cell for testing issues with splitting of individual specimens
# This scratch pattern is not the same as `letters`: here the "A" marker needs
# a trailing space and the "C" marker needs no leading space, so the leading
# "A:" in the sample is not split off.
sampleReport = "A:Left Breast at 3:00 , Fine Needle Aspiration: Adenocarcinoma, see comment. B:Left Breast at 2:30, Fine Needle Aspiration: Adenocarcinoma, see comment."
re.compile("A[).:] | B[).:]|C[).:]").split(sampleReport)

['A:Left Breast at 3:00 , Fine Needle Aspiration: Adenocarcinoma, see comment.',
 'Left Breast at 2:30, Fine Needle Aspiration: Adenocarcinoma, see comment.']

In [12]:
#setup DataFrame for biopsy data, extract information and place in appropriate columns
# Start from an empty frame that only declares the per-specimen columns.
biopColumns = ["Patient", "Biopsy Description", "Path Report", "Rad Label", "Laterality"]
biopData = pd.DataFrame(columns=biopColumns)

In [13]:
patIds = range(data.shape[0])
bioType, pathRep, patients, labels = [], [], [], []

# Delimiters between the biopsy description and the report text, compiled once
# rather than on every specimen.
descDelims = re.compile(r"\:|\. ")
colonOnly = re.compile(r"\:")

for patId in patIds:
    patient = data.iloc[patId]
    # Read the specimens from splitReps (built in the split-by-specimen cell).
    # `data` only has the "Path Report" and "Label" columns, so the previous
    # positional lookup patient[2] has no third column to read on a fresh kernel.
    for rep in splitReps[patId]:
        #split report into biopsy description and path report
        splitRep = descDelims.split(rep)
        # A first piece shorter than 2 characters is not a usable description;
        # retry splitting on ":" only.
        if len(splitRep[0]) < 2:
            splitRep = colonOnly.split(rep)

        # Specimens with no delimiter at all are skipped.
        if len(splitRep) > 1:
            bioType.append(splitRep[0])
            # NOTE(review): re-joining with '. ' turns every ':' inside the
            # report text into '. ' -- confirm this is acceptable downstream.
            pathRep.append('. '.join(splitRep[1:]))
            patients.append(patId)
            labels.append(patient["Label"])

# One row per retained specimen; "Patient" is the row position in `data`.
biopData["Patient"] = patients
biopData["Biopsy Description"] = bioType
biopData["Path Report"] = pathRep
biopData["Rad Label"] = labels

In [15]:
# Extracting laterality and biopsy source from each biopsy description

# Ordered (keywords, label) rules for the biopsy source; the first rule with a
# fuzzily matching keyword wins, so "breast" takes priority over "skin", etc.
# NOTE(review): "axilla" is grouped under "uterus" exactly as before -- confirm
# this grouping is intended, since only 'breast' rows are kept below.
SOURCE_RULES = [
    (("breast", "nipple", "mastectomy"), "breast"),
    (("lymph",), "lymph node"),
    (("skin",), "skin"),
    (("axilla", "uterus", "fallopian", "ovary", "adnexa"), "uterus"),
]


def _hasCloseMatch(keyword, tokens):
    """Return True when difflib finds at least one token close to `keyword`."""
    return len(dl.get_close_matches(keyword, tokens)) > 0


def _laterality(tokens):
    """Return "left", "right" or "na" for a tokenized description ("left" is checked first)."""
    for side in ("left", "right"):
        if _hasCloseMatch(side, tokens):
            return side
    return "na"


def _biopsySource(tokens):
    """Return the label of the first SOURCE_RULES entry matching `tokens`, else "na"."""
    for keywords, label in SOURCE_RULES:
        if any(_hasCloseMatch(keyword, tokens) for keyword in keywords):
            return label
    return "na"


lats, organs = [], []
for biop in biopData["Biopsy Description"]:
    # Lower-case before tokenizing so matching is case-insensitive.
    tokens = nltk.word_tokenize(biop.lower())
    lats.append(_laterality(tokens))
    organs.append(_biopsySource(tokens))

biopData["Laterality"] = lats
biopData["Biopsy Source"] = organs

numSamples = biopData.shape[0]

#keeping only breast specimens per research specification 
biopData2 = biopData[biopData["Biopsy Source"] == 'breast']
#fix indexing issue due to dropped specimens: renumber rows 0..n-1 and discard the old index
k2 = biopData2.reset_index(drop=True)


"30 o'clock, 2 cm from nipple), needle biopsy.  Invasive ductal carcinoma, consistent with recurrent ductal carcinoma of the breast; see comment."

In [16]:
#Testing
# Preview only the first rows instead of rendering the whole specimen table.
biopData.head(10)

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background...,Left Positive,left,breast
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radi...,Left Positive,right,breast
5,1,"Left breast, ""mass at 12 o'clock 3 cm from ni...",1. Invasive ductal carcinoma; see comment. 2....,Left Positive,left,breast
6,1,"Right breast, 10 o'clock 4 cm from nipple, ne...",Dense sclerotic fibrous tissue with scant ben...,Left Positive,right,breast
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous...,Negative,right,breast


In [18]:
#Testing
# Spot-check one row of the breast-only frame by position. 4744 is a hard-coded
# row position, so this raises IndexError if k2 has fewer than 4745 rows.
k2.iloc[4744]

Patient                                                            3190
Biopsy Description                 Skin, left superior breast, excision
Path Report            Skin and breast tissue with no significant pa...
Rad Label                                                Right Positive
Laterality                                                         left
Biopsy Source                                                    breast
Name: 4744, dtype: object

In [19]:
#export completed 
# Write each frame to its output file: the per-patient reports, the breast-only
# specimens, and all specimens.
# NOTE(review): the last file name has no ".csv" extension, unlike the other
# two -- confirm whether downstream readers rely on that exact name.
exportTargets = [
    (data, "Path Reports Complete.csv"),
    (k2, "Path Reports (By Specimens).csv"),
    (biopData, 'Path Reports by Specimens for All'),
]
for frame, outPath in exportTargets:
    frame.to_csv(outPath)