## Tools for NLP

In [1]:
#import statements
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

## Accuracy, Precision, and Recall

In [2]:
def accuracy(data):
    """Fraction of rows whose "Extracted Label" equals their "Label".

    Args:
        data: DataFrame with "Label" and "Extracted Label" columns.

    Returns:
        Number of matching rows divided by the total number of rows.
    """
    is_match = data["Label"] == data["Extracted Label"]
    return sum(is_match) / len(data)

def precision(data):
    """Precision of the extracted labels: TP / number of rows classified as positive.

    A row counts as positive when the string "Positive" occurs in its label,
    so "Left Positive", "Right Positive" and "Bilateral Positive" all qualify.

    Args:
        data: DataFrame with "Label" (reference) and "Extracted Label"
            (classifier output) columns holding strings.

    Returns:
        Share of rows extracted as positive whose reference label is also positive.
    """
    # Rows that the extraction step flagged as positive
    flagged_mask = ["Positive" in extracted for extracted in data["Extracted Label"]]
    flagged = data[flagged_mask]

    # Of those, the ones whose reference label is positive too
    true_pos = sum("Positive" in actual for actual in flagged["Label"])

    return true_pos / len(flagged)

def recall(data):
    """Recall of the extracted labels: TP / number of rows that are actually positive.

    A row counts as positive when the string "Positive" occurs in its label.

    Args:
        data: DataFrame with "Label" (reference) and "Extracted Label"
            (classifier output) columns holding strings.

    Returns:
        Share of reference-positive rows whose extracted label is also positive.
    """
    # Rows whose reference label is positive
    truly_pos_mask = ["Positive" in actual for actual in data["Label"]]
    truly_pos = data[truly_pos_mask]

    # Of those, the ones the extraction step also flagged as positive
    true_pos = sum("Positive" in extracted for extracted in truly_pos["Extracted Label"])

    return true_pos / len(truly_pos)

# Score the "Extracted Label" column of `data` against its "Label" column.
# NOTE(review): `data` is not created anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel; load or build `data` in an earlier cell.
(accuracy(data), precision(data), recall(data))

NameError: name 'data' is not defined

## Naive Bayes Classifier

Use Naive Bayes to classify the path reports that the regex cannot identify correctly (labels such as ilc, idc, fibro, etc.), i.e., the reports in `misSamples`.

In [None]:
# Biopsy rows of patients whose extracted label disagrees with the reference label
misPatIDs = data.loc[data["Label"] != data["Extracted Label"]].index.values
misSamples = biopData.iloc[
    [pos for pos in range(numSamples) if biopData["Patient"].iloc[pos] in misPatIDs]
]
misSamples

In [None]:
# corrSamples = biopsy rows of patients whose extracted label matches the reference label
corrPatIDs = data[data["Label"] == data["Extracted Label"]].index.values
corrSamples = biopData.iloc[[row for row in range(numSamples) if biopData.iloc[row]["Patient"] in corrPatIDs]]
# Renumber the rows 0..n-1 and discard the old index. This replaces
# reset_index().drop("index", 1): passing the axis positionally to drop() is
# rejected by pandas >= 2.0.
corrSamples = corrSamples.reset_index(drop=True)
corrSamples

# Featurize the path reports
- Develop a featurization of the correct reports
- Instead of just counting frequencies of all tokens, we will use n-grams, and identify the most useful words


## Method 1: Use entire vocabulary to determine feature set

In [None]:
def accuracy(classified, reference):
    """Fraction of samples whose classified label equals the reference label.

    Args:
        classified: sequence of labels returned by the classifier.
        reference: sequence of correct labels, same length and order.

    Returns:
        Number of matching positions divided by the number of samples.

    Raises:
        AssertionError: if the two sequences differ in length.
        ValueError: if the sequences are empty (accuracy is undefined).
    """
    assert len(classified) == len(reference)
    numData = len(classified)
    if numData == 0:
        raise ValueError("accuracy is undefined for an empty set of labels")
    # Compare position by position; the previous version indexed with an
    # undefined `i` and returned a misspelled name, so it could never run.
    numClassifiedCorrect = sum(guess == truth for guess, truth in zip(classified, reference))
    return numClassifiedCorrect / numData

def precision(classified, reference):
    """Per-label precision: of the samples classified as x, the share that really are x.

    Args:
        classified: sequence of labels returned by the classifier.
        reference: sequence of correct labels, same length and order.

    Returns:
        dict mapping each label that occurs in `reference` to
        TP / (number of samples classified as that label). A label that the
        classifier never predicted gets 0.0 instead of raising
        ZeroDivisionError.
    """
    labels = set(reference)

    precs = {}
    for label in labels:
        classifiedAsLabel = [i for i, guess in enumerate(classified) if guess == label]
        if not classifiedAsLabel:
            # Label exists in the reference set but was never predicted.
            precs[label] = 0.0
            continue
        TP = sum(reference[i] == label for i in classifiedAsLabel)
        precs[label] = TP / len(classifiedAsLabel)
    return precs

def recall(classified, reference):
    """Per-label recall: of the samples that really are x, the share classified as x.

    Args:
        classified: sequence of labels returned by the classifier.
        reference: sequence of correct labels, same length and order.

    Returns:
        dict mapping each label that occurs in `reference` to
        TP / (number of samples whose reference label is that label).
    """
    recs = {}
    for label in set(reference):
        # One boolean per sample that truly carries `label`: was it caught?
        caught = [classified[i] == label for i, truth in enumerate(reference) if truth == label]
        recs[label] = sum(caught) / len(caught)
    return recs

def getPRBoW(pathRep, vocab):
    """Build a bag-of-features dict for one path report.

    Args:
        pathRep: the report; anything with a ``count`` method. For a string
            this is a substring count, so a feature also matches inside
            longer words.
        vocab: iterable of features. A feature is either a single token
            (str) or an n-gram given as a tuple/list of tokens.

    Returns:
        dict mapping each feature (as given in `vocab`) to the number of
        times it occurs in `pathRep`.
    """
    bow = {}
    for word in vocab:
        # Only n-grams are joined with spaces. Joining a plain string would
        # put a space between every character ("idc" -> "i d c") and the
        # token would almost never be found.
        phrase = word if isinstance(word, str) else ' '.join(word)
        bow[word] = pathRep.count(phrase)
    return bow

def test_feature_set(features):
    """Train and score a Naive Bayes classifier using `features` as the vocabulary.

    Side effects: overwrites the "Path Report BoW" column of the global
    corrSamples frame and prints the 30 most informative features.

    Args:
        features: iterable of features handed to getPRBoW for every report.

    Returns:
        Tuple (accuracy, precision, recall) measured on a random 20% hold-out
        split; precision and recall come from the notebook's own helpers.
    """
    # One bag-of-features dict per report, stored back on the frame
    corrSamples["Path Report BoW"] = [
        getPRBoW(report, features) for report in corrSamples["Path Report"]
    ]

    # (feature dict, cancer type) pairs in row order
    prFeatureSet = list(zip(corrSamples["Path Report BoW"], corrSamples["Cancer Type"]))

    train, test = train_test_split(prFeatureSet, test_size=0.2)
    classifier = nltk.NaiveBayesClassifier.train(train)

    testSet = classifier.classify_many([bow for bow, _ in test])
    referenceSet = [cancer_type for _, cancer_type in test]

    classifier.show_most_informative_features(30)
    return (
        nltk.classify.accuracy(classifier, test),
        precision(testSet, referenceSet),
        recall(testSet, referenceSet),
    )


In [None]:
# Method 1: use every token in `vocab` as a feature.
# NOTE(review): in the visible notebook `vocab` is only assigned in a later cell,
# so this call depends on out-of-order execution.
test_feature_set(vocab)

## Method 2: Take the top k most common tokens

In [None]:
from collections import Counter

# Vocabulary = the k tokens that occur most often in compiledReps
k = 2000
k_most_common_tokens = [token for token, _ in Counter(compiledReps).most_common(k)]

test_feature_set(k_most_common_tokens)

## Method 3: Take the top k most common tokens from each class

In [None]:
k = 500

features = []

corrSamples["Reports Tokenized"] = tokPathReps
for cancer in cancer_types:
    # Token frequencies over the reports labelled with this cancer type
    cancer_tokens_freq = Counter()
    for report_tokens in corrSamples.loc[corrSamples["Cancer Type"] == cancer, "Reports Tokenized"]:
        cancer_tokens_freq.update(report_tokens)
    # Keep the k most frequent tokens of this class
    features += [token for token, _ in cancer_tokens_freq.most_common(k)]
# Merge the per-class lists, dropping tokens shared between classes
features = set(features)
features

In [None]:
# Bag-of-features per report, restricted to the per-class `features` vocabulary
corrSamples["Path Report BoW"] = [
    getPRBoW(report, features) for report in corrSamples["Path Report"]
]

# (feature dict, cancer type) pairs in row order
prFeatureSet = list(zip(corrSamples["Path Report BoW"], corrSamples["Cancer Type"]))

# Random 80/20 split, train on the larger part, predict the smaller one
train, test = train_test_split(prFeatureSet, test_size=0.2)
classifier = nltk.NaiveBayesClassifier.train(train)
testSet = classifier.classify_many([bow for bow, _ in test])
referenceSet = [cancer_type for _, cancer_type in test]

classifier.show_most_informative_features(30)

In [None]:
# Accuracy (computed by nltk) plus the notebook's precision and recall for the
# predictions on the held-out split.
nltk.classify.accuracy(classifier, test), precision(testSet, referenceSet), recall(testSet, referenceSet)

## Method 4: Featurize using 2-grams and 3-grams (take top k most common)

In [None]:
k = 500
from nltk.util import ngrams

# Tokenize every report and tag negated tokens with "not_"
tokPathReps = corrSamples["Path Report"].apply(
    lambda report: negate_sequence(nltk.word_tokenize(report))
)

# Every 2-gram and 3-gram (as tuples of tokens) across all reports
two_grams = [bigram for report_tokens in tokPathReps for bigram in ngrams(report_tokens, 2)]
three_grams = [trigram for report_tokens in tokPathReps for trigram in ngrams(report_tokens, 3)]
two_grams_set = set(two_grams)
three_grams_set = set(three_grams)

# Single tokens plus all distinct 2-grams and 3-grams
vocab_ngrams = vocab | two_grams_set | three_grams_set

In [None]:
# Method 4: evaluate the combined token / 2-gram / 3-gram vocabulary.
test_feature_set(vocab_ngrams)

### Modify getCancerLabel to utilize NaiveBayesClassifier to catch weirdly phrased path reports

In [None]:
# Fallback classifier trained on every pair in prFeatureSet (no hold-out split).
backupClassifier = nltk.NaiveBayesClassifier.train(prFeatureSet)

def getCancerTypeBayes(rep):
    """Return the cancer type of a path report, falling back to Naive Bayes.

    The report is tokenized, negation-tagged with negate_sequence, re-joined
    with single spaces and lower-cased. The patterns in cancer_re are then
    tried in order and the first one that matches wins. When none matches,
    backupClassifier decides.

    Args:
        rep: path report text.

    Returns:
        The entry of cancer_types whose pattern matched, otherwise the label
        predicted by backupClassifier.
    """
    rep = ' '.join(negate_sequence(nltk.word_tokenize(rep))).lower()

    for i, cancer in enumerate(cancer_types):
        if re.search(cancer_re[i], rep):
            return cancer

    # getPRBoW takes (report, vocabulary); the one-argument call used before
    # raised TypeError. `features` is the vocabulary prFeatureSet was built with.
    # NOTE(review): the training dicts were counted on the raw "Path Report"
    # text, while `rep` here is negation-tagged and lower-cased - confirm that
    # this mismatch is intended.
    return backupClassifier.classify(getPRBoW(rep, features))

In [None]:
# Classify every path report and store the result in a new column.
biopData["Cancer Type Bayes"] = biopData["Path Report"].apply(getCancerTypeBayes)

In [None]:
# Display the frame to inspect the new "Cancer Type Bayes" column.
biopData

In [None]:
# One label per patient, in patient-ID order, produced by radLabel
labels = [radLabel(patID) for patID in range(numPatients)]
data["Extracted Label Bayes"] = labels

## Method 5: Naive Bayes

In [None]:
# NOTE(review): this appears to sit in the same cell as the definitions below,
# so it is not the cell's last expression and displays nothing.
biopData


def getCancerLabel(biop):
    """Derive a per-biopsy label from the patient-level label and the biopsy side.

    Args:
        biop: mapping-like row with "Patient Label" and "Laterality" entries.

    Returns:
        "Negative" when the patient label is "Negative".
        Otherwise, for laterality "left" / "right": "Positive" when the
        patient label is "Bilateral Positive" or the positive label of that
        side, else "Negative".
        None for any other laterality.
    """
    # To see possible labels run data.groupby("Label").count()
    patient_label = biop["Patient Label"]
    side = biop["Laterality"]

    if patient_label == "Negative":
        return "Negative"

    # Patient labels that make a biopsy on the given side positive
    positive_labels_by_side = {
        "left": ("Bilateral Positive", "Left Positive"),
        "right": ("Bilateral Positive", "Right Positive"),
    }
    positive_labels = positive_labels_by_side.get(side)
    if positive_labels is None:
        return None
    return "Positive" if patient_label in positive_labels else "Negative"

# Label the first numSamples rows of biopData by position.
# Assumes numSamples equals the number of rows in biopData - TODO confirm.
biopData["Biopsy Label"] = [getCancerLabel(biopData.iloc[i]) for i in range(numSamples)]

In [None]:
# Display the frame to inspect the new "Biopsy Label" column.
biopData

In [None]:
# Spot check: all biopsy rows whose "Patient" value is 7.
biopData[biopData["Patient"] == 7]

In [None]:
# Demo of how nltk.word_tokenize splits punctuation and abbreviations.
nltk.word_tokenize("This will be a complex sentence; compound f.t.w.")

In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

In [None]:
# Negating 
def negate_sequence(tokens):
    """Prefix tokens that fall inside a negated span with "not_".

    A negation word ("not", "n't", "no", matched case-insensitively) toggles
    the negation state for the tokens that follow it, so a second negation
    word switches the state back off. A token that is exactly one of the
    delimiter characters ? . , ! : ; ends the negated span, and the delimiter
    itself is never prefixed.

    Args:
        tokens: iterable of string tokens, e.g. from nltk.word_tokenize.

    Returns:
        New list with the same number of tokens, negated ones prefixed
        with "not_".
    """
    span_delimiters = set("?.,!:;")
    negation_words = {"not", "n't", "no"}

    tagged = []
    in_negation = False
    for tok in tokens:
        # Punctuation closes the span before the token itself is emitted
        if tok in span_delimiters:
            in_negation = False

        tagged.append("not_" + tok if in_negation else tok)

        # The negation word is emitted first, then flips the state
        if tok.lower() in negation_words:
            in_negation = not in_negation

    return tagged

In [None]:
# Tokenize every path report with NLTK, then tag negated tokens with "not_".
biopData["Path Report Tokenized"] = biopData["Path Report"].apply(lambda s: negate_sequence(nltk.word_tokenize(s)))
# NOTE(review): an earlier note suggested trying double_neg_flip=True, but
# negate_sequence as defined in this notebook only accepts `tokens` and always
# toggles on a repeated negation word.

In [None]:
# Constructing feature representation of Path Reports. The feature set is a dictionary where
# the keys are words, and the values are frequencies

# Concatenate the token lists of the first numSamples reports, in row order
compiledReps = []
for i in range(numSamples):
    compiledReps.extend(biopData["Path Report Tokenized"].iloc[i])

# Distinct tokens across all reports
vocab = set(compiledReps)

def getPRBoW(pathRep):
    """Count how often each word of the global `vocab` occurs in `pathRep`.

    NOTE(review): this one-argument definition replaces the earlier
    getPRBoW(pathRep, vocab) once this cell runs; calls that pass a
    vocabulary explicitly will then fail.

    Args:
        pathRep: tokenized report; anything with a ``count`` method.

    Returns:
        dict mapping every word in `vocab` to its count in `pathRep`.
    """
    return {word: pathRep.count(word) for word in vocab}

# Bag-of-words dict for every tokenized report, stored as a new column.
biopData["Path Report BoW"] = biopData["Path Report Tokenized"].apply(getPRBoW)

In [None]:
# Breast biopsies only; dropna() removes every row that has any missing value
breastData = biopData.loc[biopData["Biopsy Source"] == "breast"].dropna()
num_brsamples = len(breastData)

# (bag-of-words dict, biopsy label) pairs in row order
brfeatureset = list(zip(breastData["Path Report BoW"], breastData["Biopsy Label"]))

In [None]:
# Display breastData with a fresh 0..n-1 index (the old index becomes a column).
breastData.reset_index()#.iloc[10]["Path Report"]

In [None]:
# Spot check: text of the breast-biopsy report at position 5.
s = breastData.reset_index().iloc[5]["Path Report"]
s

In [None]:
# Quick check of str.lower(); the result is not used anywhere.
"Hello".lower()

In [None]:
# Spot check: negation tagging on the breast-biopsy report at position 14.
negate_sequence(nltk.word_tokenize(breastData.reset_index().iloc[14]["Path Report"]))

## Regex Matching/Preprocessing

In [None]:
# Class balance: positive / negative counts over all biopsies, then over
# breast biopsies only.
numPos = sum(biopData["Biopsy Label"] == "Positive")
numNeg = sum(biopData["Biopsy Label"] == "Negative")
numPosBr = sum(breastData["Biopsy Label"] == "Positive")
numNegBr = sum(breastData["Biopsy Label"] == "Negative")
(numPos, numNeg, numPosBr, numNegBr)

In [None]:
# Naive baseline: patient IDs of the reports whose text contains the
# substring "carcinoma" (case-sensitive).
biopData[["carcinoma" in rep for rep in biopData["Path Report"]]]["Patient"]#.tolist().count("Positive")

## Naive Bayes Classifier Setup

In [None]:
# train_test_split is already imported in the notebook's first cell; the
# import is repeated here.
from sklearn.model_selection import train_test_split

# Random 80/20 split of the breast-biopsy feature set (no seed is set, so the
# split differs between runs).
train, test = train_test_split(brfeatureset, test_size=0.2)

In [None]:
# Fit a Naive Bayes classifier on the training split.
classifier = nltk.NaiveBayesClassifier.train(train)

In [None]:
# Predicted label for every feature dict in the test split
testSet = classifier.classify_many([bow for bow, _ in test])

In [None]:
# Reference label for every sample in the test split, same order as testSet
referenceSet = [label for _, label in test]

In [None]:
# Number of "Positive" labels among the predictions vs. among the references.
(testSet.count("Positive"), referenceSet.count("Positive"))

In [None]:
# Accuracy of the classifier on the held-out test split.
print(nltk.classify.accuracy(classifier, test))

In [None]:
# Print the 30 features nltk ranks as most informative for this classifier.
classifier.show_most_informative_features(30)

In [None]:
def precision(results, reference):
    """Precision of the "Positive" class: TP / (TP + FP).

    Only pairs whose reference label is "Positive" or "Negative" are counted;
    any other reference label is ignored.

    Args:
        results: list of labels returned by the classifier.
        reference: list of correct labels of the test set, same order.

    Returns:
        Share of "Positive" predictions that are truly "Positive", or 0.0
        when there are no counted "Positive" predictions (instead of raising
        ZeroDivisionError).
    """
    TP = 0
    FP = 0
    for predicted, actual in zip(results, reference):
        if predicted != "Positive":
            continue
        if actual == "Positive":
            TP += 1
        elif actual == "Negative":
            FP += 1

    predictedPositive = TP + FP
    if predictedPositive == 0:
        return 0.0
    return TP / predictedPositive

def recall(results, reference):
    """Recall of the "Positive" class: TP / (TP + FN).

    Only predictions that are "Positive" or "Negative" are counted; a
    reference-positive sample with any other prediction is ignored.

    Args:
        results: list of labels returned by the classifier.
        reference: list of correct labels of the test set, same order.

    Returns:
        Share of truly "Positive" samples that were predicted "Positive", or
        0.0 when there are no counted positive samples (instead of raising
        ZeroDivisionError).
    """
    TP = 0
    FN = 0
    for predicted, actual in zip(results, reference):
        if actual != "Positive":
            continue
        if predicted == "Positive":
            TP += 1
        elif predicted == "Negative":
            FN += 1

    actualPositive = TP + FN
    if actualPositive == 0:
        return 0.0
    return TP / actualPositive

In [None]:
# Precision and recall of the "Positive" class on the held-out split.
(precision(testSet, referenceSet), recall(testSet, referenceSet))

In [None]:
# Number of breast-biopsy rows (recomputed here and displayed).
num_brsamples = breastData.shape[0]
num_brsamples

In [None]:
def cross_validate(reps):
    """Plot precision and recall over `reps` independent random 80/20 splits.

    Each round draws a fresh split of the global brfeatureset, trains a new
    Naive Bayes classifier on the 80% part and scores it on the 20% part with
    the notebook's precision and recall helpers. This is repeated hold-out
    evaluation rather than k-fold cross-validation.

    Args:
        reps: number of split / train / score rounds.

    Returns:
        None; the two curves are shown with matplotlib.
    """
    precision_scores = []
    recall_scores = []
    for _ in range(reps):
        train_split, test_split = train_test_split(brfeatureset, test_size=0.2)
        model = nltk.NaiveBayesClassifier.train(train_split)
        predicted = model.classify_many([bow for bow, _label in test_split])
        expected = [label for _bow, label in test_split]
        precision_scores.append(precision(predicted, expected))
        recall_scores.append(recall(predicted, expected))

    # x axis: 1-based round number
    rounds = range(1, reps + 1)
    plt.plot(rounds, precision_scores)
    plt.plot(rounds, recall_scores)
    plt.legend(['precision', 'recall'])
    plt.show()

In [None]:
# Three independent random splits; plots precision and recall per round.
cross_validate(3)

In [None]:
# Draw without replacement. The keyword is `replace` (`replacement` raises
# TypeError), and without replacement the sample size cannot exceed the three
# available items, so the size is 3 rather than 5.
np.random.choice([1, 2, 3], 3, replace=False)

In [None]:
# Quick check of list.index(); the result is not used anywhere.
[1, 2, 3].index(3)