In [41]:
import pandas as pd
import re
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize

In [42]:
data = pd.read_excel('train_data.xlsx', 'data')
data.head()

Unnamed: 0,De-Ided #,Text,Number of Lesions,expected_group,Key_Sentence,Location
0,2914,Type: MR Brain w AND w/oCont Date/Time: 10...,1,1,Extensive enhancing mass,impression
1,2968,Type: MR Brain w AND w/oCont Date/Time: 09...,1,1,enhancing metastatic lesion,impression
2,2990,Type: MR Brain w AND w/oCont Date/Time: 09...,1,1,enhancing cystic mass,impression
3,2992,Type: MR Brain w AND w/oCont Date/Time: 07...,1,1,enhancing lesion,impression
4,3147,Type: MR Brain w AND w/oCont Date/Time: 08...,1,1,Right parietal lesion,impression


In [43]:
def clean_report(report):
    """Extract and clean the whole report section from an MRI report.

        Args:
            report: MRI report string

        Returns:
            cleaned_report: all lower caps string 
            without any excess whitespace.
    """
    cleaned_report_1 = report.strip().lower()
    cleaned_report  = " ".join(cleaned_report_1.split())
    return cleaned_report

In [44]:
def clean_findings(report):
    """Extract and clean the FINDINGS section from an MRI report.

        Args:
            report: MRI report string

        Returns:
            findings: all lower caps string including the findings section
            without any excess whitespace.
    """
    pattern = re.compile(r'''FINDINGS:(.*)IMPRESSION''')
    findings_mo = re.search(pattern, report)
    cleaned_findings = findings_mo.group(1).strip().lower()
    findings = " ".join(cleaned_findings.split())
    return findings

In [45]:
def clean_impression(report):
    """Extract and clean the IMPRESSION section from an MRI report.

        Args:
            report: MRI report string

        Returns:
            findings: all lower caps string including the IMPRESSION section
            without any excess whitespace.
    """
    pattern = re.compile(r'''IMPRESSION:(.*)''')
    findings_mo = re.search(pattern, report)
    cleaned_findings = findings_mo.group(1).strip().lower()
    findings = " ".join(cleaned_findings.split())
    return findings

In [46]:
def group_by_count(report):
    """From the impression of an MRI report, count the occurance of key words

        Args:
            report: MRI report string

        Returns:
            group: equal to number of lesions if group is 1,2, or 3. Anything >3 is group 4
    """
    
    findings = clean_impression(report)
    #Tokenize into sentences
    tok_findings = word_tokenize(findings)
    
    #Count number of times a word in key_words occurs
    key_words = ['lesion', 'mass', 'metastasis', 'focus']
    counts = Counter()
    for word in tok_findings:
        if word in key_words:
            counts[word] +=1
            
    #Catagorize based on the number of occurances of the most common word
    try:
        lesions = counts.most_common(1)[0][1]
        if lesions >= 4:
            group = 4
        else:
            group = lesions
    except:
        group = 1
        
    return group

In [47]:
def group_by_keyword(report):
    """Read the entire MRI report and look for key words to indicate multiplicity, if found stop and generate
    group number. If not, proceed to group_by_count function

        Args:
            report: MRI report string

        Returns:
            group: 2,3,4 based on the presence of key words, else proceed to group_by_count
    """
    
    clean_test = clean_report(report)
    three_keywords = ['three', 'third lesion', '3rd lesion', '3 lesions']
    two_keywords = ['two', 'second lesion', '2nd lesion', '2 lesions']
    multiple_keywords = ['multiple', 'numerous', 'multifocal']

    if any(word in clean_test for word in multiple_keywords):
        group = 4
    elif any(word in clean_test for word in three_keywords):
        group = 3
    elif any(word in clean_test for word in two_keywords):
        group = 2 
    else:
        group = group_by_count(report)
        
    return group

In [48]:
data['calc_group'] = data['Text'].map(group_by_keyword)

In [49]:
from sklearn import metrics

In [50]:
print(metrics.classification_report(data['calc_group'], data['expected_group']))

             precision    recall  f1-score   support

          1       0.62      0.45      0.53        11
          2       0.45      0.45      0.45        11
          3       0.73      0.80      0.76        10
          4       0.81      0.93      0.87        14

avg / total       0.66      0.67      0.66        46



In [232]:
#data.to_csv('calc_train.csv')
#!open calc_train.csv

# NLTK Supervised Classification


In [380]:
keywords = ['three', 'third lesion', '3rd lesion', '3 lesions',
            'two', 'second lesion', '2nd lesion', '2 lesions',
           'multiple', 'numerous', 'multifocal',
           'mass', 'focus']

In [381]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in keywords:
        features['contains(%s)' % word] = (word in document_words)
        
    return features

In [382]:
zipped = zip(list(data['Text'].map(word_tokenize)), list(data['Group ']))
train_set = [(document_features(w),g) for (w,g) in zipped]
train_set[2]

({'contains(2 lesions)': False,
  'contains(2nd lesion)': False,
  'contains(3 lesions)': False,
  'contains(3rd lesion)': False,
  'contains(focus)': False,
  'contains(mass)': True,
  'contains(multifocal)': False,
  'contains(multiple)': False,
  'contains(numerous)': False,
  'contains(second lesion)': False,
  'contains(third lesion)': False,
  'contains(three)': False,
  'contains(two)': False},
 1)

In [383]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [384]:
classifier.most_informative_features(5)

[('contains(two)', True),
 ('contains(multiple)', True),
 ('contains(focus)', True),
 ('contains(mass)', False),
 ('contains(two)', False)]

In [385]:
nltk.classify.accuracy(classifier, train_set)

0.58695652173913049

# NLTK Reference

In [261]:
def gender_features(word):
    return{'last_letter': word[-1],
           'length':len(word),
           'first_letter':word[1]}

gender_features('Shrek')

{'first_letter': 'h', 'last_letter': 'k', 'length': 5}

In [235]:
from nltk.corpus import names
import random

In [249]:
names = ([(name, 'male') for name in names.words('male.txt')] + 
[(name, 'female') for name in names.words('female.txt')])

AttributeError: 'list' object has no attribute 'words'

In [248]:
random.shuffle(names)

In [262]:
featuressets = [(gender_features(n), g) for (n,g) in names]
train_set, test_test = featuressets[500:], featuressets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [263]:
print nltk.classify.accuracy(classifier, test_test)

0.758


In [264]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     36.0 : 1.0
             last_letter = u'k'             male : female =     32.0 : 1.0
             last_letter = u'f'             male : female =     16.5 : 1.0
             last_letter = u'p'             male : female =     12.5 : 1.0
             last_letter = u'v'             male : female =     10.4 : 1.0


In [265]:
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

In [266]:
train_set = [(gender_features(n),g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [267]:
print nltk.classify.accuracy(classifier, devtest_set)

0.769


In [268]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag,guess,name) )

In [301]:
#for (tag, guess,name) in sorted(errors):
    #print 'correct = %-8s guess = %-8s name = %-30s' % (tag, guess, name)

## Document Classification

In [270]:
from nltk.corpus import movie_reviews

In [285]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [296]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
word_features[:5]

[u'sucess', u'sonja', u'askew', u'woods', u'spiders']

In [287]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
        
    return features

In [297]:
def document_features2(document):
    document_words = set(document)
    features = {}
    key_words = ['sans', 'wires']
    for word in key_words:
        features['contains(%s)' % word] = (word in document_words)
        
    return features

In [347]:
featuresets = [(document_features2(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [350]:
type(featuresets)

list

In [349]:
featuresets[0]

({'contains(sans)': False, 'contains(wires)': False}, u'neg')

In [299]:
print nltk.classify.accuracy(classifier, test_set)

0.5


In [300]:
classifier.show_most_informative_features(5)

Most Informative Features
          contains(sans) = True              neg : pos    =      9.0 : 1.0
         contains(wires) = True              neg : pos    =      6.3 : 1.0
          contains(sans) = False             pos : neg    =      1.0 : 1.0
         contains(wires) = False             pos : neg    =      1.0 : 1.0


In [366]:
test_set[0]

({'contains(sans)': False, 'contains(wires)': False}, u'neg')