# Identifying entities in notes

Goal of this notebook is to determine how successful entity identification is using


In [1]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature



In [2]:
# Load Processed Parquet
sqlContext = SQLContext(sc)
notes = sqlContext.read.parquet("../data/idigbio_notes.parquet")
total_records = notes.count()
print(total_records)
# Small sample of the df
notes = notes.sample(withReplacement=False, fraction=0.1)
notes.cache()
print(notes.count())

3232459
323405


In [3]:
for r in notes.head(20):
    print(r['document'] + "\n")

 9:00-14:00. Collected in mixed primary and secondary growth, immature or regenerating forest, on limestone forest floor. 

Study skin plus tissues. Not able to sex. Not aged. Specimen collected on joint California Academy of Sciences/Kunming Institute of Zoology Gaoligongshan (GLGS) Biodiversity Expedition. GLGS Locality No. 11, Net No. 2.  

 Roadside bank along pasture, brushy. 

 cloud forest under sappy fermenting bark 

This specimen is stored in the indet. Mycena box  

WHITE PINE-HEMLOCK-BEECH-MAPLE-OAK   HEAVY LITTER MANY LARGE TREES  

Small patch of plants. Shallow soil at base of granite canyon wall beside Buffalo Bill Reservoir Dam. 

Annual herb, 40 cm. tall, tiny purple & green flowers, infrequent.  

Substrate: soil and moss. AdditionalNotesPresent=Y.  

 ridge  for. litter berlese forest litter 

Notes with collection J. Elisha Mitchell Sci. Soc. 1946 62: 199.  

NO NOTES  

Lost in the earthquake and fire of 1906.  

  [Zimipan Hdq. K.222 10-31-45] [DeLong Hershberger

## Sentence detection

Does splitting in to sentences matter? Is there enough information to do this with a natural language library or should things like "," "[]", and "{}" be worked in to address semi-structured data?

## Tokenize documents


In [17]:
def tokenize(s):
    '''
    Take a string and return a list of tokens split out from it
    with the nltk library
    '''
    return nltk.tokenize.word_tokenize(s)

udf_tokenize = sql.udf(tokenize)

notes_w_tokens = notes.withColumn('tokens', udf_tokenize(notes['document']))

In [18]:
print(notes_w_tokens.head())

Row(uuid=u'18e8da12-02c7-4e08-b16d-c40a5d745f69', occurrenceID=u'b0a2c52a-b069-11e3-8cfe-90b11c41863e', catalogNumber=u'323300', county=u'Municipality of San Luis', institutionCode=u'KU', country=u'Philippines', countryCode=u'', stateProvince=u'Aurora Province', family=u'Scincidae', recordedBy=u'', order=u'Sauria', specificEpithet=u'abdictus', genus=u'Sphenomorphus', sex=u'', scientificName=u'Sphenomorphus abdictus aquilonius', year=u'', month=u'', fieldNotes=u'', occurrenceRemarks=u'', eventRemarks=u'9:00-14:00. Collected in mixed primary and secondary growth, immature or regenerating forest, on limestone forest floor.', document=u' 9:00-14:00. Collected in mixed primary and secondary growth, immature or regenerating forest, on limestone forest floor. ', document_len=122, fieldNotes_len=0, eventRemarks_len=120, occurrenceRemarks_len=0, tokens=u'[9:00-14:00, ., Collected, in, mixed, primary, and, secondary, growth, ,, immature, or, regenerating, forest, ,, on, limestone, forest, floor,

In [13]:
t = ["9:00-14:00", ".", "Collected"]
print(nltk.pos_tag(t))

[('9:00-14:00', 'CD'), ('.', '.'), ('Collected', 'NNP')]


In [None]:
#df = 

In [19]:
def part_of_speech(t):
    '''
    With a list of tokens, mark their part of speech and return
    a list of tuples.
    '''
    return nltk.pos_tag(t)

udf_part_of_speech = sql.udf(part_of_speech)
notes_w_tokens2 = notes_w_tokens.withColumn('pos', udf_part_of_speech(notes_w_tokens['tokens']))

In [20]:
print(notes_w_tokens2.head()['pos'])

[[Ljava.lang.Object;@5ce7a7c0, [Ljava.lang.Object;@7701b7da, [Ljava.lang.Object;@69e30e71, [Ljava.lang.Object;@40b1b01f, [Ljava.lang.Object;@2189d66e, [Ljava.lang.Object;@5bcc0ae2, [Ljava.lang.Object;@2a2d076c, [Ljava.lang.Object;@67516b7e, [Ljava.lang.Object;@804eb5c, [Ljava.lang.Object;@33247ba4, [Ljava.lang.Object;@71f00ccb, [Ljava.lang.Object;@1d968310, [Ljava.lang.Object;@3267c5f5, [Ljava.lang.Object;@228e6146, [Ljava.lang.Object;@5369a788, [Ljava.lang.Object;@7ace20f6, [Ljava.lang.Object;@768a0a37, [Ljava.lang.Object;@275fd1f1, [Ljava.lang.Object;@39146775, [Ljava.lang.Object;@32d63931, [Ljava.lang.Object;@142b56ee, [Ljava.lang.Object;@4d22ddc3, [Ljava.lang.Object;@579724dd, [Ljava.lang.Object;@540b80af, [Ljava.lang.Object;@1ad920e6, [Ljava.lang.Object;@41ce3abb, [Ljava.lang.Object;@2f914f93, [Ljava.lang.Object;@2717ac36, [Ljava.lang.Object;@749a6622, [Ljava.lang.Object;@44b191e8, [Ljava.lang.Object;@6179a33a, [Ljava.lang.Object;@7b95c53f, [Ljava.lang.Object;@5c990719, [Ljava.lan