# Identifying entities in notes

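The goal of this notebook is to determine how successful entity identification is when using the NLTK library (tokenization and part-of-speech tagging) on the free-text notes of iDigBio records.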


In [21]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature

In [2]:
# Load the processed notes from Parquet and report the full record count
sqlContext = SQLContext(sc)
notes = sqlContext.read.parquet("../data/idigbio_notes.parquet")
total_records = notes.count()
print(total_records)

# Work on a small sample of the df. The seed is pinned so that re-running
# the notebook draws the same rows (given the same input partitioning);
# without it every run produced a different sample and different outputs.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42
notes = notes.sample(withReplacement=False,
                     fraction=SAMPLE_FRACTION,
                     seed=SAMPLE_SEED)
# Cache the sample because every later cell re-reads it
notes.cache()
print(notes.count())

3232459
322657


In [4]:
# Peek at the first few raw documents, untruncated so the whole text shows.
# (An earlier variant looped over notes.head(20) and printed each
# r['document'] followed by a blank line.)
document_col = notes["document"]
notes.select(document_col).show(10, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 9:00-14:00. Collected in mixed primary and secondary growth, immature or regenerating forest, on limestone forest floor.                                                                                      |
|WHITE PINE-HEMLOCK-BEECH-MAPLE-OAK   HEAVY LITTER MANY LARGE TREES                                                                                             

## Sentence detection

Does splitting into sentences matter? Is there enough information to do this with a natural language library, or should delimiters like ",", "[]", and "{}" be worked in to address semi-structured data?

## Tokenize documents


In [14]:
def tokenize(s):
    '''
    Take a string and return a list of tokens split out from it
    with the nltk library.

    The document column is nullable and a null value reaches a UDF
    as None, which the nltk tokenizer cannot process. Return an empty
    token list in that case instead of failing the whole job.
    '''
    if s is None:
        return []
    return nltk.tokenize.word_tokenize(s)

# Quick sanity check of the tokenizer on a plain sentence
sample_sentence = "Hello, my name is Mace Windoo"
tokens = tokenize(sample_sentence)
print(tokens)

['Hello', ',', 'my', 'name', 'is', 'Mace', 'Windoo']


In [28]:
# Wrap the tokenizer as a UDF that yields array<string>
string_array_type = types.ArrayType(types.StringType())
udf_tokenize = sql.udf(tokenize, string_array_type)

# Attach each document's token list as a new 'tokens' column
notes_w_tokens = notes.withColumn('tokens', udf_tokenize(notes['document']))
notes_w_tokens.select('tokens').show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------+
|tokens                                                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------------------------------------+
|[9:00-14:00, ., Collected, in, mixed, primary, and, secondary, growth, ,, immature, or, regenerating, forest, ,, on, limestone, forest, floor, .]|
|[WHITE, PINE-HEMLOCK-BEECH-MAPLE-OAK, HEAVY, LITTER, MANY, LARGE, TREES]                                                                         |
|[Annual, herb, ,, 40, cm, ., tall, ,, tiny, purple, &, green, flowers, ,, infrequent, .]                                                         |
+---------------------------------------------------------------------------------------------------------------

In [29]:
# Inspect the schema of the DataFrame that now carries the 'tokens' column
notes_w_tokens.printSchema()

root
 |-- uuid: string (nullable = true)
 |-- occurrenceID: string (nullable = true)
 |-- catalogNumber: string (nullable = true)
 |-- county: string (nullable = true)
 |-- institutionCode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- countryCode: string (nullable = true)
 |-- stateProvince: string (nullable = true)
 |-- family: string (nullable = true)
 |-- recordedBy: string (nullable = true)
 |-- order: string (nullable = true)
 |-- specificEpithet: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- scientificName: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- fieldNotes: string (nullable = true)
 |-- occurrenceRemarks: string (nullable = true)
 |-- eventRemarks: string (nullable = true)
 |-- document: string (nullable = true)
 |-- document_len: integer (nullable = true)
 |-- fieldNotes_len: integer (nullable = true)
 |-- eventRemarks_len: integer (null

In [42]:
# Hand-built token list used to check the Python types handed to nltk.pos_tag.
# Note: the tagging result is stored in `pos` but not displayed in this cell.
t = ["9:00-14:00", ".", "Collected"]
pos = nltk.pos_tag(t)
print(t)
# Show the type of the list itself and of its first two elements
for obj in (t, t[0], t[1]):
    print(type(obj))

['9:00-14:00', '.', 'Collected']
<type 'list'>
<type 'str'>
<type 'str'>


In [None]:
#df = 

In [52]:
def part_of_speech(t):
    '''
    With a list of tokens, mark their part of speech and return
    a list of dicts (no native tuple type in dataframes it seems).

    Each dict has the keys "word" and "tag". A missing (None) or
    empty token list yields an empty list without calling the tagger,
    so null rows do not fail the job when this runs as a UDF.
    '''
    if not t:
        return []
    # nltk.pos_tag returns (word, tag) pairs; repackage each as a dict
    return [{"word": word, "tag": tag} for word, tag in nltk.pos_tag(t)]

# Tag the example sentence and inspect the structure that comes back
pos = part_of_speech(tokens)
first_tagged = pos[0]
print(pos)
print(type(pos))
print(type(first_tagged))
print(type(first_tagged["tag"]))
print(first_tagged)

[{'tag': 'NNP', 'word': 'Hello'}, {'tag': ',', 'word': ','}, {'tag': 'PRP$', 'word': 'my'}, {'tag': 'NN', 'word': 'name'}, {'tag': 'VBZ', 'word': 'is'}, {'tag': 'NNP', 'word': 'Mace'}, {'tag': 'NNP', 'word': 'Windoo'}]
<type 'list'>
<type 'dict'>
<type 'str'>
{'tag': 'NNP', 'word': 'Hello'}


In [53]:
# Each tagged token is a {"word": ..., "tag": ...} map of string -> string,
# so the UDF's return type is an array of such maps.
tagged_token_type = types.MapType(types.StringType(), types.StringType())
udf_part_of_speech = sql.udf(part_of_speech,
                             types.ArrayType(tagged_token_type))

# Add the tagged tokens as a 'pos' column next to 'tokens'
notes_w_tokens2 = notes_w_tokens.withColumn(
    'pos', udf_part_of_speech(notes_w_tokens['tokens']))

notes_w_tokens2.select('pos').show(3, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pos                                                                                                                                                                                                                                                                                                                                                                                 

In [54]:
# Inspect the schema of the DataFrame that now also carries the 'pos' column
notes_w_tokens2.printSchema()

root
 |-- uuid: string (nullable = true)
 |-- occurrenceID: string (nullable = true)
 |-- catalogNumber: string (nullable = true)
 |-- county: string (nullable = true)
 |-- institutionCode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- countryCode: string (nullable = true)
 |-- stateProvince: string (nullable = true)
 |-- family: string (nullable = true)
 |-- recordedBy: string (nullable = true)
 |-- order: string (nullable = true)
 |-- specificEpithet: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- scientificName: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- fieldNotes: string (nullable = true)
 |-- occurrenceRemarks: string (nullable = true)
 |-- eventRemarks: string (nullable = true)
 |-- document: string (nullable = true)
 |-- document_len: integer (nullable = true)
 |-- fieldNotes_len: integer (nullable = true)
 |-- eventRemarks_len: integer (null

In [60]:
# Can we work with maps natively?
# Index into the 'pos' array column (element 0 = first tagged token), then
# look up the "word" key of that map, all within a Column expression.
notes_w_tokens2.select(notes_w_tokens2["pos"][0]["word"]).show(3, truncate=False)
# YES! The output shows the first word of each of the three rows.

+------------+
|pos[0][word]|
+------------+
|9:00-14:00  |
|WHITE       |
|Annual      |
+------------+
only showing top 3 rows



In [77]:
# Split out words by type
# Can't figure out how to access elements of a map in a filter so 
# build something that filters the lists for us.
def find_pos(pos, part):
    '''
    Take a list of dicts that represent words tagged with
    pos information and return a list of words whose tag
    starts with the requested pos prefix (so "NN" also
    matches "NNP").

    A missing (None) or empty list yields an empty list, so
    null rows do not raise when this runs inside a UDF.
    '''
    if not pos:
        return []
    return [p["word"] for p in pos if p["tag"].startswith(part)]

# Nouns found in the tagged example sentence
example_nouns = find_pos(pos, "NN")
print(example_nouns)

['Hello', 'name', 'Mace', 'Windoo']


In [92]:
# Can't figure out how to pass a single string to a UDF, so the "NN"
# prefix is bound inside a lambda and exposed as a nouns-only UDF.
# NOTE(review): passing the tag as sql.lit("NN") to a two-argument UDF
# should also work; untested here.
noun_list_type = types.ArrayType(types.StringType())
find_nouns_udf = sql.udf(lambda tagged: find_pos(tagged, "NN"), noun_list_type)


In [109]:
# Pull the nouns out of every row's tagged tokens, then flatten the
# per-row lists into one row per noun in a single "word" column.
noun_lists = find_nouns_udf(notes_w_tokens2["pos"])
nouns = notes_w_tokens2.select(sql.explode(noun_lists).alias("word"))

nouns.show(3)

+---------+
|     word|
+---------+
|Collected|
|   growth|
| immature|
+---------+
only showing top 3 rows



In [112]:
# Drop rows containing nulls, then display the result with show()'s
# default row limit (the recorded output shows the top 20 rows).
nouns\
    .na.drop().show()
    

#    .groupBy("word")\
#    .count()\
#    .show(3)
     #.orderBy("count", ascending=False)\

+--------------------+
|                word|
+--------------------+
|           Collected|
|              growth|
|            immature|
|              forest|
|           limestone|
|              forest|
|               floor|
|               WHITE|
|PINE-HEMLOCK-BEEC...|
|               HEAVY|
|              LITTER|
|                MANY|
|               LARGE|
|               TREES|
|                herb|
|                  cm|
|                tall|
|              purple|
|             flowers|
|          infrequent|
+--------------------+
only showing top 20 rows



In [None]:
# And some verbs