# Identifying entities in notes

Goal of this notebook is to determine how successful entity identification is using


In [32]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature

In [33]:
# Load Processed Parquet
sqlContext = SQLContext(sc)
notes = sqlContext.read.parquet("../data/idigbio_notes.parquet")
total_records = notes.count()
print(total_records)
# Small sample of the df
notes = notes.sample(withReplacement=False, fraction=0.1)

print(notes.count())

3232459
324454


In [34]:
# Still have some problems with the document field having nulls and
# being empty. Not sure where nulls came from but likely the 
# empties are really whitespace
notes = notes.select(sql.trim(notes["document"]).alias("document"))\
    .dropna(subset="document")\
    .filter(sql.length("document") > 0)
notes.cache()  

print(notes.count())
notes.select(notes["document"])\
    .orderBy(notes["document"])\
    .show(10, truncate=False)
notes.select(notes["document"])\
    .orderBy(notes["document"], ascending=False)\
    .show(10, truncate=False)

324295
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|!                                                                                                                                                                  |
|!                                                                                                                                                                  |
|!                                                                                                                                                                 

## Sentence detection

Does splitting in to sentences matter? Is there enough information to do this with a natural language library or should things like "," "[]", and "{}" be worked in to address semi-structured data?

## Tokenize documents


In [41]:
from lib.tokens import Tokens
tokens = Tokens()
t = tokens.tokenize("Hello, my name is Mace Windoo")
print(t)

['Hello', ',', 'my', 'name', 'is', 'Mace', 'Windoo']


In [37]:
udf_tokenize = sql.udf(tokens.tokenize, types.ArrayType(types.StringType()))

notes_w_tokens = notes.withColumn('tokens', udf_tokenize(notes['document']))
for r in notes_w_tokens.select(notes_w_tokens["tokens"]).head(50):
    print(" | ".join(r["tokens"]))
    print("\n")

[ | USA | : | Ohio | , | Hocking | Co. | | | Little | Rocky | Hollow| | Gerdeman | BS | | | 29 | V | 1998 | | | ex | narceus | | | ( | Diplopoda | : | Spirobolidae | ) | | | BSG98-0529-4 | AL5523 | ] | [ | Iphiopsididae | | | Narceolaelaps | annularis | | | Kethley | 1978 | | | DN | | | det | . | : | Gerdeman | 1998 | | | Lactophenol | | | Hoyer | 's | | | OSAL0004567 | ]


LAB | BORN | , | AGE | 45 | WEEKS


Yellow | `` | G | '' | on | cap | .


PARTIAL | ALBINO. | ``


Freely | escaped | or | naturalized | .


berlese | riparian | ravine


Additional | data | on | card


level | , | peanut | butter | bait


Collected | Sept. | 1971 | .


BBP | 5.12-6


Common | . | Submerged | in | Sheyenne | River | .


NOTEBY | LLAMA | , | NOTEDATE | 14-Jun-08 | : | LLAMA | taxa | ( | Formicidae | , | Curculionidae | , | Diptera | , | Hymenoptera | , | Hemiptera | , | Myriapoda/Arachnida | , | Staphylinidae | , | other | Coleoptera | ) | extracted | ; | July | 2008 | , | Ecosur | .


On | Acer | .


In [38]:
notes_w_tokens.printSchema()

root
 |-- document: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [42]:
from lib.pos_tags import PosTags
pos_tags = PosTags()

pos = pos_tags.tag(t)
print(pos)
print(type(pos))
print(type(pos[0]))
print(type(pos[0]["tag"]))
print(pos[0])

[{'tag': 'NNP', 'word': 'Hello'}, {'tag': ',', 'word': ','}, {'tag': 'PRP$', 'word': 'my'}, {'tag': 'NN', 'word': 'name'}, {'tag': 'VBZ', 'word': 'is'}, {'tag': 'NNP', 'word': 'Mace'}, {'tag': 'NNP', 'word': 'Windoo'}]
<type 'list'>
<type 'dict'>
<type 'str'>
{'tag': 'NNP', 'word': 'Hello'}


In [49]:
udf_part_of_speech = sql.udf(pos_tags.tag, types.ArrayType(
                                    types.MapType(
                                        types.StringType(),
                                        types.StringType()
                                    )
                                )
                            )

notes_w_tokens2 = notes_w_tokens.withColumn('pos', 
                                            udf_part_of_speech(notes_w_tokens['tokens']))

for r in notes_w_tokens2.select(notes_w_tokens2["pos"]).head(50):
    s = ""
    for p in r["pos"]:
        s = s + p["word"] + " (" + p["tag"] + ") | "
    print(s + "\n")

[ (NN) | USA (NNP) | : (:) | Ohio (NNP) | , (,) | Hocking (NNP) | Co. (NNP) | | (NNP) | Little (NNP) | Rocky (NNP) | Hollow| (NNP) | Gerdeman (NNP) | BS (NNP) | | (NNP) | 29 (CD) | V (NNP) | 1998 (CD) | | (CD) | ex (NN) | narceus (VBZ) | | (:) | ( (:) | Diplopoda (NNP) | : (:) | Spirobolidae (NNP) | ) (:) | | (:) | BSG98-0529-4 (-NONE-) | AL5523 (NNP) | ] (NNP) | [ (NNP) | Iphiopsididae (NNP) | | (NNP) | Narceolaelaps (NNP) | annularis (VBZ) | | (:) | Kethley (NNP) | 1978 (CD) | | (CD) | DN (NNP) | | (NNP) | det (NN) | . (.) | : (:) | Gerdeman (NNP) | 1998 (CD) | | (CD) | Lactophenol (NNP) | | (NNP) | Hoyer (NNP) | 's (POS) | | (NNP) | OSAL0004567 (NNP) | ] (NNP) | 

LAB (NNP) | BORN (NNP) | , (,) | AGE (NNP) | 45 (CD) | WEEKS (NNS) | 

Yellow (NNP) | `` (``) | G (NNP) | '' ('') | on (IN) | cap (NN) | . (.) | 

PARTIAL (JJ) | ALBINO. (NN) | `` (``) | 

Freely (RB) | escaped (VBD) | or (CC) | naturalized (VBN) | . (.) | 

berlese (NN) | riparian (VBD) | ravine (NN) | 

Additional (JJ) |

In [158]:
notes_w_tokens2.printSchema()

root
 |-- document: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [159]:
# Can we work with maps natively?
notes_w_tokens2.select(notes_w_tokens2["pos"][0]["word"]).show(3, truncate=False)
# YES!

+------------+
|pos[0][word]|
+------------+
|cloud       |
|Collected   |
|This        |
+------------+
only showing top 3 rows



In [160]:
# Split out words by type
# Can't figure out how to access elements of a map in a filter so 
# build something that filters the lists for us.
def find_pos(pos, part):
    '''
    Take a list of dicts that represent words tagged with
    pos information and return a list of words that match
    the requested pos
    '''
    retval = []
    for p in pos:
        if p["tag"].startswith(part):
            retval.append(p["word"])
    return retval

print(find_pos(pos, "NN"))

['Hello', 'name', 'Mace', 'Windoo']


In [161]:
# Can't figure out how to pass a single string to a UDF
find_nouns_udf = sql.udf(lambda x: find_pos(x, "NN"), types.ArrayType(types.StringType()))


In [162]:
nouns = notes_w_tokens2\
    .select(sql.explode(find_nouns_udf(notes_w_tokens2["pos"])).alias("word"))
nouns.cache()
nouns.show(3)

+-----+
| word|
+-----+
|cloud|
|sappy|
| bark|
+-----+
only showing top 3 rows



In [168]:
noun_counts = nouns\
    .groupBy("word")\
    .count()\
    .orderBy("count", ascending=False)\
    
noun_counts.show(30)

+----------+-----+
|      word|count|
+----------+-----+
|         [|43176|
|         ]|27293|
|      data|16949|
|     notes|14227|
|      card|11475|
|     field|10874|
|      trap|10457|
|       See|10227|
|      soil|10045|
|         ||10035|
|collection| 9629|
|         S| 9596|
|    forest| 9478|
|         (| 9363|
|      Alch| 8557|
|    litter| 8281|
|       Co.| 7984|
|    NOTEBY| 7062|
|         m| 7059|
|    Number| 6985|
|    flight| 6847|
| Herbarium| 6837|
|Collection| 6719|
|         C| 6298|
|         )| 6151|
|         W| 5951|
|  specimen| 5936|
|  NOTEDATE| 5689|
|   prairie| 5583|
|      tall| 5373|
+----------+-----+
only showing top 30 rows



In [169]:
noun_counts.cache()

DataFrame[word: string, count: bigint]

In [171]:
noun_counts.orderBy(noun_counts["count"]).show(30)

+--------------------+-----+
|                word|count|
+--------------------+-----+
|                R32W|    1|
|                Lehm|    1|
|                 CFS|    1|
|Gaultheria-Vaccinium|    1|
|          Slimbridge|    1|
|             CUTLEAF|    1|
|              ywllow|    1|
|             FURNACE|    1|
|             litosol|    1|
|            prosrate|    1|
|            CAS/HBOI|    1|
|                 tim|    1|
|          OSAL001066|    1|
|              km80.2|    1|
|            elkoense|    1|
|det_comments:29.8...|    1|
|             videtai|    1|
|              Pledad|    1|
|               IV-19|    1|
|                 BRH|    1|
|           ATTENDING|    1|
|            bivelves|    1|
|              Khybeu|    1|
|            Oonopsis|    1|
|                Exs.|    1|
|          VI-27-1933|    1|
|               Refer|    1|
|          truncicola|    1|
|           suberecta|    1|
|           rosy-pink|    1|
+--------------------+-----+
only showing t

In [187]:
noun_counts_pdf = noun_counts.limit(1000).toPandas()
print(noun_counts_pdf.head())

tuples = []
for l in noun_counts_pdf.iterrows():
    tuples.append( (l[1], l[0]) )

    word  count
0      [  43176
1      ]  27293
2   data  16949
3  notes  14227
4   card  11475


In [186]:
%matplotlib inline
from wordcloud import WordCloud

wordcloud = WordCloud().generate_from_frequencies(tuples)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# And some verbs