# NLTK example
Example of how to do natural language processing with Python's NLTK package. 

NLTK is a tool kit for cleaning and prepping the data for analysis. http://www.nltk.org/
Info on NLTK: http://www.nltk.org/book/


In [4]:
import nltk

### The cell below opens a pop-up window; select "all" to download all packages.

This downloaded the data to /Users/korolo/nltk_data

This does not need to be run again.

In [7]:
# nltk.download()
# this has been downloaded

## Tokenizing

In [8]:
#from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
example_text = "This is a sentence number one. This is another sentence. Text could be boring sometimes. " + \
    "Don't read this! C. elegans is a small worm. Someone loves Mr. Johnson song by J.A.I.N."

In [10]:
nltk.sent_tokenize(example_text)

['This is a sentence number one.',
 'This is another sentence.',
 'Text could be boring sometimes.',
 "Don't read this!",
 'C. elegans is a small worm.',
 'Someone loves Mr. Johnson song by J.A.I.N.']

In [11]:
nltk.word_tokenize(example_text)

['This',
 'is',
 'a',
 'sentence',
 'number',
 'one',
 '.',
 'This',
 'is',
 'another',
 'sentence',
 '.',
 'Text',
 'could',
 'be',
 'boring',
 'sometimes',
 '.',
 'Do',
 "n't",
 'read',
 'this',
 '!',
 'C.',
 'elegans',
 'is',
 'a',
 'small',
 'worm',
 '.',
 'Someone',
 'loves',
 'Mr.',
 'Johnson',
 'song',
 'by',
 'J.A.I.N',
 '.']

## Stop words removal

In [12]:
from nltk.corpus import stopwords

In [13]:
example_text = "This is a sentence number one. This is another sentence. Text could be boring sometimes. " + \
    "Don't read this! C. elegans is a small worm. Someone loves Mr. Johnson song by J.A.I.N."

In [14]:
stop_words = set(stopwords.words("english"))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [15]:
# Can add other custom stop words here

In [16]:
words = nltk.word_tokenize(example_text)
words

['This',
 'is',
 'a',
 'sentence',
 'number',
 'one',
 '.',
 'This',
 'is',
 'another',
 'sentence',
 '.',
 'Text',
 'could',
 'be',
 'boring',
 'sometimes',
 '.',
 'Do',
 "n't",
 'read',
 'this',
 '!',
 'C.',
 'elegans',
 'is',
 'a',
 'small',
 'worm',
 '.',
 'Someone',
 'loves',
 'Mr.',
 'Johnson',
 'song',
 'by',
 'J.A.I.N',
 '.']

In [17]:
filtered_text = []
for w in words:
    if w not in stop_words:
        filtered_text.append(w)
#filtered_text

In [18]:
## OR

In [19]:
# Keep only the tokens that are not in the stop-word set.
# NOTE: the membership test is case-sensitive and the stop words listed above
# are lowercase, so capitalised tokens such as 'This' are kept (see the output);
# comparing w.lower() instead would drop them as well.
filtered_text = [w for w in words if w not in stop_words]
filtered_text

['This',
 'sentence',
 'number',
 'one',
 '.',
 'This',
 'another',
 'sentence',
 '.',
 'Text',
 'could',
 'boring',
 'sometimes',
 '.',
 'Do',
 "n't",
 'read',
 '!',
 'C.',
 'elegans',
 'small',
 'worm',
 '.',
 'Someone',
 'loves',
 'Mr.',
 'Johnson',
 'song',
 'J.A.I.N',
 '.']

## Stemming

Stemming is used to get to the root of the word, as a step towards its meaning.
Stemming as below might not be necessary as a separate step in a text mining pipeline, since usually you would feed words through nltk and use WordNet, i.e. use lemmatizing.


In [44]:
from nltk.stem import PorterStemmer

In [45]:
ps = PorterStemmer()

In [46]:
example_words = ["sample", "sampling", "sampled"]
for w in example_words:
    print(ps.stem(w))

sampl
sampl
sampl


In [47]:
# NOTE(review): inside a non-raw string '\\' is an escaped literal backslash, not
# a line continuation, so the text keeps a backslash and a newline after "find"
# (hence the '\\out' token in the output below). A single trailing '\' would
# join the two lines instead.
example_text ='''We have sampled a rather small C. elegans worm and were determined to find \\
out what unique DNA sample resulted from this sampling.'''

In [48]:
words = nltk.word_tokenize(example_text)
words

['We',
 'have',
 'sampled',
 'a',
 'rather',
 'small',
 'C.',
 'elegans',
 'worm',
 'and',
 'were',
 'determined',
 'to',
 'find',
 '\\out',
 'what',
 'unique',
 'DNA',
 'sample',
 'resulted',
 'from',
 'this',
 'sampling',
 '.']

In [49]:
for w in words:
    print(ps.stem(w))

We
have
sampl
a
rather
small
C.
elegan
worm
and
were
determin
to
find
\out
what
uniqu
dna
sampl
result
from
thi
sampl
.


## Part of speech tagging

POS tagging, or simply tagging, is the process of classifying the words of a text into word classes, also called lexical categories or parts of speech.

For information on tagging, see http://www.nltk.org/book/ch05.html
Also, see a separate notebook, pos_tagging.ipynb for more detailed examples.

In [8]:
from nltk.corpus import gutenberg
from nltk.tokenize import PunktSentenceTokenizer # unsupervied trained tokenizer. Can train it myself.

In [10]:
tokenizer_training_set = gutenberg.raw("austen-persuasion.txt")
sense_and_sensibility_text = gutenberg.raw("austen-sense.txt")

custom_austen_tokenizer = nltk.PunktSentenceTokenizer(tokenizer_training_set)
custom_austen_tokenizer

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x1a18532470>

In [14]:
# This gives an error => gutenberg is not a tagged corpus
# gutenberg.tagged_words()

In [15]:
tokenised_sense_and_sensibility = custom_austen_tokenizer.tokenize(sense_and_sensibility_text)
tokenised_sense_and_sensibility

['[Sense and Sensibility by Jane Austen 1811]\n\nCHAPTER 1\n\n\nThe family of Dashwood had long been settled in Sussex.',
 'Their estate was large, and their residence was at Norland Park,\nin the centre of their property, where, for many generations,\nthey had lived in so respectable a manner as to engage\nthe general good opinion of their surrounding acquaintance.',
 'The late owner of this estate was a single man, who lived\nto a very advanced age, and who for many years of his life,\nhad a constant companion and housekeeper in his sister.',
 'But her death, which happened ten years before his own,\nproduced a great alteration in his home; for to supply\nher loss, he invited and received into his house the family\nof his nephew Mr.',
 'Henry Dashwood, the legal inheritor\nof the Norland estate, and the person to whom he intended\nto bequeath it.',
 "In the society of his nephew and niece,\nand their children, the old Gentleman's days were\ncomfortably spent.",
 'His attachment to th

In [16]:
len(tokenised_sense_and_sensibility)

5525

In [17]:
tokenised_sense_and_sensibility[2]

'The late owner of this estate was a single man, who lived\nto a very advanced age, and who for many years of his life,\nhad a constant companion and housekeeper in his sister.'

In [18]:
def process_text(text):
    """POS-tag every sentence of a sentence-tokenized text.

    text: iterable of sentence strings.

    Returns a list with one entry per processed sentence; each entry is the
    result of nltk.pos_tag applied to that sentence's word tokens. If an
    exception occurs it is printed and the sentences tagged up to that point
    are returned.
    """
    results = []
    try:
        for sentence in text:
            tokens = nltk.word_tokenize(sentence)
            results.append(nltk.pos_tag(tokens))
    except Exception as err:
        # Best-effort: report the problem and keep what was tagged so far.
        print(err)
    return results
        
tagged_text = process_text(tokenised_sense_and_sensibility)


The default pos_tag() model is maxent_treebank_pos_tagger, found in ~/nltk_data/taggers/. Code is in nltk-master/nltk/tag/__init__.py

The tagged training set would be a .txt file with the text broken up one word per line and each line looking like this:
    
word POS-tag BIO-tag

In [21]:
tagged_text

[[('[', 'JJ'),
  ('Sense', 'NNP'),
  ('and', 'CC'),
  ('Sensibility', 'NNP'),
  ('by', 'IN'),
  ('Jane', 'NNP'),
  ('Austen', 'NNP'),
  ('1811', 'CD'),
  (']', 'NNP'),
  ('CHAPTER', 'NNP'),
  ('1', 'CD'),
  ('The', 'DT'),
  ('family', 'NN'),
  ('of', 'IN'),
  ('Dashwood', 'NNP'),
  ('had', 'VBD'),
  ('long', 'RB'),
  ('been', 'VBN'),
  ('settled', 'VBN'),
  ('in', 'IN'),
  ('Sussex', 'NNP'),
  ('.', '.')],
 [('Their', 'PRP$'),
  ('estate', 'NN'),
  ('was', 'VBD'),
  ('large', 'JJ'),
  (',', ','),
  ('and', 'CC'),
  ('their', 'PRP$'),
  ('residence', 'NN'),
  ('was', 'VBD'),
  ('at', 'IN'),
  ('Norland', 'NNP'),
  ('Park', 'NNP'),
  (',', ','),
  ('in', 'IN'),
  ('the', 'DT'),
  ('centre', 'NN'),
  ('of', 'IN'),
  ('their', 'PRP$'),
  ('property', 'NN'),
  (',', ','),
  ('where', 'WRB'),
  (',', ','),
  ('for', 'IN'),
  ('many', 'JJ'),
  ('generations', 'NNS'),
  (',', ','),
  ('they', 'PRP'),
  ('had', 'VBD'),
  ('lived', 'VBN'),
  ('in', 'IN'),
  ('so', 'RB'),
  ('respectable', 'JJ'),

#### To see the POS (part-of-speech) tagset and what the tags mean, do this

In [73]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

##### Here is a short reference of the above command:
`
CC: conjunction, coordinating
CD: numeral, cardinal
DT: determiner
EX: existential there
FW: foreign word
IN: preposition or conjunction, subordinating
JJ: adjective or numeral, ordinal
JJR: adjective, comparative
JJS: adjective, superlative
LS: list item marker
MD: modal auxiliary
NN: noun, common, singular or mass
NNP: noun, proper, singular
NNPS: noun, proper, plural
NNS: noun, common, plural
PDT: pre-determiner
POS: genitive marker
PRP: pronoun, personal
PRP$: pronoun, possessive
RB: adverb
RBR: adverb, comparative
RBS: adverb, superlative
RP: particle
SYM: symbol
TO: "to" as preposition or infinitive marker
UH: interjection
VB: verb, base form
VBD: verb, past tense
VBG: verb, present participle or gerund
VBN: verb, past participle
VBP: verb, present tense, not 3rd person singular
VBZ: verb, present tense, 3rd person singular
WDT: WH-determiner
WP: WH-pronoun    
WP$: WH-pronoun, possessive
WRB: Wh-adverb
`


Tagged corpora use many different conventions for tagging words. The universal part-of-speech tagset is the simplified one. The one above is not universal; it is unsimplified. For more info see: http://www.nltk.org/book/ch05.html

## Chunking
Figuring out the meaning of the sentence.
What we do is combine part-of-speech tagging and regular expressions.
We will chunk into noun phrases (a noun and the descriptive group of words surrounding it; the words must be adjacent to each other).

### First, a small example on how chunking works

In [79]:
# NOTE: the line ending in "(IPPC 2016).\" has no space before the continuation,
# so the joined text reads "(IPPC 2016).This assessment" (visible in the output below).
sample_text = '''The purpose of this assessment is to evaluate the plant health risk of pests associated with the \
importation of organic wheat (Triticum aestivum, T. durum), including triticale (xTriticosecale) (hereafter referred \
to as wheat unless addressed explicitly) from the Republic of Kazakhstan (Kazakhstan). This document is the risk \
assessment portion of the pest risk analysis process. The method used by the CFIA to initiate and conduct this pest \
risk assessment is consistent with international guidelines (IPPC 2016).\
This assessment has considered the pest status of bacteria and phytoplasmas, fungi, viruses, nematodes, insects, \
mites, molluscs and weeds reported for organic wheat in this region. Appendix 1 provides a list of pests potentially \
associated with organic wheat from Kazakhstan.'''
sample_text

'The purpose of this assessment is to evaluate the plant health risk of pests associated with the importation of organic wheat (Triticum aestivum, T. durum), including triticale (xTriticosecale) (hereafter referred to as wheat unless addressed explicitly) from the Republic of Kazakhstan (Kazakhstan). This document is the risk assessment portion of the pest risk analysis process. The method used by the CFIA to initiate and conduct this pest risk assessment is consistent with international guidelines (IPPC 2016).This assessment has considered the pest status of bacteria and phytoplasmas, fungi, viruses, nematodes, insects, mites, molluscs and weeds reported for organic wheat in this region. Appendix 1 provides a list of pests potentially associated with organic wheat from Kazakhstan.'

##### Tokenize text into words and tag the words as pos

In [87]:
words = nltk.word_tokenize(sample_text)
tagged_words = nltk.pos_tag(words)
tagged_words

[('The', 'DT'),
 ('purpose', 'NN'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('assessment', 'NN'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('evaluate', 'VB'),
 ('the', 'DT'),
 ('plant', 'NN'),
 ('health', 'NN'),
 ('risk', 'NN'),
 ('of', 'IN'),
 ('pests', 'NNS'),
 ('associated', 'VBN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('importation', 'NN'),
 ('of', 'IN'),
 ('organic', 'JJ'),
 ('wheat', 'NN'),
 ('(', '('),
 ('Triticum', 'NNP'),
 ('aestivum', 'RB'),
 (',', ','),
 ('T.', 'NNP'),
 ('durum', 'NN'),
 (')', ')'),
 (',', ','),
 ('including', 'VBG'),
 ('triticale', 'NN'),
 ('(', '('),
 ('xTriticosecale', 'JJ'),
 (')', ')'),
 ('(', '('),
 ('hereafter', 'NN'),
 ('referred', 'VBN'),
 ('to', 'TO'),
 ('as', 'IN'),
 ('wheat', 'NN'),
 ('unless', 'IN'),
 ('addressed', 'VBN'),
 ('explicitly', 'RB'),
 (')', ')'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('Republic', 'NNP'),
 ('of', 'IN'),
 ('Kazakhstan', 'NNP'),
 ('(', '('),
 ('Kazakhstan', 'NNP'),
 (')', ')'),
 ('.', '.'),
 ('This', 'DT'),
 ('document', 'NN'),
 ('is', 'VBZ')

##### Use regex to chunk the text 

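Note that the following exception is raised when the grammar string has no stage label in front of its rule (e.g. the `Chunk:` prefix is missing); it is not caused by the pattern failing to match the text: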

ValueError: Expected stage marker (eg NP:)


In [91]:
chunk_expr = r'''Chunk: {<NN.?><NN.?>}'''
chunk_parser = nltk.RegexpParser(chunk_expr)
chunked = chunk_parser.parse(tagged_words)
chunked.draw()
#chunked

### Now a function that takes Jane Austen's tokenized "Sense and Sensibility" and does chunking.

In [96]:
def chunk_tokenized_text(text):
    """Chunk every sentence of a sentence-tokenized text.

    Each sentence is split into words, POS-tagged, and parsed with a regexp
    chunk grammar whose single stage is labelled ``Chunk``.

    Args:
        text: iterable of sentence strings (text already split into
            sentences).

    Returns:
        A list with the chunk parser's result for each processed sentence, in
        input order. If an exception occurs it is printed and the results
        built up to that point are returned.
    """
    chunked_text = []
    try:
        # Grammar: any number of adverbs, then any number of verbs, then one
        # or more proper nouns (NNP), optionally followed by one noun (NN).
        chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
        # The grammar does not depend on the sentence, so the parser is built
        # once here rather than once per sentence.
        chunk_parser = nltk.RegexpParser(chunk_gram)

        for sentence in text:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked_text.append(chunk_parser.parse(tagged))
    except Exception as e:
        # Best-effort: report the problem and keep what was chunked so far.
        print(e)

    return chunked_text

chunked_sens_and_sensibility = chunk_tokenized_text(tokenised_sense_and_sensibility)



In [97]:
len(chunked_sens_and_sensibility)

5525

In [98]:
chunked_sens_and_sensibility[0].draw()

## Chinking

Chinking removes something from a chunk, i.e. chunk everything except the specified tags.
A chink is written between }{ , instead of the {} used for a chunk.

In [100]:
sample_text = '''The purpose of this assessment is to evaluate the plant health risk of pests associated with the \
importation of organic wheat (Triticum aestivum, T. durum), including triticale (xTriticosecale) (hereafter referred \
to as wheat unless addressed explicitly) from the Republic of Kazakhstan (Kazakhstan). This document is the risk \
assessment portion of the pest risk analysis process. The method used by the CFIA to initiate and conduct this pest \
risk assessment is consistent with international guidelines (IPPC 2016).\
This assessment has considered the pest status of bacteria and phytoplasmas, fungi, viruses, nematodes, insects, \
mites, molluscs and weeds reported for organic wheat in this region. Appendix 1 provides a list of pests potentially \
associated with organic wheat from Kazakhstan.'''
words = nltk.word_tokenize(sample_text)
tagged_words = nltk.pos_tag(words)

chunk_expr = r'''Chunk: {<.*>+}
            }<VB.?|IN|DT|TO>{'''       ##  <=== this is where chinking is happening

chunk_parser = nltk.RegexpParser(chunk_expr)
chunked = chunk_parser.parse(tagged_words)
chunked.draw()

## Named Entity Recognition

Named entities that will be discovered depend on the corpus on which the chunker (ne_chunk) was trained. 
Note that the false positive rate for NER with nltk is high, and the general error rate is high, so it might not be a good option for most cases.

#### Description of the entities discovered
There doesn't seem to be a command or a full list of entities that are used by the chunker, but there is a website with the main ones:
http://www.nltk.org/book/ch07.html#tab-ne-types

Here they are:

`
ORGANIZATION Georgia-Pacific Corp., WHO 
PERSON       Eddy Bonte, President Obama 
LOCATION     Murray River, Mount Everest 
DATE         June, 2008-06-29 
TIME         two fifty a m, 1:30 p.m. 
MONEY        175 million Canadian Dollars, GBP 10.40 
PERCENT      twenty pct, 18.75 % 
FACILITY     Washington Monument, Stonehenge 
GPE          South East Asia, Midlothian `


In [3]:
sample_text = '''The purpose of this assessment is to evaluate the plant health risk of pests associated with the \
importation of organic wheat (Triticum aestivum, T. durum), including triticale (xTriticosecale) (hereafter referred \
to as wheat unless addressed explicitly) from the Republic of Kazakhstan (Kazakhstan). This document is the risk \
assessment portion of the pest risk analysis process. The method used by the CFIA to initiate and conduct this pest \
risk assessment is consistent with international guidelines (IPPC 2016).\
This assessment has considered the pest status of bacteria and phytoplasmas, fungi, viruses, nematodes, insects, \
mites, molluscs and weeds reported for organic wheat in this region. Appendix 1 provides a list of pests potentially \
associated with organic wheat from Kazakhstan.'''
words = nltk.word_tokenize(sample_text)
tagged_words = nltk.pos_tag(words)

named_entities = nltk.ne_chunk(tagged_words)
#named_entity = nltk.ne_chunk(tagged_words, binary = True)  # if you don't want LOCATION, GPE, etc. displayed, but just entities
named_entities.draw()

Selecting only specific named entities (ex. just geographic locations)

In [4]:
for subtree in named_entities.subtrees(filter=lambda x: x.label() == 'GPE'):
    print(subtree.leaves())

[('Kazakhstan', 'NNP')]
[('Kazakhstan', 'NNP')]
[('Kazakhstan', 'NNP')]


## Lemmatizing

Similar to stemming, but returns an actual word, not just the root. By default it works with nouns. For other parts of speech, pass the `pos` parameter, i.e. when lemmatizing a text you need to POS-tag it first and then feed the words with their tags to the lemmatizer.


In [17]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("bacteria"))   ## default of lemmatizer.lemmatize("bacteria", pos='n')
print(lemmatizer.lemmatize("bacterium"))
print(lemmatizer.lemmatize("mice"))
print(lemmatizer.lemmatize("better", pos='a'))
print(lemmatizer.lemmatize("best", pos='a'))
print(lemmatizer.lemmatize("running", pos='v'))
print(lemmatizer.lemmatize("were", pos='v'))

bacteria
bacterium
mouse
good
best
run
be


## NLTK Corpora and viewing NLTK parameters

Installed on ~/nltk_data. See the data on the file system. 
NLTK has some capability to handle csv data. Otherwise, use pandas.
Within NLTK, the corpus with the biggest capabilities is WordNet.

Some NLTK corpora are tagged with parts of speech. You can use these corpora to train POS taggers. If the corpus is POS-tagged, then you can do the following:

`nltk.corpus.<tagged corpus>.tagged_words()
nltk.corpus.<tagged corpus>.tagged_words(tagset='universal')`

Not all corpora have the same tagset. See tagset help and readme for details.

#### See where nltk is installed

In [19]:
nltk.__file__

'/Users/korolo/miniconda3/envs/ipython/lib/python3.6/site-packages/nltk/__init__.py'

To see where the NLTK data is, look in the nltk install dir (above) for the script called "data" and see the code for your OS. The default dir is defined there.

### WordNet corpus 

In [7]:
from nltk.corpus import wordnet

#### Check corpus's info (readme)

In [6]:
nltk.corpus.wordnet.readme()

'This data is formatted by the Open Multilingual Wordnet Project\nhttp://compling.hss.ntu.edu.sg/omw/\nto be used by NLTK.\n\nPlease cite us if you find the aggregation useful (see citation.bib)\nand email us if you have any suggestions.\n\nFrancis Bond (bond@ieee.org)\n2015-09-24\n\n\n27 languages covered (and we assume you have English):\nals\narb\nbul\ncat\ncmn\ndan\nell\neus\nfas\nfin\nfra\nglg\nheb\nhrv\nind\nita\njpn\nnno\nnob\npol\npor\nqcn\nslv\nspa\nswe\ntha\nzsm \n\n'

#### Finding synonyms

In [40]:
synonyms = wordnet.synsets("cloud")
synonyms

[Synset('cloud.n.01'),
 Synset('cloud.n.02'),
 Synset('cloud.n.03'),
 Synset('cloud.n.04'),
 Synset('cloud.n.05'),
 Synset('swarm.n.02'),
 Synset('overcast.v.01'),
 Synset('obscure.v.01'),
 Synset('cloud.v.03'),
 Synset('cloud.v.04'),
 Synset('defile.v.01'),
 Synset('cloud.v.06'),
 Synset('mottle.v.02'),
 Synset('cloud.v.08')]

In [41]:
# Just the word
synonyms[6].lemmas()[0].name()

'overcast'

In [42]:
synonyms[0].definition()

'any collection of particles (e.g., smoke or dust) or gases that is visible'

In [43]:
synonyms[6].examples()

['Fall weather often overcasts our beaches']

In [38]:
#### Finding all synonyms and antonyms with lemmas

In [51]:
# Collect the name of every lemma in every synset of "good" (the synonyms)
# together with the names of those lemmas' antonyms.
all_synonyms = []
all_antonyms = []
for synset in wordnet.synsets("good"):
    for lemma in synset.lemmas():
        all_synonyms.append(lemma.name())
        # Extending with an empty antonym list is a no-op, so no guard is
        # needed and antonyms() is only called once per lemma.
        all_antonyms.extend(antonym.name() for antonym in lemma.antonyms())

# Convert to a set so duplicates are shown only once.
set(all_synonyms)

{'adept',
 'beneficial',
 'commodity',
 'dear',
 'dependable',
 'effective',
 'estimable',
 'expert',
 'full',
 'good',
 'goodness',
 'honest',
 'honorable',
 'in_effect',
 'in_force',
 'just',
 'near',
 'practiced',
 'proficient',
 'respectable',
 'right',
 'ripe',
 'safe',
 'salutary',
 'secure',
 'serious',
 'skilful',
 'skillful',
 'sound',
 'soundly',
 'thoroughly',
 'trade_good',
 'undecomposed',
 'unspoiled',
 'unspoilt',
 'upright',
 'well'}

In [52]:
set(all_antonyms)

{'bad', 'badness', 'evil', 'evilness', 'ill'}

#### Semantic similarity
Use synsets

In [54]:
w1 = wordnet.synset("bacteria.n.01")
w2 = wordnet.synset("virus.n.01")

## Use Wu and Palmer semantic similarity score
w1.wup_similarity(w2)

0.875

In [58]:
## Does not work well on scientific identification
w1 = wordnet.synset("wheat.n.01")
w2 = wordnet.synset("triticum.n.01")
#w2 = wordnet.synset("durum.n.01")  ## better with durum
w1.wup_similarity(w2)

0.9629629629629629