In [None]:
# open the new dataset

import codecs, nltk

# Read the whole article as text. The context manager closes the file handle
# as soon as reading is done (or fails) instead of leaving it open.
with codecs.open("../datasets/CleanedArticles/15.txt", "r", "utf-8") as article_file:
    article = article_file.read()

# split into sentences
sentences = nltk.sent_tokenize(article)

# take one single sentence (index 1, i.e. the second sentence of the article)
sentence = sentences[1]

# tokenize it
tokenized_sentence = nltk.word_tokenize(sentence)

# the pos-tagger gives you back a list of tuples (word, pos)
pos_sentence = nltk.pos_tag(tokenized_sentence)

print(pos_sentence)

In [None]:
# combining lemmatization and pos tagging
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# Tokens whose tag starts with "V" are lemmatized as verbs (pos "v");
# every other token goes through the lemmatizer's default call.
lemma_words = [
    wordnet_lemmatizer.lemmatize(token, "v") if tag[0] == "V"
    else wordnet_lemmatizer.lemmatize(token)
    for token, tag in pos_sentence
]

print(lemma_words)

In [None]:
# let's now define a function that does all we need
from nltk.corpus import stopwords

# English stop words from nltk's stopwords corpus; nlp_pipeline (defined
# below) drops every token that appears in this list.
stop_word_list = stopwords.words('english')

# input should be a string
def nlp_pipeline(text):
    """Tokenize, POS-tag, lemmatize and filter a piece of text.

    Steps, in order:
      1. word-tokenize the raw string (no sentence splitting is done);
      2. POS-tag the tokens;
      3. lowercase each token and lemmatize it - as a verb when its tag
         starts with "V", with the lemmatizer's default call otherwise;
      4. drop lemmas that are not purely alphabetic (punctuation, numbers);
      5. drop lemmas listed in ``stop_word_list`` - be careful with this
         step, it removes words some analyses may still need.

    Relies on the notebook-level names ``nltk``, ``wordnet_lemmatizer`` and
    ``stop_word_list`` being defined before the function is called.

    Args:
        text: the raw text to process, as a single string.

    Returns:
        A list of lowercase lemma strings, in their original order.
    """
    # if you want you can split in sentences first - usually skipped here
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    # A set gives constant-time membership tests; it is rebuilt on each call
    # so later changes to stop_word_list are still honoured.
    stop_words = set(stop_word_list)

    lemmas = []
    for token, tag in tagged_tokens:
        lowered = token.lower()
        # startswith() is safe even if a tagger ever returns an empty tag
        if tag.startswith("V"):
            lemma = wordnet_lemmatizer.lemmatize(lowered, "v")
        else:
            lemma = wordnet_lemmatizer.lemmatize(lowered)

        # keep alphabetic, non-stop-word lemmas only
        if lemma.isalpha() and lemma not in stop_words:
            lemmas.append(lemma)

    return lemmas

In [None]:
# run the pipeline on the single sentence selected in the first cell
clean_sentence = nlp_pipeline(sentence)
print (clean_sentence)

In [None]:
# let's take an entire article and use our pipeline!
# `article` is the full text read in the first cell, so this prints every
# token that survives the pipeline (the output can be long).

clean_article = nlp_pipeline(article)
print (clean_article)

In [None]:
# word sense disambiguation

# check documentation: http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn

# let's isolate each distinct word - a set keeps one copy of every element
# (a set is another type of object in python)

unique_words = set(clean_article)

# Let's check how many senses each word has. A set has no stable order
# (string hashing is randomized per process), so iterate in sorted order to
# get the same listing on every run.
for word in sorted(unique_words):
    print(word, len(wn.synsets(word)))


In [None]:
word = "loan"

senses = wn.synsets(word)

for sense in senses:
    # Each entry pairs a printed header with the lookup whose result is shown
    # under it; the lookups are called one at a time, in this order.
    lookups = (
        ("\ndefinition", sense.definition),
        # textual examples of the sense
        ("\nexample", sense.examples),
        ("\nhypernymy", sense.hypernyms),
        ("\nhyponyms", sense.hyponyms),
        # one way of getting synonyms - there are others
        ("\nsynonyms", sense.lemma_names),
        # antonyms of the first lemma only - works especially with adjectives
        ("\nantonyms", lambda: sense.lemmas()[0].antonyms()),
    )

    for header, lookup in lookups:
        print(header)
        print(lookup())

    print("\n\n")


In [None]:
# two sentences that both mention "cell", in very different contexts

sent1 = "The terrorist cell was neutralized near the southern Russian city of Makhachkala, the capital of the Republic of Dagestan."

sent2 = "The molecule, which uses light energy to move protons across a somatic cell membrane, proved unsuitable for crystallography."

# both sentences go through the same cleaning pipeline
clean_sent1, clean_sent2 = [nlp_pipeline(raw) for raw in (sent1, sent2)]

for label, cleaned in (("clean sent 1:", clean_sent1), ("clean sent 2:", clean_sent2)):
    print(label, cleaned)
print(" ")

# for each possible sense of "cell" you can, for instance, check the overlap between the definition and the sentence

In [None]:
word = "cell"

senses = wn.synsets(word)

# The two sentence vocabularies are the same for every sense, so build the
# sets once here instead of on each loop iteration.
context_1 = set(clean_sent1)
context_2 = set(clean_sent2)

for sense in senses:
    # get definition of sense
    definition = sense.definition()

    # clean the definition with the same pipeline used for the sentences,
    # so both sides are compared as lowercase lemmas
    clean_definition = nlp_pipeline(definition)

    # words shared between each sentence and this definition
    inters_1 = context_1.intersection(clean_definition)
    inters_2 = context_2.intersection(clean_definition)

    print(definition)
    print("clean definition:", clean_definition)
    print("intersection with sent 1:", inters_1)
    print("intersection with sent 2:", inters_2)
    # overlap sizes: the larger count marks the sentence this sense fits better
    print(len(inters_1), len(inters_2))
    print(" ")

In [None]:
# homework: find the best sense - implement your version of the Lesk algorithm: https://en.wikipedia.org/wiki/Lesk_algorithm



Homework 2: get the JSON file with Donald Trump's tweets and improve his vocabulary by replacing his poor choices of adjectives with more sophisticated synonyms (e.g. "bad ratings on the Emmys last night" -> "substandard ratings on the Emmys last night")
 
or

make his tweets nicer by replacing adjectives with their antonyms (e.g. "bad ratings on the Emmys last night" -> "excellent ratings on the Emmys last night")

To do this, you need to combine:

- text processing (POS tagging + WordNet)
- a way of deciding whether one word is "more sophisticated" than another
