In [None]:
#!pip install stanfordnlp

# http://www.linguisticsweb.org/doku.php?id=linguisticsweb:tutorials:linguistics_tutorials:automaticannotation:stanford_pos_tagger_python
# running the Stanford POS Tagger from NLTK
#!pip install nltk
import nltk
from nltk import word_tokenize
from nltk import StanfordTagger

# Download the NLTK resources used by the POS-tagging cell (no-op if present):
#   punkt                      - tokenizer models
#   averaged_perceptron_tagger - model behind nltk.pos_tag
#   tagsets                    - tag documentation for nltk.help.upenn_tagset()
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

import re
# remove punctuation (except apostrophes) from a unicode string,
# then split the sentence into words (tokenization)
def word_token(sentence):
  """Strip punctuation (except apostrophes) and split a sentence into words.

  Args:
    sentence: Text to tokenize.

  Returns:
    List of word strings with no empty entries. An empty or
    punctuation-only sentence yields an empty list.
  """
  # Drop every character that is not a word character, an apostrophe or
  # whitespace.
  sentence = re.sub(r"[^\w\d'\s]+",'',sentence)
  # split() with no argument splits on any run of whitespace, so the double
  # spaces left behind by removed punctuation (e.g. "a - b") and newlines or
  # tabs never produce empty-string or multi-word tokens.
  words = sentence.split()
  return words

"""
# all tags with the abbreviations
CC :	 conjunction, coordinating
CD :	 numeral, cardinal
DT :	 determiner
EX :	 existential
FW :	 foreign word
IN :	 preposition
JJ :	 adjective
LS :	 list marker
MD :	 modal auxiliary
NN :	 noun
PDT :	 pre-determiner
POS :	 genitive marker
PR :	 pronoun
RB :	 adverb
RP :	 particle
SYM :	 symbol
UH :	 interjection
VB :	 verb
WDT :	 WH-determiner
WP :	 WH-pronoun
WRB :	 Wh-adverb

#nltk.help.upenn_tagset()
# to print out all tag-set
"""
# pos-tagging all words
def pos_tagging(sentence):
  """Tag every word of `sentence` and map it to a readable part-of-speech name.

  Args:
    sentence: Text to tag; it is tokenized with word_token().

  Returns:
    Dict mapping each word to the description of its tag. A word that occurs
    several times keeps the description of its last occurrence. Tags that
    have no entry in the description table are returned unchanged.
  """
  # there are different tags as above when using Stanford toolkit
  tags = {
    'CC': 'conjunction, coordinating',
    'CD': 'numeral, cardinal',
    'DT': 'determiner',
    'EX': 'existential',
    'FW': 'foreign word',
    'IN': 'preposition',
    'JJ': 'adjective',
    'LS': 'list marker',
    'MD': 'modal auxiliary',
    'NN': 'noun',
    'PDT': 'pre-determiner',
    'POS': 'genitive marker',
    'PR': 'pronoun',
    'RB': 'adverb',
    'RP': 'particle',
    'SYM': 'symbol',
    'UH': 'interjection',
    'VB': 'verb',
    'WDT': 'WH-determiner',
    'WP': 'WH-pronoun',
    'WRB': 'Wh-adverb',
  }
  # Tag the sentence that was passed in (not a notebook-level variable).
  # Empty tokens carry no word to tag, so they are skipped.
  words = [w for w in word_token(sentence) if w]
  pos_tagged = nltk.pos_tag(words)
  dic = {}
  for w, tag in pos_tagged:
    # Try the exact tag first (PDT, POS, WDT, ...), then its two-letter family
    # (NNS -> NN, VBD -> VB), and finally fall back to the raw tag so an
    # unlisted tag such as TO cannot raise KeyError.
    dic[w] = tags.get(tag, tags.get(tag[:2], tag))
  return dic

# Sample paragraph used to demonstrate the POS tagger.
text_example = "Avengers: Endgame is a 2019 American superhero film based on the Marvel Comics superhero team the Avengers, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures. The movie features an ensemble cast including Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth, and others. (Source: wikipedia)."
# Last expression of the cell: its return value is what the notebook displays.
pos_tagging(text_example)


In [None]:
# https://stackabuse.com/spelling-correction-in-python-with-textblob/
# https://pypi.org/project/pyspellchecker/

#!pip install pyspellchecker
from spellchecker import SpellChecker
#!pip install textblob
from textblob import TextBlob

def check_spelling(sentence):
  """Suggest a correction for each word that pyspellchecker does not know.

  Args:
    sentence: Text to check; it is tokenized with word_token().

  Returns:
    Dict mapping every unknown word to the value returned by
    SpellChecker.correction() for it.
  """
  checker = SpellChecker()
  unknown_words = checker.unknown(word_token(sentence))
  # checker.candidates(word) is the alternative call if all suggestions are
  # wanted rather than a single one.
  return {word: checker.correction(word) for word in unknown_words}

def check_spelling_update(sentence):
  """Suggest corrections by running TextBlob's correct() on each word.

  Args:
    sentence: Text to check; it is tokenized with word_token().

  Returns:
    Dict mapping each word whose corrected form differs from the original
    to that corrected form (as a plain string).
  """
  corrections = {}
  for token in word_token(sentence):
    fixed = str(TextBlob(token).correct())
    if fixed == token:
      # TextBlob left the word as it was, so there is nothing to report.
      continue
    corrections[token] = fixed
  return corrections

def check(sentence):
  """Combine the suggestions of both spell checkers for `sentence`.

  Args:
    sentence: Text to check.

  Returns:
    Dict of word -> suggested correction. When both checkers flag the same
    word, the suggestion from check_spelling_update() wins.
  """
  # Start from the pyspellchecker result, then let the TextBlob result
  # overwrite any shared keys.
  merged = dict(check_spelling(sentence))
  merged.update(check_spelling_update(sentence))
  return merged

# find those words that may be misspelled
sentence = 'something is hapenning here, what do you abl about taht'
# The first two calls run each checker on its own. Their return values are
# discarded, because a notebook cell only displays its last expression.
check_spelling(sentence)
check_spelling_update(sentence)
# Merged suggestions of both checkers; this is the value shown as cell output.
check(sentence)

{'abl': 'all', 'hapenning': 'happening', 'taht': 'that'}

In [None]:
#!pip install textblob
from textblob import TextBlob
# Correct the whole sentence in a single TextBlob call, rather than word by
# word. `sentence` is the variable defined in the spell-checking cell above,
# so that cell has to be run first.
textBlb = TextBlob(sentence)
textCorrected = textBlb.correct() 
print(textCorrected)

something is happening here, what do you all about that


In [None]:

# https://www.pluralsight.com/guides/natural-language-processing-named-entity-recognition
# NER
# Imports and NLTK resource downloads for the named-entity-recognition cells.
# NOTE(review): word_tokenize is imported three times below, and Counter,
# WordNetLemmatizer and stopwords are not used in the cells visible here.
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('wordnet')  #download if using this module for the first time
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')    #download if using this module for the first time
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
# Resources for tokenizing, tagging and chunking; presumably 'words' and
# 'maxent_ne_chunker' are the ones ne_chunk needs - confirm against NLTK docs.
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')


In [None]:
# Sample paragraph for the NER walkthrough (same text as text_example in the
# POS-tagging cell).
textexample = "Avengers: Endgame is a 2019 American superhero film based on the Marvel Comics superhero team the Avengers, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures. The movie features an ensemble cast including Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth, and others. (Source: wikipedia)."
#print(textexample)


#Word Tokenization
# Split the paragraph into sentences, then split each sentence into tokens,
# giving one token list per sentence.
sentences = nltk.sent_tokenize(textexample)
tokenized_sentence = [nltk.word_tokenize(sent) for sent in sentences]
# Bare expression in the middle of a cell: evaluated, but not displayed.
tokenized_sentence 


"""DT: determiner
IN: preposition/subordinating conjunction
JJ: adjective ‘big’
JJR: adjective, comparative ‘bigger’
JJS: adjective, superlative ‘biggest’
LS: list marker
NN: noun, singular ‘desk’
NNS: noun plural ‘desks’
NNP: proper noun, singular ‘Harrison’
NNPS: proper noun, plural ‘Americans’
PRP: personal pronoun I, he, she
RB: adverb very, silently,
UH: interjection
VB: verb, base form take
VBD: verb, past tense took"""

#Parts of Speech (POS) Tagging
# Tag each tokenized sentence separately (one tagged list per sentence).
# NOTE(review): this result is not read again in the cells visible here; the
# chunking step further down uses preprocess(textexample) instead.
pos_tagging_sentences = [nltk.pos_tag(sent) for sent in tokenized_sentence]
def preprocess(text):
    """Tokenize `text` into words and POS-tag the tokens in one step.

    Args:
        text: Raw text to process.

    Returns:
        The result of nltk.pos_tag applied to nltk.word_tokenize(text).
    """
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens)

# Tokenize and tag the whole paragraph at once (no sentence splitting here).
processed_text = preprocess(textexample)
# Bare expression in the middle of a cell: evaluated, but not displayed.
processed_text

#Chunking - perform NER analysis.
res_chunk = ne_chunk(processed_text)

# Walk the string form of the chunk result line by line and print only the
# lines containing a noun tag (NN, NNS, NNP, NNPS). Named entities show up as
# parenthesised groups such as "(PERSON Chris/NNP Evans/NNP)".
for x in str(res_chunk).split('\n'):
    if '/NN' in x:
        print(x)


  Avengers/NNS
  Endgame/NN
  superhero/NN
  film/NN
  (ORGANIZATION Marvel/NNP Comics/NNP)
  superhero/NN
  team/NN
  (ORGANIZATION Avengers/NNPS)
  (PERSON Marvel/NNP Studios/NNP)
  (PERSON Walt/NNP Disney/NNP Studios/NNP)
  Motion/NNP
  Pictures/NNP
  movie/NN
  cast/NN
  (PERSON Robert/NNP Downey/NNP Jr./NNP)
  (PERSON Chris/NNP Evans/NNP)
  (PERSON Mark/NNP Ruffalo/NNP)
  (PERSON Chris/NNP Hemsworth/NNP)
  others/NNS
  (PERSON Source/NN)
  wikipedia/NN


In [None]:
# https://realpython.com/python-nltk-sentiment-analysis/

from nltk.sentiment import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
"""The negative, neutral, and positive scores are related: 
They all add up to 1 and can’t be negative. 
The compound score is calculated differently. 
It’s not just an average, and it can range from -1 to 1."""

# Analyzer instance shared by the rest of this cell, including is_positive().
sia = SentimentIntensityAnalyzer()
# Example call; as a bare expression in the middle of a cell, its result is
# not displayed.
sia.polarity_scores("Wow, NLTK is really powerful!")

#
#nltk.download('twitter_samples')
# Load the sample tweets and drop the colon from every "://" (so "http://"
# becomes "http//").
# NOTE(review): the reason for this rewrite is not stated here - confirm.
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()]
from random import shuffle

def is_positive(tweet: str) -> bool:
    """Return whether the compound sentiment score of `tweet` is above zero.

    The score comes from the notebook-level analyzer `sia`; a compound score
    of exactly 0 counts as not positive.
    """
    scores = sia.polarity_scores(tweet)
    compound = scores["compound"]
    return compound > 0

# Show the is_positive() verdict for ten tweets picked at random.
# NOTE(review): shuffle() is called without a seed, so the ten tweets differ
# from run to run.
shuffle(tweets)
for tweet in tweets[:10]:
    print(">", is_positive(tweet), tweet)

#nltk.download('movie_reviews')
# File ids of the movie_reviews corpus for the "pos" and "neg" categories,
# then both lists concatenated with the positive ids first.
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = positive_review_ids + negative_review_ids



> False RT @Those3Amigos: 'Secret' Tory plans for £8bn in welfare cuts exposed by Danny Alexander http//t.co/94BqGpTXM8
> False RT @mrmarksteel: Tomorrow Miliband will say 'let me tell you this, I'm not even voting for myself in case I do a deal with the SNP'.
> True RT @BBCPolitics: David Cameron says child benefit is "key" for UK families http//t.co/jsd8Jb1lYA #bbcqt http//t.co/c13CsAKr4Q
> True Hi BAM ! @BarsAndMelody 
Can you follow my bestfriend @969Horan696 ? 
She loves you a lot :) 
See you in Warsaw &lt;3 
Love you &lt;3 x39
> False @LIAMREILLY20 I can ignore the journos, it's just if Cameron, Osborne or Clegg are on I fear for my TV with heavy objects close to hand
> True @paynepowerr WITHOUT ME :(
> True @NickTheBullsFan @CashNastyGaming still suck at editing but if this turns into something special I owe it to you guys :)
> True @Michex69 @rich_falconer point is Tories will discredit SNP in England exploiting base nationalist instincts , labour is best bet !
> False RT @thom