Natural Language Processing using NLTK


In [0]:
# Install NLTK first if needed: pip install nltk
import nltk

# Data packages used by this notebook:
#   wordnet   - lexical database used by WordNetLemmatizer
#   punkt     - tokenizer models loaded by older NLTK releases
#   punkt_tab - pickle-free tokenizer tables that NLTK 3.9+ loads instead of punkt
# Fetching both keeps sent_tokenize/word_tokenize working on either generation.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

NLP Part 0 - Get some Data!


In [0]:
import re
import urllib
import urllib.request

import bs4 as bs

In [0]:
# We will read the contents of the Wikipedia article "Global_warming" as an example; feel free to use your own!
# "urllib.request.urlopen" opens the page and ".read()" returns the raw HTML bytes.
# The "with" block closes the HTTP connection as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming') as response:
    source = response.read()

# Beautiful Soup is a Python library for pulling data out of HTML and XML files.
# "html.parser" is Python's built-in parser, so no extra parser library is required here.
# Parsing the data/creating BeautifulSoup object
soup = bs.BeautifulSoup(source, "html.parser")

# Fetching the data
# The <p> tag defines a paragraph in the web page. Paragraphs are joined with a
# space so the last word of one can never fuse with the first word of the next;
# any surplus whitespace is collapsed by the final substitution below.
text = " ".join(paragraph.text for paragraph in soup.find_all('p'))

# Preprocessing the data
text = re.sub(r'\[[0-9]*\]', ' ', text)  # drop citation markers such as [12] ([0-9]* = zero or more digits)
text = text.lower()  # everything to lowercase
# Replace every character that is neither a word character, whitespace, nor one
# of the sentence terminators . ? ! -- a negated character class is needed for
# this; a pattern like \W^.?! contains a mid-pattern ^ anchor and never matches.
text = re.sub(r'[^\w\s.?!]', ' ', text)
text = re.sub(r'\d', ' ', text)  # \d --> matches any decimal digit
text = re.sub(r'\s+', ' ', text)  # \s+ --> collapse each run of whitespace ([ \t\n\r\f\v]) to one space

In [0]:
# Sanity check: show the first 100 characters of the cleaned text.
text[:100]

NLP Part 1 - Tokenization of paragraphs/sentences

In [0]:

# Tokenizing sentences
# NOTE(review): `text` was lowercased during preprocessing before being split
# into sentences here -- confirm that ordering is intended.
sentences = nltk.sent_tokenize(text)  # split the cleaned text string into a list of sentence strings

In [0]:

# Show the first ten sentences produced by the sentence tokenizer.
print(sentences[:10])

In [0]:
# Tokenizing words
# all_words is built from the full cleaned text (stop words still included).
all_words = nltk.word_tokenize(text)  # split the cleaned text string into a list of word tokens

In [0]:
# Show the first ten word tokens.
print(all_words[:10])

NLP Part 2 - Stopwords and Punctuation

In [0]:
# Fetch the stop-word lists, then import the corpus reader that exposes them
# (used below as stopwords.words('english')).
nltk.download('stopwords')
from nltk.corpus import stopwords

In [0]:
# Removing stopwords
def remove_stopwords(sentences):
    """Return the sentences with English stop words removed.

    Each sentence is split with ``nltk.word_tokenize``; tokens that appear in
    ``stopwords.words('english')`` are dropped and the remaining tokens are
    re-joined with single spaces.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list with one filtered string per input sentence. The input is
        left untouched, so re-running a cell on the same data is safe.
    """
    # Fetch the stop-word list once and keep it in a set: asking for it again
    # for every token is needlessly slow, and set membership is O(1).
    stop_words = set(stopwords.words('english'))
    return [
        ' '.join(word for word in nltk.word_tokenize(sentence) if word not in stop_words)
        for sentence in sentences
    ]
# Replace `sentences` with its stop-word-free version and preview the first ten.
sentences = remove_stopwords(sentences)
print(sentences[:10]) # stop words have been eliminated

In [0]:
def remove_punctuation(sentences, punctuation=",.?!()"):
    """Return the sentences with punctuation-only tokens removed.

    Each sentence is split with ``nltk.word_tokenize``; a token is dropped when
    every one of its characters is listed in ``punctuation`` (so "." and "..."
    are both removed), and the remaining tokens are re-joined with single spaces.

    Args:
        sentences: Iterable of sentence strings.
        punctuation: String whose characters count as punctuation.
            Defaults to ``",.?!()"``.

    Returns:
        A new list with one filtered string per input sentence. The input is
        left untouched.
    """
    # Compare character by character against a set. A plain `token in "..."`
    # test on a string is a substring check, which only drops tokens that
    # happen to appear contiguously in the punctuation string.
    punctuation_chars = set(punctuation)
    return [
        ' '.join(
            word
            for word in nltk.word_tokenize(sentence)
            if not all(char in punctuation_chars for char in word)
        )
        for sentence in sentences
    ]
# Replace `sentences` with its punctuation-free version and preview the first ten.
sentences = remove_punctuation(sentences)
print(sentences[:10]) # punctuation tokens have been eliminated

NLP Part 3a - Stemming the words

In [0]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Stem several inflections of the same word in one go and display them side by
# side (word -> stem), instead of uncommenting and re-running one line at a time.
{word: stemmer.stem(word) for word in ('troubled', 'trouble', 'troubling', 'troubles')}

In [0]:
# Rebuild the sentence list from the cleaned text and strip stop words again,
# so stemming starts from a fresh copy of the sentences.
# NOTE(review): unlike the lemmatization cell, punctuation is not removed here -- confirm intended.
sentences = nltk.sent_tokenize(text)
sentences = remove_stopwords(sentences)
stemmer = PorterStemmer()

# Stemming
def stem_sentences(sentences):
    """Return the sentences with every token replaced by its stem.

    Each sentence is split with ``nltk.word_tokenize``, every token is passed
    through the notebook-level ``stemmer``, and the stems are re-joined with
    single spaces.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list with one stemmed string per input sentence. The input is
        left untouched, so the result never aliases the caller's list.
    """
    return [
        ' '.join(stemmer.stem(word) for word in nltk.word_tokenize(sentence))
        for sentence in sentences
    ]

# Stem every sentence and preview the first ten results.
stemmed_sentences = stem_sentences(sentences)
print(stemmed_sentences[:10])

NLP Part 3b - Lemmatization

In [0]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Start again from the cleaned text (not the stemmed output): re-split it into
# sentences, then strip stop words and punctuation before lemmatizing.
sentences = nltk.sent_tokenize(text)
sentences = remove_stopwords(sentences)
sentences = remove_punctuation(sentences)

# Lemmatization
def lem_sentences(sentences):
    """Return the sentences with every token replaced by its lemma.

    Each sentence is split with ``nltk.word_tokenize``, every token is passed
    through the notebook-level ``lemmatizer``, and the lemmas are re-joined
    with single spaces. ``lemmatize`` is called without a part-of-speech
    argument, so the lemmatizer's default part of speech applies to every token.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list with one lemmatized string per input sentence. The input is
        left untouched.
    """
    return [
        ' '.join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(sentence))
        for sentence in sentences
    ]
# Replace `sentences` with its lemmatized version and preview the first ten.
sentences = lem_sentences(sentences)
print(sentences[:10]) 

NLP Part 4 - POS Tagging

In [0]:
# Tagger models for nltk.pos_tag. Older NLTK releases load
# 'averaged_perceptron_tagger'; NLTK 3.9+ looks for the pickle-free
# 'averaged_perceptron_tagger_eng' package instead, so fetch both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [0]:
# POS Tagging
#
# Tag examples:
#   CC - coordinating conjunction
#   NN - noun, singular (cat, tree)
#
# all_words comes from nltk.word_tokenize(text), i.e. the cleaned text before any
# stop-word removal, stemming or lemmatization. To re-create it in this cell, run:
#   all_words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(all_words)

# Each element of tagged_words is a (word, part-of-speech abbreviation) pair.
# Turn every pair into a single "word_TAG" string ...
word_tags = [token + "_" + tag for token, tag in tagged_words]

# ... and join them into one tagged paragraph; show its first 1000 characters.
tagged_paragraph = ' '.join(word_tags)
print(tagged_paragraph[:1000])