### Connecting Google Colab with your Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Project folder on the mounted Google Drive; it becomes the working directory
# for the rest of the notebook.
desired_path = "/content/drive/MyDrive/NLP"

# Create the folder on the first run, otherwise just report that it is there.
if not os.path.exists(desired_path):
    # exist_ok=True keeps the call safe if the folder appears between the
    # existence check and the creation.
    os.makedirs(desired_path, exist_ok=True)
    print(f"Directory '{desired_path}' created successfully.")
else:
    print(f"Directory '{desired_path}' already exists.")

# Change the current working directory to the project folder.
os.chdir(desired_path)

Directory '/content/drive/MyDrive/NLP' already exists.


## Text Preprocessing

### Natural Language Toolkit

* NLTK is a leading platform for building Python programs to work with human language data.
* NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.
* NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike.
* NLTK is available for Windows, Mac OS X, and Linux.
* Best of all, NLTK is a free, open source, community-driven project. For more details - www.nltk.org

In [None]:
import nltk




```Python
!pip install nltk

nltk.download()
```

#### We will be using a sample text to demonstrate various text pre-processing steps.

In [None]:
string = '''At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.
It was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.
My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.
Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.
The train was @09:30 AM and we have to reach the station by 08:30 AM. At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.
It was a perfect day, with a bright sun and a few fleecy clouds in the heavens.'''

In [None]:
string

'At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.\nIt was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.\nMy companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.\nSuddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.\nThe train was @09:30 AM and we have to reach the station by 08:30 AM. At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five

In [None]:
type(string)

str

In [None]:
len(string)

1114

In [None]:
string.count("Waterloo")

2

In [None]:
x=string.split()
len(x)

211

#### But what if we want to read the text from a file?

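Build the path to a `data` sub-folder of the current working directory with `os.getcwd()` and `os.path.join()` (the working directory itself was already set with `os.chdir()` in the setup cell at the top).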

In [None]:
# Build the path of the "data" sub-folder under the current working directory.
# Nothing here calls os.chdir(); the working directory is only read.
PATH = os.getcwd()
DATA_PATH = os.path.join(PATH, "data")


List the files that are present in the path

In [None]:
os.listdir()

[]

Reading from a text file

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Construct the full file path
file_path = os.path.join(current_directory, 'sample_text.txt')
# or file_path = os.path.join('/content/drive/MyDrive/NLP', "data",'sample_text.txt')
# or replace '/content/drive/MyDrive/NLP' with whatever the parent directory is that contains data
print(f"Looking for file at: {file_path}")

# Open the file in read mode ('r') and load its whole content into `string`.
# The `with` block closes the file even if reading fails. When the file is not
# there, the sample text assigned to `string` earlier in the notebook is left
# untouched so the following cells still run.
if os.path.isfile(file_path):
    with open(file_path, 'r', encoding='utf-8') as text_file:
        string = text_file.read()
    print(f"Read {len(string)} characters from the file.")
else:
    print("File not found; continuing with the sample text defined earlier.")


Current working directory: /content/drive/MyDrive/NLP
Looking for file at: /content/drive/MyDrive/NLP/sample_text.txt


'r' stands for read operation.

'w' to write to a file and

'a' to append to an existing file

In [None]:
string

'At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.\nIt was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.\nMy companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.\nSuddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.\nThe train was @09:30 AM and we have to reach the station by 08:30 AM. At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five

#### Now that our sample text is ready, let us perform following steps:

1. Sentence Tokenizing
2. Word Tokenizing
3. Stop Word Removal
4. Lemmatization

### Tokenizing

Use the NLTK downloader to obtain the `punkt_tab` tokenizer resource

In [None]:
#nltk.download()

In [None]:
import nltk

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

#### sent_tokenize

`sent_tokenize` returns a sentence-tokenized copy of the text, using NLTK's recommended sentence tokenizer.

In [None]:
# Import NLTK's sentence tokenizer (nothing is installed in this cell).


from nltk.tokenize import sent_tokenize

# Split the sample text into a list with one string per sentence.
sent_tokens = sent_tokenize(string)

In [None]:
print(type(sent_tokens))
print(len(sent_tokens))

<class 'list'>
9


In [None]:
sent_tokens

['At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.',
 'It was a perfect day, with a bright sun and a few fleecy clouds in the heavens.',
 'The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth.',
 'To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.',
 'My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.',
 'Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.',
 'The train was @09:30 AM and we have to reach the station by 08:30 AM.',
 'At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn a

In [None]:
# Show every sentence on its own line, followed by two empty lines so the
# sentence boundaries are easy to spot.
for sentence in sent_tokens:
    print(sentence, end="\n\n\n")

At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.


It was a perfect day, with a bright sun and a few fleecy clouds in the heavens.


The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth.


To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.


My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.


Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.


The train was @09:30 AM and we have to reach the station by 08:30 AM.


At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for fou


#### word_tokenize
`word_tokenize` returns a tokenized copy of the text, using NLTK's recommended word tokenizer.

In [None]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(string)
print(tokens)

['At', 'Waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead', ',', 'where', 'we', 'hired', 'a', 'trap', 'at', 'the', 'station', 'inn', 'and', 'drove', 'for', 'four', 'or', 'five', 'miles', 'through', 'the', 'lovely', 'Surrey', 'lanes', '.', 'It', 'was', 'a', 'perfect', 'day', ',', 'with', 'a', 'bright', 'sun', 'and', 'a', 'few', 'fleecy', 'clouds', 'in', 'the', 'heavens', '.', 'The', 'trees', 'and', 'wayside', 'hedges', 'were', 'just', 'throwing', 'out', 'their', 'first', 'green', 'shoots', ',', 'and', 'the', 'air', 'was', 'full', 'of', 'the', 'pleasant', 'smell', 'of', 'the', 'moist', 'earth', '.', 'To', 'me', 'at', 'least', 'there', 'was', 'a', 'strange', 'contrast', 'between', 'the', 'sweet', 'promise', 'of', 'the', 'spring', 'and', 'this', 'sinister', 'quest', 'upon', 'which', 'we', 'were', 'engaged', '.', 'My', 'companion', 'Mr.', 'Alfred', 'sat', 'in', 'the', 'front', 'of', 'the', 'trap', ',', 'his', 'arms', 'folded', ',', 'his', 'hat', 'pull

In [None]:
print(tokens[0:11])

['At', 'Waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead']


## Regular-Expression Tokenizers

#### What is Regular Expression?

A RegEx or Regular Expression in a programming language is a special text string used for describing a search pattern. It is extremely useful for extracting information from text such as code, files, log, spreadsheets or even documents.

When using regular expressions, the first thing to recognize is that everything is essentially a character: we write patterns to match a specific sequence of characters (also referred to as a string). ASCII (Latin) letters are the ones found on a standard English keyboard, while Unicode covers text in other scripts. A pattern can also include digits, punctuation and special characters such as `$#@!%`.

A `RegexpTokenizer` splits a string into substrings using a regular expression. For example, the following tokenizer forms tokens out of alphabetic sequences, money expressions, and any other non-whitespace sequences: `r'\w+|\$[\d\.]+|\S+'` (note the escaped `\$`; an unescaped `$` would mean "end of string"). For more information or different variations -
http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
http://www.nltk.org/howto/tokenize.html

#### Regular Expressions

Regular expressions can contain both special and ordinary characters.

Most ordinary characters, like 'A', 'a', or '0', are the simplest regular expressions; they simply match themselves. You can concatenate ordinary characters, so `last` matches the string 'last'.


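* `\d` - Matches any decimal digit; with the `re.ASCII` flag this is equivalent to the class [0-9] (for Unicode string patterns it also matches digits from other scripts).
* `\D` - Matches any non-digit character; with the `re.ASCII` flag this is equivalent to the class [^0-9].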
* `\s` - Matches any whitespace character;
* `\S` - Matches any non-whitespace character;
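* `\w` - Matches any word character (letters, digits and the underscore); with the `re.ASCII` flag this is equivalent to the class [a-zA-Z0-9_], while for Unicode string patterns it also matches letters and digits from other scripts.
* `\W` - Matches any character that is not a word character; with the `re.ASCII` flag this is equivalent to the class [^a-zA-Z0-9_].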

The special characters are:

* `'.'` - (Dot.) In the default mode, this matches any character except a newline. If the DOTALL flag has been specified, this matches any character including a newline.
* `'^'` - (Caret.) Matches the start of the string, and in MULTILINE mode also matches immediately after each newline.
* `'$'` - Matches the end of the string or just before the newline at the end of the string, and in MULTILINE mode also matches before a newline.
* `'*'` - Causes the resulting RE to match 0 or more repetitions of the preceding RE, as many repetitions as are possible. `ab*` will match ‘a’, ‘ab’, or ‘a’ followed by any number of ‘b’s.
* `'+'` - Causes the resulting RE to match 1 or more repetitions of the preceding RE. `ab+` will match ‘a’ followed by any non-zero number of ‘b’s; it will not match just ‘a’.
* `'?'` - Causes the resulting RE to match 0 or 1 repetitions of the preceding RE. `ab?` will match either ‘a’ or ‘ab’.
* `'\'` - Either escapes special characters (permitting you to match characters like '*', '?', and so forth), or signals a special sequence; special sequences are discussed below.
* `[]` - Used to indicate a set of characters. Characters can be listed individually inside the set, e.g. `[amk]` will match 'a', 'm', or 'k'.
* `'|'` - A|B, where A and B can be arbitrary REs, creates a regular expression that will match either A or B. An arbitrary number of REs can be separated by the '|' in this way.


In [None]:
import re

In [None]:
x='this is a 90 thing in 9'


#### Examples using NLTK Regular Expression

In [None]:
# RegexpTokenizer is defined in nltk.tokenize, so import it from there rather
# than through nltk.corpus. The alias is kept because later cells use it.
from nltk.tokenize import RegexpTokenizer as regextoken

# Raw string: "\w" in a plain string literal is an invalid escape sequence and
# triggers a warning on current Python versions.
tokenizer = regextoken(r"\w+")

# Tokens are runs of word characters, so punctuation is dropped.
tokens = tokenizer.tokenize(string)
print(tokens)

['At', 'Waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead', 'where', 'we', 'hired', 'a', 'trap', 'at', 'the', 'station', 'inn', 'and', 'drove', 'for', 'four', 'or', 'five', 'miles', 'through', 'the', 'lovely', 'Surrey', 'lanes', 'It', 'was', 'a', 'perfect', 'day', 'with', 'a', 'bright', 'sun', 'and', 'a', 'few', 'fleecy', 'clouds', 'in', 'the', 'heavens', 'The', 'trees', 'and', 'wayside', 'hedges', 'were', 'just', 'throwing', 'out', 'their', 'first', 'green', 'shoots', 'and', 'the', 'air', 'was', 'full', 'of', 'the', 'pleasant', 'smell', 'of', 'the', 'moist', 'earth', 'To', 'me', 'at', 'least', 'there', 'was', 'a', 'strange', 'contrast', 'between', 'the', 'sweet', 'promise', 'of', 'the', 'spring', 'and', 'this', 'sinister', 'quest', 'upon', 'which', 'we', 'were', 'engaged', 'My', 'companion', 'Mr', 'Alfred', 'sat', 'in', 'the', 'front', 'of', 'the', 'trap', 'his', 'arms', 'folded', 'his', 'hat', 'pulled', 'down', 'over', 'his', 'eyes', 'and', 'hi

In [None]:
# `\w+` keeps runs of word characters together, so this input is split only at
# its spaces.
x = 'Atwaterloo we werefortunate'
t = regextoken(r'\w+')  # raw string avoids the invalid "\w" escape warning
t.tokenize(x)


['Atwaterloo', 'we', 'werefortunate']

In [None]:
# RegexpTokenizer is defined in nltk.tokenize; import it from its home module.
from nltk.tokenize import RegexpTokenizer as regextoken

# `\w` without a quantifier matches a single word character, so every letter
# and digit becomes its own token. The raw string avoids the invalid "\w"
# escape warning.
t = regextoken(r'\w')
to = t.tokenize(string)
print(to)

['A', 't', 'W', 'a', 't', 'e', 'r', 'l', 'o', 'o', 'w', 'e', 'w', 'e', 'r', 'e', 'f', 'o', 'r', 't', 'u', 'n', 'a', 't', 'e', 'i', 'n', 'c', 'a', 't', 'c', 'h', 'i', 'n', 'g', 'a', 't', 'r', 'a', 'i', 'n', 'f', 'o', 'r', 'L', 'e', 'a', 't', 'h', 'e', 'r', 'h', 'e', 'a', 'd', 'w', 'h', 'e', 'r', 'e', 'w', 'e', 'h', 'i', 'r', 'e', 'd', 'a', 't', 'r', 'a', 'p', 'a', 't', 't', 'h', 'e', 's', 't', 'a', 't', 'i', 'o', 'n', 'i', 'n', 'n', 'a', 'n', 'd', 'd', 'r', 'o', 'v', 'e', 'f', 'o', 'r', 'f', 'o', 'u', 'r', 'o', 'r', 'f', 'i', 'v', 'e', 'm', 'i', 'l', 'e', 's', 't', 'h', 'r', 'o', 'u', 'g', 'h', 't', 'h', 'e', 'l', 'o', 'v', 'e', 'l', 'y', 'S', 'u', 'r', 'r', 'e', 'y', 'l', 'a', 'n', 'e', 's', 'I', 't', 'w', 'a', 's', 'a', 'p', 'e', 'r', 'f', 'e', 'c', 't', 'd', 'a', 'y', 'w', 'i', 't', 'h', 'a', 'b', 'r', 'i', 'g', 'h', 't', 's', 'u', 'n', 'a', 'n', 'd', 'a', 'f', 'e', 'w', 'f', 'l', 'e', 'e', 'c', 'y', 'c', 'l', 'o', 'u', 'd', 's', 'i', 'n', 't', 'h', 'e', 'h', 'e', 'a', 'v', 'e', 'n',

In [None]:
st='This is A test example To understand cap tokenizer'

In [None]:
# An uppercase letter followed by at least one more word character, so a lone
# capital such as 'A' is not returned. The raw string avoids the invalid "\w"
# escape warning.
capword_tokenizer = regextoken(r'[A-Z]\w+')

tokens_cap = capword_tokenizer.tokenize(st)
print(tokens_cap)

['This', 'To']


In [None]:
# `*` allows zero following word characters, so single capital letters are
# returned as well. The raw string avoids the invalid "\w" escape warning.
capword_tokenizer = regextoken(r'[A-Z]\w*')

tokens_cap = capword_tokenizer.tokenize(st)
print(tokens_cap)

['This', 'A', 'To']


In [None]:
tokens_cap = capword_tokenizer.tokenize(string)
print(tokens_cap)

['At', 'Waterloo', 'Leatherhead', 'Surrey', 'It', 'The', 'To', 'My', 'Mr', 'Alfred', 'Suddenly', 'The', 'AM', 'AM', 'At', 'Waterloo', 'Leatherhead', 'Surrey', 'It']


In [None]:
# Words that contain "ee" anywhere: any number of word characters (possibly
# none) on either side. The raw string avoids the invalid "\w" escape warning.
text_tokenizer = regextoken(r'\w*ee\w*')

tokens_text = text_tokenizer.tokenize(string)
print(tokens_text)

['fleecy', 'trees', 'green', 'between', 'sweet', 'deepest', 'fleecy']


In [None]:
# `\w+` in front of "ee" requires at least one word character before it, so a
# word that starts with "ee" (here 'eehh') is not returned. The raw string
# avoids the invalid "\w" escape warning.
text_tokenizer = regextoken(r'\w+ee\w*')
st = 'teen and tween twee eehh'
text_tokenizer.tokenize(st)

['teen', 'tween', 'twee']

In [None]:
# Runs of digits: one ASCII digit followed by any number of further digits.
# The raw string avoids the invalid "\d" escape warning.
numeric_tokenizer = regextoken(r'[0-9]\d*')

tokens_numeric = numeric_tokenizer.tokenize(string)
print(tokens_numeric)

['09', '30', '08', '30']


#### Regular Expressions using "re"

"re" module included with Python is primarily used for string searching and manipulation. It is quite useful for text extraction and pre-processing. The most common use for "__re__" is to search for patterns in text.

In [None]:
import re

text = '''International School of Engineering (INSOFE) is an Applied Engineering school with area of focus in Data Science. It is located in Hyderabad, Bengaluru and Mumbai. It opened in 2011.
The program is delivered through classroom only sessions and is suitable for students and working professionals. Dr. Dakshinamurthy V Kolluru, Dr. Sridhar Pappu and A S L Ganapathi Kumar started the institution in Hyderabad in mid-2011 and expanded to Bengaluru in early-2016. Initially the school functioned under mentorship of Dr. Dakshinamurthy, Dr. Sridhar and Dr. Sreerama Murthy. They are now supported by a team of additional mentors and in-house data scientists.
In 2012, INSOFE also started Corporate training services. It extended operations to Bengaluru in 2016. CIO.com listed INSOFE 3rd in their list of "16 Big Data Certifications That Will Pay Off" consecutively from 2013-2016. Silicon India Magazine listed INSOFE in their list of "Top 5 Big Data Training Institutes 2016". Analytics India Magazine, listed INSOFE in "Top 9 Analytics Training Institutes in India in 2016". KDnuggets mentioned INSOFE in their list of Certificates in Analytics, Data Mining, and Data Science in 2014.
'''

In [None]:
# Tokenize the data
tokens1 = word_tokenize(text)
print(tokens1)

['International', 'School', 'of', 'Engineering', '(', 'INSOFE', ')', 'is', 'an', 'Applied', 'Engineering', 'school', 'with', 'area', 'of', 'focus', 'in', 'Data', 'Science', '.', 'It', 'is', 'located', 'in', 'Hyderabad', ',', 'Bengaluru', 'and', 'Mumbai', '.', 'It', 'opened', 'in', '2011', '.', 'The', 'program', 'is', 'delivered', 'through', 'classroom', 'only', 'sessions', 'and', 'is', 'suitable', 'for', 'students', 'and', 'working', 'professionals', '.', 'Dr.', 'Dakshinamurthy', 'V', 'Kolluru', ',', 'Dr.', 'Sridhar', 'Pappu', 'and', 'A', 'S', 'L', 'Ganapathi', 'Kumar', 'started', 'the', 'institution', 'in', 'Hyderabad', 'in', 'mid-2011', 'and', 'expanded', 'to', 'Bengaluru', 'in', 'early-2016', '.', 'Initially', 'the', 'school', 'functioned', 'under', 'mentorship', 'of', 'Dr.', 'Dakshinamurthy', ',', 'Dr.', 'Sridhar', 'and', 'Dr.', 'Sreerama', 'Murthy', '.', 'They', 'are', 'now', 'supported', 'by', 'a', 'team', 'of', 'additional', 'mentors', 'and', 'in-house', 'data', 'scientists', '.

Match pattern starting with I

In [None]:
[w for w in tokens1 if re.search('^I', w)]

['International',
 'INSOFE',
 'It',
 'It',
 'Initially',
 'In',
 'INSOFE',
 'It',
 'INSOFE',
 'India',
 'INSOFE',
 'Institutes',
 'India',
 'INSOFE',
 'Institutes',
 'India',
 'INSOFE']

Get all the tokens ending with either "ing" or "uru", using the pattern `ing$|uru$`

In [None]:
[w for w in tokens1 if re.search('ing$|uru$', w)]

['Engineering',
 'Engineering',
 'Bengaluru',
 'working',
 'Kolluru',
 'Bengaluru',
 'training',
 'Bengaluru',
 'Training',
 'Training',
 'Mining']

Get all the words that have H, B or M as their first letter

In [None]:
# Inside a character class "|" is a literal pipe, not alternation, so the set
# lists only the wanted letters; '^[H|B|M]' would also accept tokens that
# start with "|".
[w for w in tokens1 if re.search('^[HBM]', w)]

['Hyderabad',
 'Bengaluru',
 'Mumbai',
 'Hyderabad',
 'Bengaluru',
 'Murthy',
 'Bengaluru',
 'Big',
 'Magazine',
 'Big',
 'Magazine',
 'Mining']

In [None]:
# First letter H, B or M, written with alternation: "|" needs a group to act
# as "or" (inside [...] it would be a literal pipe character).
[w for w in tokens1 if re.search('^(?:H|B|M)', w)]

['Hyderabad',
 'Bengaluru',
 'Mumbai',
 'Hyderabad',
 'Bengaluru',
 'Murthy',
 'Bengaluru',
 'Big',
 'Magazine',
 'Big',
 'Magazine',
 'Mining']

Search for words - Hyderabad, Bengaluru and Mumbai

In [None]:
# Group the alternatives so that "^" anchors all three; in '^Hyd|Ben|Mum' only
# 'Hyd' is tied to the start and 'Ben' / 'Mum' would match anywhere in a token.
[w for w in tokens1 if re.search('^(?:Hyd|Ben|Mum)', w)]

['Hyderabad', 'Bengaluru', 'Mumbai', 'Hyderabad', 'Bengaluru', 'Bengaluru']

Search for the words 'Data', 'Analytics' or 'Science'


In [None]:
[w for w in tokens1 if re.search('Data|Ana|Sci', w)]

['Data',
 'Science',
 'Data',
 'Data',
 'Analytics',
 'Analytics',
 'Analytics',
 'Data',
 'Data',
 'Science']

Get all the words that end with "es"

In [None]:
[w for w in tokens1 if re.search('es$', w)]

['services', 'Institutes', 'Institutes', 'Certificates']

Extract pattern with numbers

In [None]:
[w for w in tokens1 if re.search('[0-9]', w)]

['2011',
 'mid-2011',
 'early-2016',
 '2012',
 '2016',
 '3rd',
 '16',
 '2013-2016',
 '5',
 '2016',
 '9',
 '2016',
 '2014']

In [None]:
x='This is this and That is that'
tokens2 = word_tokenize(x)
[w for w in tokens2 if re.search('^T',w)]

['This', 'That']

In [None]:
[w.upper() for w in tokens2]

['THIS', 'IS', 'THIS', 'AND', 'THAT', 'IS', 'THAT']

# Lower case

In [None]:
# Lower-case every token so that it can be compared with the lower-case
# stop-word list used further down.
# NOTE(review): `tokens` is whatever the most recently executed cell assigned
# to that name. The recorded output contains no punctuation tokens, which
# points to the RegexpTokenizer result rather than the word_tokenize one —
# confirm this is the intended input.
tokens = [token.lower() for token in tokens] # Converting list of tokens to lower case
print(tokens)

['at', 'waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'leatherhead', 'where', 'we', 'hired', 'a', 'trap', 'at', 'the', 'station', 'inn', 'and', 'drove', 'for', 'four', 'or', 'five', 'miles', 'through', 'the', 'lovely', 'surrey', 'lanes', 'it', 'was', 'a', 'perfect', 'day', 'with', 'a', 'bright', 'sun', 'and', 'a', 'few', 'fleecy', 'clouds', 'in', 'the', 'heavens', 'the', 'trees', 'and', 'wayside', 'hedges', 'were', 'just', 'throwing', 'out', 'their', 'first', 'green', 'shoots', 'and', 'the', 'air', 'was', 'full', 'of', 'the', 'pleasant', 'smell', 'of', 'the', 'moist', 'earth', 'to', 'me', 'at', 'least', 'there', 'was', 'a', 'strange', 'contrast', 'between', 'the', 'sweet', 'promise', 'of', 'the', 'spring', 'and', 'this', 'sinister', 'quest', 'upon', 'which', 'we', 'were', 'engaged', 'my', 'companion', 'mr', 'alfred', 'sat', 'in', 'the', 'front', 'of', 'the', 'trap', 'his', 'arms', 'folded', 'his', 'hat', 'pulled', 'down', 'over', 'his', 'eyes', 'and', 'hi

# Stopwords

A stop word is a commonly used word (such as "a", "an", "it", "in", "the") that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

We would not want these words taking up space in our database, or taking up valuable processing time. For this, we can remove them easily, by storing a list of words that you consider to be stop words. NLTK (Natural Language Toolkit) in python has a list of stopwords stored in 16 different languages. You can find them in the nltk_data directory. To check the list of stopwords you can type the following commands in the python shell.

Note: You can even modify the list by adding words of your choice to the `english` file in the stopwords directory.

In [None]:
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Stopword removal

In [None]:
# English stop-word list from NLTK (kept as a list under the name `stop`).
stop = stopwords.words('english')

# Look words up in a set: membership tests on a list scan it from the start
# for every token, a set answers in constant time.
stop_lookup = set(stop)

# Keep only the tokens that are not stop words.
tokens = [token for token in tokens if token not in stop_lookup]
print(tokens)

['waterloo', 'fortunate', 'catching', 'train', 'leatherhead', 'hired', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'miles', 'lovely', 'surrey', 'lanes', 'perfect', 'day', 'bright', 'sun', 'fleecy', 'clouds', 'heavens', 'trees', 'wayside', 'hedges', 'throwing', 'first', 'green', 'shoots', 'air', 'full', 'pleasant', 'smell', 'moist', 'earth', 'least', 'strange', 'contrast', 'sweet', 'promise', 'spring', 'sinister', 'quest', 'upon', 'engaged', 'companion', 'mr', 'alfred', 'sat', 'front', 'trap', 'arms', 'folded', 'hat', 'pulled', 'eyes', 'chin', 'sunk', 'upon', 'breast', 'buried', 'deepest', 'thought', 'suddenly', 'however', 'started', 'tapped', 'shoulder', 'pointed', 'meadows', 'train', '09', '30', 'reach', 'station', '08', '30', 'waterloo', 'fortunate', 'catching', 'train', 'leatherhead', 'hired', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'miles', 'lovely', 'surrey', 'lanes', 'perfect', 'day', 'bright', 'sun', 'fleecy', 'clouds', 'heavens']


In [None]:
"lie"
"lying"
stemming : "ly"
lemmatization : "lie"

# Stemmers and Lemmatizers

WordNet® is a large lexical database of English. Nouns, verbs, adjectives and adverbs are grouped into sets of cognitive synonyms (synsets), each expressing a distinct concept. WordNet’s structure makes it a useful tool for computational linguistics and natural language processing.

WordNet superficially resembles a thesaurus, in that it groups words together based on their meanings. However, there are some important distinctions. First, WordNet interlinks not just word forms—strings of letters—but specific senses of words. As a result, words that are found in close proximity to one another in the network are semantically disambiguated. Second, WordNet labels the semantic relations among words, whereas the groupings of words in a thesaurus do not follow any explicit pattern other than meaning similarity.

#### Stemmers vs. Lemmatizers

* Both stemmers and lemmatizers try to bring inflected words to the same form.
* Stemmers use an algorithmic approach of removing prefixes and suffixes. The result might not be an actual dictionary word.
* Lemmatizers use a corpus. The result is always a dictionary word.
* Lemmatizers need extra info about the part of speech they are processing.
* Stemmers are faster than lemmatizers

When to use stemmers and when to use lemmatizers? A few guidelines:
* If speed is important, use stemmers (lemmatizers have to search through a corpus while stemmers do simple operations on a string)
* If you just want to make sure that the system you are building is tolerant to inflections, use stemmers (If you query for “best bar in New York”, you’d accept an article on “Best bars in New York 2016″)
* If you need the actual dictionary word, use a lemmatizer. (for example, if you are building a natural language generation system)

How do stemmers work?

* Stemmers are extremely simple to use and very fast. They usually are the preferred choice. They work by applying different transformation rules on the word until no other transformation can be applied.


### Stemmers

**There are two Stemmer algorithms that can be used for stemming - Porter and Snowball**

* Porter: It is the most commonly used stemmer. It is one of the few stemmers that actually have Java support and it is also the most computationally intensive and the oldest algorithm by a large margin.

* Snowball: This is an improvement over porter. It is slightly faster in computation time than porter, with a reasonably large community around it.

In [None]:
from nltk.stem.snowball import SnowballStemmer

snow = SnowballStemmer('english')

print(snow.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [None]:
# Stem a handful of sample words with the Snowball stemmer, one result per line.
for word in ('getting', 'rabbits', 'xyzing', 'agreed', 'slowly'):
    print(snow.stem(word))

get
rabbit
xyze
agre
slowli


In [None]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

# Stem a handful of sample words with the Porter stemmer, one result per line.
for word in ('getting', 'rabbits', 'xyzing', 'agreed', 'slowly'):
    print(porter.stem(word))

get
rabbit
xyze
agre
slowli


### Lemmatizers

One major difference between stemming and lemmatization is that `lemmatize` takes a part-of-speech parameter, `pos`. If it is not supplied, the default is noun (`'n'`).

Obtain the resources using nltk.download('wordnet')

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

tokens_new = ["going", "gone", "go", "goes", "went"]
print(tokens_new)

tokens_new = [lmtzr.lemmatize(token, pos='v') for token in tokens_new]
print(tokens_new)

['going', 'gone', 'go', 'goes', 'went']
['go', 'go', 'go', 'go', 'go']


Stemmer and lemmatizer

In [None]:
stemmer = PorterStemmer()

lemtzr = WordNetLemmatizer()

In [None]:
plurals = ['Indian', 'caresses', 'flies', 'dies', 'education', 'denied', 'computer', 'computing', 'xyzing', 'done', 'slept']

In [None]:
singles = [stemmer.stem(plural) for plural in plurals]
print(singles)

tokensLmtz = [lemtzr.lemmatize(token, pos='v') for token in plurals]
print(tokensLmtz)

['indian', 'caress', 'fli', 'die', 'educ', 'deni', 'comput', 'comput', 'xyze', 'done', 'slept']
['Indian', 'caress', 'fly', 'die', 'education', 'deny', 'computer', 'compute', 'xyzing', 'do', 'sleep']


In [None]:
porter_tokens = [stemmer.stem(token) for token in tokens]
print(porter_tokens)

lmtzr_tokens = [lmtzr.lemmatize(token, pos='v') for token in tokens]
print(lmtzr_tokens)

['waterloo', 'fortun', 'catch', 'train', 'leatherhead', 'hire', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'mile', 'love', 'surrey', 'lane', 'perfect', 'day', 'bright', 'sun', 'fleeci', 'cloud', 'heaven', 'tree', 'waysid', 'hedg', 'throw', 'first', 'green', 'shoot', 'air', 'full', 'pleasant', 'smell', 'moist', 'earth', 'least', 'strang', 'contrast', 'sweet', 'promis', 'spring', 'sinist', 'quest', 'upon', 'engag', 'companion', 'mr', 'alfr', 'sat', 'front', 'trap', 'arm', 'fold', 'hat', 'pull', 'eye', 'chin', 'sunk', 'upon', 'breast', 'buri', 'deepest', 'thought', 'suddenli', 'howev', 'start', 'tap', 'shoulder', 'point', 'meadow', 'train', '09', '30', 'reach', 'station', '08', '30', 'waterloo', 'fortun', 'catch', 'train', 'leatherhead', 'hire', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'mile', 'love', 'surrey', 'lane', 'perfect', 'day', 'bright', 'sun', 'fleeci', 'cloud', 'heaven']
['waterloo', 'fortunate', 'catch', 'train', 'leatherhead', 'hire', 'trap', 'station', 'inn

### Connecting Google Colab with your Google Drive

In [None]:
# Mount Google Drive at /content/drive so the notebook can use Drive folders.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

desired_path = "/content/drive/MyDrive/NLP"

# Create the working directory on the first run and report which case applied.
if not os.path.exists(desired_path):
    # exist_ok=True keeps the call safe if the folder appears between the
    # existence check and the creation.
    os.makedirs(desired_path, exist_ok=True)
    print(f"Directory '{desired_path}' created successfully.")
else:
    print(f"Directory '{desired_path}' already exists.")

# Make it the current working directory (a single chdir is enough; the
# original issued the same call twice).
os.chdir(desired_path)

Directory '/content/drive/MyDrive/NLP' already exists.


## Text Preprocessing

### Natural Language Toolkit

* NLTK is a leading platform for building Python programs to work with human language data.
* NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.
* NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike.
* NLTK is available for Windows, Mac OS X, and Linux.
* Best of all, NLTK is a free, open source, community-driven project. For more details - www.nltk.org

In [None]:
import nltk




```Python
!pip install nltk

nltk.download()
```

#### We will be using a sample text to demonstrate various text pre-processing steps.

In [None]:
# Multi-line sample text used by all the preprocessing steps below.
# The opening sentences are repeated at the end of the text.
string = '''At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.
It was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.
My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.
Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.
The train was @09:30 AM and we have to reach the station by 08:30 AM. At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.
It was a perfect day, with a bright sun and a few fleecy clouds in the heavens.'''

In [None]:
# Display the raw text; the repr shows line breaks as \n.
string

'At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.\nIt was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.\nMy companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.\nSuddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.\nThe train was @09:30 AM and we have to reach the station by 08:30 AM. At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five

In [None]:
# The sample text is a plain Python str.
type(string)

str

In [None]:
# Length of the text in characters (line breaks included).
len(string)

1114

In [None]:
# Number of non-overlapping occurrences of the substring "Waterloo".
string.count("Waterloo")

2

In [None]:
# Naive tokenization: split on whitespace and count the pieces
# (punctuation stays attached to the neighbouring word).
x=string.split()
len(x)

211

#### But what if we want to read the text from a file?

Build the path to the `data` folder from the current working directory (the working directory itself was already set with `os.chdir()` earlier)

In [None]:
# Read the current working directory and build the path of its "data" sub-folder.
# Nothing is changed here: this cell does not call os.chdir().
PATH = os.getcwd()
DATA_PATH = os.path.join(PATH, "data")


List the files that are present in the path

In [None]:
# List the entries of the current working directory.
os.listdir()

[]

Reading from a text file

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Construct the full file path
file_path = os.path.join(current_directory, 'sample_text.txt')
# or file_path = os.path.join('/content/drive/MyDrive/NLP', "data",'sample_text.txt')
# or replace '/content/drive/MyDrive/NLP' with whatever the parent directory is that contains data
print(f"Looking for file at: {file_path}")
# NOTE(review): the file is never opened in this cell; it stops after printing the path.
# Add an explicit open(file_path, 'r') read here if the text should come from the file.


Current working directory: /content/drive/MyDrive/NLP
Looking for file at: /content/drive/MyDrive/NLP/sample_text.txt


'r' stands for read operation.

'w' to write to a file and

'a' to append to an existing file

In [None]:
# Display the in-memory sample text that the following steps work on.
string

'At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.\nIt was a perfect day, with a bright sun and a few fleecy clouds in the heavens. The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth. To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.\nMy companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.\nSuddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.\nThe train was @09:30 AM and we have to reach the station by 08:30 AM. At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five

#### Now that our sample text is ready, let us perform following steps:

1. Sentence Tokenizing
2. Word Tokenizing
3. Stop Word Removal
4. Lemmatization

### Tokenizing

Use the NLTK Downloader to obtain the `punkt_tab` resource

In [None]:
#nltk.download()

In [None]:
import nltk

# Fetch only the 'punkt_tab' resource instead of opening the full downloader.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

#### sent_tokenize

`sent_tokenize` returns a sentence-tokenized copy of the text, using NLTK's recommended sentence tokenizer.

In [None]:
# Step 1: Import the sentence tokenizer (nothing is installed in this cell)


from nltk.tokenize import sent_tokenize

# Split the sample text into a list of sentences.
sent_tokens = sent_tokenize(string)

In [None]:
# Container type and number of sentences, one per line.
print(type(sent_tokens), len(sent_tokens), sep="\n")

<class 'list'>
9


In [None]:
# Display the list of sentences.
sent_tokens

['At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.',
 'It was a perfect day, with a bright sun and a few fleecy clouds in the heavens.',
 'The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth.',
 'To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.',
 'My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.',
 'Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.',
 'The train was @09:30 AM and we have to reach the station by 08:30 AM.',
 'At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn a

In [None]:
# Print each sentence followed by two blank lines so the boundaries stand out.
for sent in sent_tokens:
    print(sent, end="\n\n\n")

At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for four or five miles through the lovely Surrey lanes.


It was a perfect day, with a bright sun and a few fleecy clouds in the heavens.


The trees and wayside hedges were just throwing out their first green shoots, and the air was full of the pleasant smell of the moist earth.


To me at least there was a strange contrast between the sweet promise of the spring and this sinister quest upon which we were engaged.


My companion Mr. Alfred sat in the front of the trap, his arms folded, his hat pulled down over his eyes, and his chin sunk upon his breast, buried in the deepest thought.


Suddenly, however, he started, tapped me on the shoulder, and pointed over the meadows.


The train was @09:30 AM and we have to reach the station by 08:30 AM.


At Waterloo we were fortunate in catching a train for Leatherhead, where we hired a trap at the station inn and drove for fou


#### word_tokenize
    Return a tokenized copy of text, using NLTK's recommended word tokenizer

In [None]:
from nltk.tokenize import word_tokenize

# Split the sample text into word-level tokens; punctuation marks become
# separate tokens (see the ',' and '.' entries in the output).
tokens = word_tokenize(string)
print(tokens)

['At', 'Waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead', ',', 'where', 'we', 'hired', 'a', 'trap', 'at', 'the', 'station', 'inn', 'and', 'drove', 'for', 'four', 'or', 'five', 'miles', 'through', 'the', 'lovely', 'Surrey', 'lanes', '.', 'It', 'was', 'a', 'perfect', 'day', ',', 'with', 'a', 'bright', 'sun', 'and', 'a', 'few', 'fleecy', 'clouds', 'in', 'the', 'heavens', '.', 'The', 'trees', 'and', 'wayside', 'hedges', 'were', 'just', 'throwing', 'out', 'their', 'first', 'green', 'shoots', ',', 'and', 'the', 'air', 'was', 'full', 'of', 'the', 'pleasant', 'smell', 'of', 'the', 'moist', 'earth', '.', 'To', 'me', 'at', 'least', 'there', 'was', 'a', 'strange', 'contrast', 'between', 'the', 'sweet', 'promise', 'of', 'the', 'spring', 'and', 'this', 'sinister', 'quest', 'upon', 'which', 'we', 'were', 'engaged', '.', 'My', 'companion', 'Mr.', 'Alfred', 'sat', 'in', 'the', 'front', 'of', 'the', 'trap', ',', 'his', 'arms', 'folded', ',', 'his', 'hat', 'pull

In [None]:
# Peek at the first eleven tokens.
print(tokens[:11])

['At', 'Waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead']


## Regular-Expression Tokenizers

#### What is Regular Expression?

A RegEx or Regular Expression in a programming language is a special text string used for describing a search pattern. It is extremely useful for extracting information from text such as code, files, log, spreadsheets or even documents.

While using the regular expression the first thing to recognize is that everything is essentially a character, and we are writing patterns to match a specific sequence of characters also referred as string. Ascii or latin letters are those that are on your keyboards and Unicode is used to match the foreign text. It includes digits and punctuation and all special characters like $#@!%, etc.

A RegexpTokenizer splits a string into substrings using a regular expression. For example, the following tokenizer forms tokens out of alphabetic sequences, money expressions, and any other non-whitespace sequences: `'\w+|\$[\d\.]+|\S+'`. For more information or different variations -
http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
http://www.nltk.org/howto/tokenize.html

#### Regular Expressions

Regular expressions can contain both special and ordinary characters.

Most ordinary characters, like 'A', 'a', or '0', are the simplest regular expressions; they simply match themselves. You can concatenate ordinary characters, so last matches the string 'last'.


* `\d` - Matches any decimal digit; this is equivalent to the class [0-9].
* `\D` - Matches any non-digit character; this is equivalent to the class [^0-9].
* `\s` - Matches any whitespace character;
* `\S` - Matches any non-whitespace character;
* `\w` - Matches any alphanumeric character; this is equivalent to the class [a-zA-Z0-9_].
* `\W` - Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_].

The special characters are:

* `'.'` - (Dot.) In the default mode, this matches any character except a newline. If the DOTALL flag has been specified, this matches any character including a newline.
* `'^'` - (Caret.) Matches the start of the string, and in MULTILINE mode also matches immediately after each newline.
* `'$'` - Matches the end of the string or just before the newline at the end of the string, and in MULTILINE mode also matches            before a newline.
* `'*'` - Causes the resulting RE to match 0 or more repetitions of the preceding RE, as many repetitions as are possible. `ab*` will match ‘a’, ‘ab’, or ‘a’ followed by any number of ‘b’s.
* `'+'` - Causes the resulting RE to match 1 or more repetitions of the preceding RE. `ab+` will match ‘a’ followed by any non-zero number of ‘b’s; it will not match just ‘a’.
* `'?'` - Causes the resulting RE to match 0 or 1 repetitions of the preceding RE. ab? will match either ‘a’ or ‘ab’.
* `'\'` - Either escapes special characters (permitting you to match characters like '*', '?', and so forth), or signals a special sequence; special sequences are discussed below.
* `[]` - Used to indicate a set of characters. Characters can be listed individually in a set, e.g. `[amk]` will match 'a', 'm', or 'k'.
* `'|'` - A|B, where A and B can be arbitrary REs, creates a regular expression that will match either A or B. An arbitrary number of REs can be separated by the '|' in this way.


In [None]:
import re

In [None]:
# Small scratch string containing both words and numbers.
x = "this is a 90 thing in 9"


#### Examples using NLTK Regular Expression

In [None]:
# RegexpTokenizer is defined in nltk.tokenize; nltk.corpus only exposes it as a
# side effect of its own imports, so import it from its real home.
from nltk.tokenize import RegexpTokenizer as regextoken

# Raw string: "\w" in a normal literal is an invalid escape sequence that newer
# Python versions warn about. r"\w+" = runs of one or more word characters.
tokenizer = regextoken(r"\w+")

tokens = tokenizer.tokenize(string)
print(tokens)

['At', 'Waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'Leatherhead', 'where', 'we', 'hired', 'a', 'trap', 'at', 'the', 'station', 'inn', 'and', 'drove', 'for', 'four', 'or', 'five', 'miles', 'through', 'the', 'lovely', 'Surrey', 'lanes', 'It', 'was', 'a', 'perfect', 'day', 'with', 'a', 'bright', 'sun', 'and', 'a', 'few', 'fleecy', 'clouds', 'in', 'the', 'heavens', 'The', 'trees', 'and', 'wayside', 'hedges', 'were', 'just', 'throwing', 'out', 'their', 'first', 'green', 'shoots', 'and', 'the', 'air', 'was', 'full', 'of', 'the', 'pleasant', 'smell', 'of', 'the', 'moist', 'earth', 'To', 'me', 'at', 'least', 'there', 'was', 'a', 'strange', 'contrast', 'between', 'the', 'sweet', 'promise', 'of', 'the', 'spring', 'and', 'this', 'sinister', 'quest', 'upon', 'which', 'we', 'were', 'engaged', 'My', 'companion', 'Mr', 'Alfred', 'sat', 'in', 'the', 'front', 'of', 'the', 'trap', 'his', 'arms', 'folded', 'his', 'hat', 'pulled', 'down', 'over', 'his', 'eyes', 'and', 'hi

In [None]:
# Words glued together without whitespace stay a single token under \w+.
x = 'Atwaterloo we werefortunate'
# Raw string avoids the invalid "\w" escape in a normal string literal.
t = regextoken(r'\w+')
t.tokenize(x)


['Atwaterloo', 'we', 'werefortunate']

In [None]:
# Import from nltk.tokenize, where RegexpTokenizer is actually defined.
from nltk.tokenize import RegexpTokenizer as regextoken

# Without the '+', every single word character becomes its own token.
# Raw string avoids the invalid "\w" escape in a normal string literal.
t = regextoken(r'\w')
to = t.tokenize(string)
print(to)

['A', 't', 'W', 'a', 't', 'e', 'r', 'l', 'o', 'o', 'w', 'e', 'w', 'e', 'r', 'e', 'f', 'o', 'r', 't', 'u', 'n', 'a', 't', 'e', 'i', 'n', 'c', 'a', 't', 'c', 'h', 'i', 'n', 'g', 'a', 't', 'r', 'a', 'i', 'n', 'f', 'o', 'r', 'L', 'e', 'a', 't', 'h', 'e', 'r', 'h', 'e', 'a', 'd', 'w', 'h', 'e', 'r', 'e', 'w', 'e', 'h', 'i', 'r', 'e', 'd', 'a', 't', 'r', 'a', 'p', 'a', 't', 't', 'h', 'e', 's', 't', 'a', 't', 'i', 'o', 'n', 'i', 'n', 'n', 'a', 'n', 'd', 'd', 'r', 'o', 'v', 'e', 'f', 'o', 'r', 'f', 'o', 'u', 'r', 'o', 'r', 'f', 'i', 'v', 'e', 'm', 'i', 'l', 'e', 's', 't', 'h', 'r', 'o', 'u', 'g', 'h', 't', 'h', 'e', 'l', 'o', 'v', 'e', 'l', 'y', 'S', 'u', 'r', 'r', 'e', 'y', 'l', 'a', 'n', 'e', 's', 'I', 't', 'w', 'a', 's', 'a', 'p', 'e', 'r', 'f', 'e', 'c', 't', 'd', 'a', 'y', 'w', 'i', 't', 'h', 'a', 'b', 'r', 'i', 'g', 'h', 't', 's', 'u', 'n', 'a', 'n', 'd', 'a', 'f', 'e', 'w', 'f', 'l', 'e', 'e', 'c', 'y', 'c', 'l', 'o', 'u', 'd', 's', 'i', 'n', 't', 'h', 'e', 'h', 'e', 'a', 'v', 'e', 'n',

In [None]:
# Test sentence with capitalised words of different lengths ('This', 'A', 'To').
st = "This is A test example To understand cap tokenizer"

In [None]:
# An uppercase letter followed by at least one more word character,
# so a lone 'A' is not matched. Raw string avoids the invalid "\w" escape.
capword_tokenizer = regextoken(r'[A-Z]\w+')

tokens_cap = capword_tokenizer.tokenize(st)
print(tokens_cap)

['This', 'To']


In [None]:
# An uppercase letter followed by zero or more word characters,
# so the single-letter 'A' is matched too. Raw string avoids the invalid "\w" escape.
capword_tokenizer = regextoken(r'[A-Z]\w*')

tokens_cap = capword_tokenizer.tokenize(st)
print(tokens_cap)

['This', 'A', 'To']


In [None]:
# Run the capitalised-word tokenizer from the previous cell on the full sample text.
tokens_cap = capword_tokenizer.tokenize(string)
print(tokens_cap)

['At', 'Waterloo', 'Leatherhead', 'Surrey', 'It', 'The', 'To', 'My', 'Mr', 'Alfred', 'Suddenly', 'The', 'AM', 'AM', 'At', 'Waterloo', 'Leatherhead', 'Surrey', 'It']


In [None]:
# Words that contain "ee" anywhere: optional word characters on both sides.
# Raw string avoids the invalid "\w" escape in a normal string literal.
text_tokenizer = regextoken(r'\w*ee\w*')

tokens_text = text_tokenizer.tokenize(string)
print(tokens_text)

['fleecy', 'trees', 'green', 'between', 'sweet', 'deepest', 'fleecy']


In [None]:
# '\w+' before "ee" requires at least one leading word character,
# so a word that starts with "ee" (like 'eehh') is not matched.
# Raw string avoids the invalid "\w" escape in a normal string literal.
text_tokenizer = regextoken(r'\w+ee\w*')
st = 'teen and tween twee eehh'
text_tokenizer.tokenize(st)

['teen', 'tween', 'twee']

In [None]:
# Runs of digits: one ASCII digit followed by any further digits.
# Raw string avoids the invalid "\d" escape in a normal string literal.
numeric_tokenizer = regextoken(r'[0-9]\d*')

tokens_numeric = numeric_tokenizer.tokenize(string)
print(tokens_numeric)

['09', '30', '08', '30']


#### Regular Expressions using "re"

"re" module included with Python is primarily used for string searching and manipulation. It is quite useful for text extraction and pre-processing. The most common use for "__re__" is to search for patterns in text.

In [None]:
import re

# Second sample text (names, places and years) used for the `re` examples below.
text = '''International School of Engineering (INSOFE) is an Applied Engineering school with area of focus in Data Science. It is located in Hyderabad, Bengaluru and Mumbai. It opened in 2011.
The program is delivered through classroom only sessions and is suitable for students and working professionals. Dr. Dakshinamurthy V Kolluru, Dr. Sridhar Pappu and A S L Ganapathi Kumar started the institution in Hyderabad in mid-2011 and expanded to Bengaluru in early-2016. Initially the school functioned under mentorship of Dr. Dakshinamurthy, Dr. Sridhar and Dr. Sreerama Murthy. They are now supported by a team of additional mentors and in-house data scientists.
In 2012, INSOFE also started Corporate training services. It extended operations to Bengaluru in 2016. CIO.com listed INSOFE 3rd in their list of "16 Big Data Certifications That Will Pay Off" consecutively from 2013-2016. Silicon India Magazine listed INSOFE in their list of "Top 5 Big Data Training Institutes 2016". Analytics India Magazine, listed INSOFE in "Top 9 Analytics Training Institutes in India in 2016". KDnuggets mentioned INSOFE in their list of Certificates in Analytics, Data Mining, and Data Science in 2014.
'''

In [None]:
# Tokenize the data
tokens1 = word_tokenize(text)
print(tokens1)

['International', 'School', 'of', 'Engineering', '(', 'INSOFE', ')', 'is', 'an', 'Applied', 'Engineering', 'school', 'with', 'area', 'of', 'focus', 'in', 'Data', 'Science', '.', 'It', 'is', 'located', 'in', 'Hyderabad', ',', 'Bengaluru', 'and', 'Mumbai', '.', 'It', 'opened', 'in', '2011', '.', 'The', 'program', 'is', 'delivered', 'through', 'classroom', 'only', 'sessions', 'and', 'is', 'suitable', 'for', 'students', 'and', 'working', 'professionals', '.', 'Dr.', 'Dakshinamurthy', 'V', 'Kolluru', ',', 'Dr.', 'Sridhar', 'Pappu', 'and', 'A', 'S', 'L', 'Ganapathi', 'Kumar', 'started', 'the', 'institution', 'in', 'Hyderabad', 'in', 'mid-2011', 'and', 'expanded', 'to', 'Bengaluru', 'in', 'early-2016', '.', 'Initially', 'the', 'school', 'functioned', 'under', 'mentorship', 'of', 'Dr.', 'Dakshinamurthy', ',', 'Dr.', 'Sridhar', 'and', 'Dr.', 'Sreerama', 'Murthy', '.', 'They', 'are', 'now', 'supported', 'by', 'a', 'team', 'of', 'additional', 'mentors', 'and', 'in-house', 'data', 'scientists', '.

Match pattern starting with I

In [None]:
# Keep the tokens whose first character is 'I'.
list(filter(re.compile('^I').search, tokens1))

['International',
 'INSOFE',
 'It',
 'It',
 'Initially',
 'In',
 'INSOFE',
 'It',
 'INSOFE',
 'India',
 'INSOFE',
 'Institutes',
 'India',
 'INSOFE',
 'Institutes',
 'India',
 'INSOFE']

Get all the tokens ending with either ing or uru string
ing$|uru$

In [None]:
# Keep the tokens that end in "ing" or in "uru".
[token for token in tokens1 if re.search('ing$|uru$', token) is not None]

['Engineering',
 'Engineering',
 'Bengaluru',
 'working',
 'Kolluru',
 'Bengaluru',
 'training',
 'Bengaluru',
 'Training',
 'Training',
 'Mining']

Get all the words that have H, B or M as their first letter

In [None]:
# Inside [...] the '|' is a literal character, not "or": '^[H|B|M]' would also
# accept tokens that start with '|'. Listing the letters is all that is needed.
[w for w in tokens1 if re.search('^[HBM]', w)]

['Hyderabad',
 'Bengaluru',
 'Mumbai',
 'Hyderabad',
 'Bengaluru',
 'Murthy',
 'Bengaluru',
 'Big',
 'Magazine',
 'Big',
 'Magazine',
 'Mining']

In [None]:
# Inside [...] the '|' is a literal character, not "or": '^[H|B|M]' would also
# accept tokens that start with '|'. Listing the letters is all that is needed.
[w for w in tokens1 if re.search('^[HBM]', w)]

['Hyderabad',
 'Bengaluru',
 'Mumbai',
 'Hyderabad',
 'Bengaluru',
 'Murthy',
 'Bengaluru',
 'Big',
 'Magazine',
 'Big',
 'Magazine',
 'Mining']

Search for words - Hyderabad, Bengaluru and Mumbai

In [None]:
# Group the alternatives so that '^' anchors all three prefixes. In '^Hyd|Ben|Mum'
# only 'Hyd' is anchored; 'Ben' and 'Mum' would match anywhere inside a token.
[w for w in tokens1 if re.search('^(?:Hyd|Ben|Mum)', w)]

['Hyderabad', 'Bengaluru', 'Mumbai', 'Hyderabad', 'Bengaluru', 'Bengaluru']

Search for the words 'Data', 'Analytics' or 'Science'


In [None]:
# Keep the tokens that contain "Data", "Ana" or "Sci" anywhere.
list(filter(lambda token: re.search('Data|Ana|Sci', token), tokens1))

['Data',
 'Science',
 'Data',
 'Data',
 'Analytics',
 'Analytics',
 'Analytics',
 'Data',
 'Data',
 'Science']

Get all the words that end with es

In [None]:
# Keep the tokens that end in "es".
[word for word in tokens1 if re.search('es$', word) is not None]

['services', 'Institutes', 'Institutes', 'Certificates']

Extract pattern with numbers

In [None]:
# Keep the tokens that contain at least one digit.
list(filter(re.compile('[0-9]').search, tokens1))

['2011',
 'mid-2011',
 'early-2016',
 '2012',
 '2016',
 '3rd',
 '16',
 '2013-2016',
 '5',
 '2016',
 '9',
 '2016',
 '2014']

In [None]:
# The match is case-sensitive: only the capitalised 'This' / 'That' start with 'T'.
x = 'This is this and That is that'
tokens2 = word_tokenize(x)
list(filter(re.compile('^T').search, tokens2))

['This', 'That']

In [None]:
# Upper-case every token.
list(map(str.upper, tokens2))

['THIS', 'IS', 'THIS', 'AND', 'THAT', 'IS', 'THAT']

# Lower case

In [None]:
# Lower-case every token so that e.g. 'The' and 'the' count as the same word.
tokens = list(map(str.lower, tokens))
print(tokens)

['at', 'waterloo', 'we', 'were', 'fortunate', 'in', 'catching', 'a', 'train', 'for', 'leatherhead', 'where', 'we', 'hired', 'a', 'trap', 'at', 'the', 'station', 'inn', 'and', 'drove', 'for', 'four', 'or', 'five', 'miles', 'through', 'the', 'lovely', 'surrey', 'lanes', 'it', 'was', 'a', 'perfect', 'day', 'with', 'a', 'bright', 'sun', 'and', 'a', 'few', 'fleecy', 'clouds', 'in', 'the', 'heavens', 'the', 'trees', 'and', 'wayside', 'hedges', 'were', 'just', 'throwing', 'out', 'their', 'first', 'green', 'shoots', 'and', 'the', 'air', 'was', 'full', 'of', 'the', 'pleasant', 'smell', 'of', 'the', 'moist', 'earth', 'to', 'me', 'at', 'least', 'there', 'was', 'a', 'strange', 'contrast', 'between', 'the', 'sweet', 'promise', 'of', 'the', 'spring', 'and', 'this', 'sinister', 'quest', 'upon', 'which', 'we', 'were', 'engaged', 'my', 'companion', 'mr', 'alfred', 'sat', 'in', 'the', 'front', 'of', 'the', 'trap', 'his', 'arms', 'folded', 'his', 'hat', 'pulled', 'down', 'over', 'his', 'eyes', 'and', 'hi

# Stopwords

A stop word is a commonly used word (such as "a", "an", "it", "in", "the") that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

We would not want these words taking up space in our database, or taking up valuable processing time. For this, we can remove them easily, by storing a list of words that you consider to be stop words. NLTK (Natural Language Toolkit) in python has a list of stopwords stored in 16 different languages. You can find them in the nltk_data directory. To check the list of stopwords you can type the following commands in the python shell.

Note: You can even modify the list by adding words of your choice in the english .txt. file in the stopwords directory.

In [None]:
from nltk.corpus import stopwords

In [None]:
# Fetch NLTK's stop-word lists.
nltk.download('stopwords')

# Show the English stop-word list.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# NOTE(review): this repeats the print at the end of the previous cell.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Stopword removal

In [None]:
stop = stopwords.words('english')
# Testing membership against a list rescans it for every token; a set makes each
# lookup constant-time. `stop` itself stays a list in case other cells rely on that.
stop_set = set(stop)

# Drop every token that is an English stop word.
tokens = [token for token in tokens if token not in stop_set]
print(tokens)

['waterloo', 'fortunate', 'catching', 'train', 'leatherhead', 'hired', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'miles', 'lovely', 'surrey', 'lanes', 'perfect', 'day', 'bright', 'sun', 'fleecy', 'clouds', 'heavens', 'trees', 'wayside', 'hedges', 'throwing', 'first', 'green', 'shoots', 'air', 'full', 'pleasant', 'smell', 'moist', 'earth', 'least', 'strange', 'contrast', 'sweet', 'promise', 'spring', 'sinister', 'quest', 'upon', 'engaged', 'companion', 'mr', 'alfred', 'sat', 'front', 'trap', 'arms', 'folded', 'hat', 'pulled', 'eyes', 'chin', 'sunk', 'upon', 'breast', 'buried', 'deepest', 'thought', 'suddenly', 'however', 'started', 'tapped', 'shoulder', 'pointed', 'meadows', 'train', '09', '30', 'reach', 'station', '08', '30', 'waterloo', 'fortunate', 'catching', 'train', 'leatherhead', 'hired', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'miles', 'lovely', 'surrey', 'lanes', 'perfect', 'day', 'bright', 'sun', 'fleecy', 'clouds', 'heavens']


In [None]:
# Words: "lie", "lying"
# stemming : "ly"
# lemmatization : "lie"

# Stemmers and Lemmatizers

WordNet® is a large lexical database of English. Nouns, verbs, adjectives and adverbs are grouped into sets of cognitive synonyms (synsets), each expressing a distinct concept. WordNet’s structure makes it a useful tool for computational linguistics and natural language processing.

WordNet superficially resembles a thesaurus, in that it groups words together based on their meanings. However, there are some important distinctions. First, WordNet interlinks not just word forms—strings of letters—but specific senses of words. As a result, words that are found in close proximity to one another in the network are semantically disambiguated. Second, WordNet labels the semantic relations among words, whereas the groupings of words in a thesaurus do not follow any explicit pattern other than meaning similarity.

#### Stemmers vs. Lemmatizers

* Both stemmers and lemmatizers try to bring inflected words to the same form.
* Stemmers use an algorithmic approach of removing prefixes and suffixes. The result might not be an actual dictionary word.
* Lemmatizers use a corpus. The result is always a dictionary word.
* Lemmatizers need extra info about the part of speech they are processing.
* Stemmers are faster than lemmatizers

When to use stemmers and when to use lemmatizers? A few guidelines:
* If speed is important, use stemmers (lemmatizers have to search through a corpus while stemmers do simple operations on a string)
* If you just want to make sure that the system you are building is tolerant to inflections, use stemmers (If you query for “best bar in New York”, you’d accept an article on “Best bars in New York 2016″)
* If you need the actual dictionary word, use a lemmatizer. (for example, if you are building a natural language generation system)

How do stemmers work?

* Stemmers are extremely simple to use and very fast. They usually are the preferred choice. They work by applying different transformation rules on the word until no other transformation can be applied.


### Stemmers

**There are two Stemmer algorithms that can be used for stemming - Porter and Snowball**

* Porter: It is the most commonly used stemmer. It is one of the few stemmers that actually have Java support and it is also the most computationally intensive and the oldest algorithm by a large margin.

* Snowball: This is an improvement over porter. It is slightly faster in computation time than porter, with a reasonably large community around it.

In [None]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer set up for English.
snow = SnowballStemmer('english')

# Languages the Snowball stemmer is available for.
print(snow.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [None]:
# Snowball stems of a few sample words, one per line.
print(*(snow.stem(word) for word in ('getting', 'rabbits', 'xyzing', 'agreed', 'slowly')), sep="\n")

get
rabbit
xyze
agre
slowli


In [None]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

# Porter stems of the same sample words used for the Snowball stemmer, one per line.
print("\n".join(porter.stem(word) for word in ('getting', 'rabbits', 'xyzing', 'agreed', 'slowly')))

get
rabbit
xyze
agre
slowli


### Lemmatizers

One major difference between stemming and lemmatization is that lemmatize takes a part-of-speech parameter, "pos". If not supplied, the default is "noun".

Obtain the resources using nltk.download('wordnet')

In [None]:
# Fetch the WordNet data before using the WordNet lemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import nltk
# Also fetch 'omw-1.4'.
# NOTE(review): presumably needed alongside WordNet on some NLTK versions; confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer

# WordNet-backed lemmatizer; also reused by a later comparison cell.
lmtzr = WordNetLemmatizer()

# Several inflected forms of one verb, printed before lemmatization.
tokens_new = ["going", "gone", "go", "goes", "went"]
print(tokens_new)

# Lemmatize with pos='v' (the default part of speech is noun) and replace
# the list with the lemmatized results.
tokens_new = list(map(lambda form: lmtzr.lemmatize(form, pos='v'), tokens_new))
print(tokens_new)

['going', 'gone', 'go', 'goes', 'went']
['go', 'go', 'go', 'go', 'go']


Stemmer and lemmatizer

In [None]:
# Build the stemmer/lemmatizer pair used in the side-by-side comparison below.
# PorterStemmer and WordNetLemmatizer are imported in earlier cells.
stemmer, lemtzr = PorterStemmer(), WordNetLemmatizer()

In [None]:
# Sample words passed to both the stemmer and the lemmatizer in the next cell.
plurals = [
    'Indian',
    'caresses',
    'flies',
    'dies',
    'education',
    'denied',
    'computer',
    'computing',
    'xyzing',
    'done',
    'slept',
]

In [None]:
# Porter-stem every sample word.
singles = list(map(stemmer.stem, plurals))
print(singles)

# Lemmatize the same words as verbs (pos='v') for a side-by-side comparison.
tokensLmtz = [lemtzr.lemmatize(word, pos='v') for word in plurals]
print(tokensLmtz)

['indian', 'caress', 'fli', 'die', 'educ', 'deni', 'comput', 'comput', 'xyze', 'done', 'slept']
['Indian', 'caress', 'fly', 'die', 'education', 'deny', 'computer', 'compute', 'xyzing', 'do', 'sleep']


In [None]:
# Apply the Porter stemmer to `tokens` (built by an earlier cell).
porter_tokens = list(map(stemmer.stem, tokens))
print(porter_tokens)

# Lemmatize the same tokens as verbs (pos='v') to compare against the stems.
lmtzr_tokens = [lmtzr.lemmatize(word, pos='v') for word in tokens]
print(lmtzr_tokens)

['waterloo', 'fortun', 'catch', 'train', 'leatherhead', 'hire', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'mile', 'love', 'surrey', 'lane', 'perfect', 'day', 'bright', 'sun', 'fleeci', 'cloud', 'heaven', 'tree', 'waysid', 'hedg', 'throw', 'first', 'green', 'shoot', 'air', 'full', 'pleasant', 'smell', 'moist', 'earth', 'least', 'strang', 'contrast', 'sweet', 'promis', 'spring', 'sinist', 'quest', 'upon', 'engag', 'companion', 'mr', 'alfr', 'sat', 'front', 'trap', 'arm', 'fold', 'hat', 'pull', 'eye', 'chin', 'sunk', 'upon', 'breast', 'buri', 'deepest', 'thought', 'suddenli', 'howev', 'start', 'tap', 'shoulder', 'point', 'meadow', 'train', '09', '30', 'reach', 'station', '08', '30', 'waterloo', 'fortun', 'catch', 'train', 'leatherhead', 'hire', 'trap', 'station', 'inn', 'drove', 'four', 'five', 'mile', 'love', 'surrey', 'lane', 'perfect', 'day', 'bright', 'sun', 'fleeci', 'cloud', 'heaven']
['waterloo', 'fortunate', 'catch', 'train', 'leatherhead', 'hire', 'trap', 'station', 'inn