## Imports

In [1]:
import re
import string

import nltk
from nltk import FreqDist
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer

## Part 1 : Regular Expressions

In [2]:
text = 'I am 20 years old. My previous license plate number was 4XUI302 and my new one is 3ABC278. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.com'

### Find email

In [3]:
email = re.findall(r"\S+@\S+",text)
print(email)

['myemail123+spam@google.com']


### Find License Number

In [4]:
license = re.findall(r"\d{1}[A-Z]{3}\d{3}",text)
print(license)

['4XUI302', '3ABC278']


### Find ID

In [5]:
id = re.findall(r"[A-Z]?\d{6}",text)
print(id)

['J987492']


### Find address

In [6]:
address = re.split("[.]","".join(re.findall(r"(?<=address is )\w+.*",text)))[0]
print(address)

123 Main street, San Jose, CA


### Replace License plate with LP_NUM

In [7]:
new_text = re.sub(r"\d{1}[A-Z]{3}\d{3}", "LP_NUM", text)
new_text

'I am 20 years old. My previous license plate number was LP_NUM and my new one is LP_NUM. My ID is J987492 and my address is 123 Main street, San Jose, CA. Please email me at myemail123+spam@google.com'

## Part 2: NLTK, remove punctuation and stop words

### Download movie_reviews, stopwords, wordnet

In [8]:
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Original count of unique words from movie_reviews dataset and sample

In [9]:
len(nltk.FreqDist(movie_reviews.words()))

39768

In [10]:
words_all = [word.lower() for word in movie_reviews.words()]
words_all[:20]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

### Removing Punctuations and "--" not detected in string.punctuation

In [11]:
words_without_punc = [word.lower() for word in movie_reviews.words() if word not in string.punctuation]
words_without_punc = [word for word in words_without_punc if word != "--"]
words_without_punc[:20]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 'drink',
 'and',
 'then',
 'drive',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of']

In [12]:
len(nltk.FreqDist(words_without_punc))

39736

### 20 Most Common unique words after removing punctuations

In [13]:
FreqDist(words_without_punc).most_common(20)

[('the', 76529),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('it', 16107),
 ('that', 15924),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961),
 ('his', 9587),
 ('this', 9578),
 ('film', 9517),
 ('i', 8889),
 ('he', 8864),
 ('but', 8634),
 ('on', 7385)]

### Removing stopwords

In [14]:
stopwords_eng = stopwords.words("english")
words_without_punc_stopwords = [word for word in words_without_punc if word not in stopwords_eng]
words_without_punc_stopwords[:20]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal']

In [15]:
len(nltk.FreqDist(words_without_punc_stopwords))

39585

### 20 Most Common unique words after removing stopwords

In [16]:
FreqDist(words_without_punc_stopwords).most_common(20)

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906),
 ('characters', 1859),
 ('first', 1836),
 ('see', 1749),
 ('way', 1693),
 ('make', 1642)]

### 10 Least used unique words using hapaxes()

In [17]:
least_used_words = FreqDist(words_without_punc_stopwords).hapaxes()[:10]
print(least_used_words)

['looooot', 'schnazzy', 'timex', 'indiglo', 'jessalyn', 'gilsig', 'ruber', 'jaleel', 'balki', 'wavers']


## Part 3: Stemming and Lemmatizing dataset

In [18]:
### trying two different types of stemming techinques
ps = PorterStemmer()
ls = LancasterStemmer()
lm = WordNetLemmatizer() 

### Using PorterStemmer on words without punctuations and stopwords and calculating count of unique words after Stemming

In [19]:
ps_stem_words=[ps.stem(word) for word in words_without_punc_stopwords ]
ps_stem_words[:20]

['plot',
 'two',
 'teen',
 'coupl',
 'go',
 'church',
 'parti',
 'drink',
 'drive',
 'get',
 'accid',
 'one',
 'guy',
 'die',
 'girlfriend',
 'continu',
 'see',
 'life',
 'nightmar',
 'deal']

In [20]:
print(len(nltk.FreqDist(ps_stem_words)))

26100


### Using LancasterStemmer on words without punctuations and stopwords and calculating count of unique words after Stemming

In [21]:
ls_stemwords=[ls.stem(word) for word in words_without_punc_stopwords ]
ls_stemwords[:20]

['plot',
 'two',
 'teen',
 'coupl',
 'go',
 'church',
 'party',
 'drink',
 'driv',
 'get',
 'accid',
 'on',
 'guy',
 'die',
 'girlfriend',
 'continu',
 'see',
 'lif',
 'nightm',
 'deal']

In [22]:
print(len(nltk.FreqDist(ls_stemwords))) 

21466


### Using WordNetLemmatizer on words without punctuations and stopwords and calculating count of unique words after Stemming

In [23]:
word_lem=[lm.lemmatize(word) for word in words_without_punc_stopwords ]
word_lem[:20]

['plot',
 'two',
 'teen',
 'couple',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guy',
 'dy',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmare',
 'deal']

In [24]:
print(len(nltk.FreqDist(word_lem)))

35171
