### Regular Expressions

In [1]:
import re

In [21]:
pattern = r"[+-]?(?P<integer>\d+)\.?(?P<decimal>\d*)"

In [22]:
text = "This statement consists of 10992.435 two floting -9854.34 point numbers"

In [6]:
m = re.search(pattern, text)

In [5]:
text[27:36]

'10992.435'

In [7]:
m.group()

'10992.435'

In [8]:
m.span()

(27, 36)

In [9]:
re.findall(pattern, text)

['10992.435', '-9854.34']

In [11]:
m = re.finditer(pattern, text)

In [12]:
m

<callable_iterator at 0x1cfa8a91518>

In [13]:
for o in m:
    print(o.span(), ' ---> ', o.group())

(27, 36)  --->  10992.435
(49, 57)  --->  -9854.34


In [23]:
m = re.search(pattern, text)

In [24]:
m.group()


'10992.435'

In [25]:
m.group(1)

'10992'

In [26]:
m.group(2)

'435'

In [27]:
m.groups()

('10992', '435')

In [28]:
m.groupdict()

{'integer': '10992', 'decimal': '435'}

### Tokenization

In [31]:
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.util import ngrams

In [32]:
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers)\
from the store. Should I pick up some black-eyed peas as well?"

In [33]:
print(word_tokenize(my_text))

['Hi', 'Mr.', 'Smith', '!', 'I', '’', 'm', 'going', 'to', 'buy', 'some', 'vegetables', '(', 'tomatoes', 'and', 'cucumbers', ')', 'from', 'the', 'store', '.', 'Should', 'I', 'pick', 'up', 'some', 'black-eyed', 'peas', 'as', 'well', '?']


In [34]:
print(sent_tokenize(my_text))

['Hi Mr. Smith!', 'I’m going to buy some vegetables (tomatoes and cucumbers)from the store.', 'Should I pick up some black-eyed peas as well?']


In [36]:
words = word_tokenize(my_text)
twograms = list(ngrams(words, 2))
print(twograms)

[('Hi', 'Mr.'), ('Mr.', 'Smith'), ('Smith', '!'), ('!', 'I'), ('I', '’'), ('’', 'm'), ('m', 'going'), ('going', 'to'), ('to', 'buy'), ('buy', 'some'), ('some', 'vegetables'), ('vegetables', '('), ('(', 'tomatoes'), ('tomatoes', 'and'), ('and', 'cucumbers'), ('cucumbers', ')'), (')', 'from'), ('from', 'the'), ('the', 'store'), ('store', '.'), ('.', 'Should'), ('Should', 'I'), ('I', 'pick'), ('pick', 'up'), ('up', 'some'), ('some', 'black-eyed'), ('black-eyed', 'peas'), ('peas', 'as'), ('as', 'well'), ('well', '?')]


In [37]:
r_token = RegexpTokenizer(r"[A-Z]['\w]+")
print(r_token.tokenize(my_text))

['Hi', 'Mr', 'Smith', 'Should']


### Removing Characters

In [38]:
import re
import string

In [39]:
my_text

'Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers)from the store. Should I pick up some black-eyed peas as well?'

In [41]:
clean_text = re.sub('[%s]'%re.escape(string.punctuation), ' ', my_text)
clean_text

'Hi Mr  Smith  I’m going to buy some vegetables  tomatoes and cucumbers from the store  Should I pick up some black eyed peas as well '

In [43]:
clean_text = clean_text.lower()
clean_text

'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers from the store  should i pick up some black eyed peas as well '

In [44]:
# Removing numbers ' 45g', '  4  ', 'r56c'
clean_text = re.sub('\w*\d+\w*', ' ', clean_text)
clean_text

'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers from the store  should i pick up some black eyed peas as well '

In [46]:
print(word_tokenize(clean_text))

['hi', 'mr', 'smith', 'i', '’', 'm', 'going', 'to', 'buy', 'some', 'vegetables', 'tomatoes', 'and', 'cucumbers', 'from', 'the', 'store', 'should', 'i', 'pick', 'up', 'some', 'black', 'eyed', 'peas', 'as', 'well']


### Stop Words

In [48]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
my_text = ["Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers)\
from the store. Should I pick up some black-eyed peas as well?"]
# Incorporate stop words when creating the count vectorizer
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(my_text)
pd.DataFrame(X.toarray(), columns=cv.get_feature_names())



Unnamed: 0,black,buy,cucumbers,eyed,going,hi,mr,peas,pick,smith,store,tomatoes,vegetables
0,1,1,1,1,1,1,1,1,1,1,1,1,1


### Stemming and Lemmatizing

In [52]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [54]:
stemmer.stem('drive')

'driv'

In [55]:
print('drive : {}'.format(stemmer.stem('drive')))
print('drive : {}'.format(stemmer.stem('drives')))
print('drive : {}'.format(stemmer.stem('driver')))
print('drive : {}'.format(stemmer.stem('driving')))
print('drive : {}'.format(stemmer.stem('driven')))

drive : driv
drive : driv
drive : driv
drive : driv
drive : driv


### Parts Of Speech Tagging

In [57]:
from nltk.tag import pos_tag
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers)\
from the store. Should I pick up some black-eyed peas as well?"
tokens = pos_tag(word_tokenize(my_text))
print(tokens)

[('Hi', 'NNP'), ('Mr.', 'NNP'), ('Smith', 'NNP'), ('!', '.'), ('I', 'PRP'), ('’', 'VBP'), ('m', 'RB'), ('going', 'VBG'), ('to', 'TO'), ('buy', 'VB'), ('some', 'DT'), ('vegetables', 'NNS'), ('(', '('), ('tomatoes', 'NNS'), ('and', 'CC'), ('cucumbers', 'NNS'), (')', ')'), ('from', 'IN'), ('the', 'DT'), ('store', 'NN'), ('.', '.'), ('Should', 'MD'), ('I', 'PRP'), ('pick', 'VB'), ('up', 'RP'), ('some', 'DT'), ('black-eyed', 'JJ'), ('peas', 'NNS'), ('as', 'IN'), ('well', 'RB'), ('?', '.')]


In [58]:
from nltk.help import upenn_tagset
upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

### Named Entity Recognition

In [67]:
from nltk.chunk import ne_chunk
my_text = "My friend Ramesh lives in England!"
tokens = pos_tag(word_tokenize(my_text))


In [68]:
entities = ne_chunk(tokens)
entities.draw()

### Compound Term Extraction

In [69]:
from nltk.tokenize import MWETokenizer
my_text = "all of you are the best students of all time"
mwe_tokenizer = MWETokenizer([('all', 'of', 'you'), ('of', 'all', 'time')])
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
mwe_tokens

['all_of_you', 'are', 'the', 'best', 'students', 'of_all_time']

In [71]:
entities = ne_chunk(word_tokenize(my_text))
entities.draw()

In [72]:
entities = ne_chunk(mwe_tokens)
entities.draw()

### Levenstein Distance

In [77]:
import nltk

src = "Data Science Learner"
tar = "Data science learners"

dist = nltk.edit_distance(src, tar)
print(dist)

3
