### Stemming is a technique to remove suffiixes from a word, ending up with the stem. 
most common stemming algorithms is the Porter stemming algorithm by Martin
Porter. 

In [1]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print('cooking stem= ', stemmer.stem('cooking'))
print('cookery stem= ', stemmer.stem('cookery'))

cooking stem=  cook
cookery stem=  cookeri


<img src="stemmerI.png" />

*** LancasterStemmer class

In [2]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print('cooking stem= ', stemmer.stem('cooking'))
print('cookery stem= ', stemmer.stem('cookery'))

cooking stem=  cook
cookery stem=  cookery


*** RegexpStemmer clas

In [3]:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
print('regex cooking stem= ', stemmer.stem('cooking'))
print('regex cookery stem= ', stemmer.stem('cookery'))

regex cooking stem=  cook
regex cookery stem=  cookery


*** The SnowballStemmer class

In [4]:
from nltk.stem import SnowballStemmer
SnowballStemmer.languages('spanish')

TypeError: 'tuple' object is not callable

In [7]:
spanish_stemmer = SnowballStemmer('spanish')
spanish_stemmer.stem('hola')

TypeError: 'tuple' object is not callable

### Lemmatization is very similar to stemming, but is more akin to synonym replacement.
always left with a valid word that means the same thing. However, the word you end up with can
be completely different.

In [5]:
'''This is because the default
POS is a noun, and as a noun, cooking is its own lemma. On the other hand, cookbooks
is a noun with its singular form, cookbook, as its lemma.
'''
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print('''this is lemmatizing "cooking" without POS; \n''',lemmatizer.lemmatize('cooking'))
print('''this is lemmatizing "cooking" with POS=v; \n''',lemmatizer.lemmatize('cooking',
                                                                              pos='v'))

this is lemmatizing "cooking" without POS; 
 cooking
this is lemmatizing "cooking" with POS=v; 
 cook


### Combining stemming with lemmatization  
 stemming saves one character, lemmatization saves two characters, and
stemming the lemma saves a total of three characters out of five characters. That is nearly
a 60% compression rate!

###  word replacement can be thought of as error correction or text normalization.

This recipe aims to fix this by replacing contractions with their expanded forms, for example, by ***replacing "can't" with "cannot" or "would've" with "would have"**

The key things to know are matching patterns and the re.sub() function
1.  ***define a number of replacement patterns*** This will be a list of tuple
pairs, where the first element is the pattern to match with and the second element is
the replacement.
2.  ***create a RegexpReplacer class*** that will compile the patterns and provide
a replace() method to substitute all the found patterns with their replacements.


In [6]:
#1
import re
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would')
]

In [3]:
%pwd

'/Users/alessandropiccolo/Google Drive/Python/1JupyterNotebook/NLTK'

In [7]:
%%writefile replacers.py
import re , csv , enchant #, yaml 
from nltk.corpus import wordnet
from nltk.metrics import edit_distance

##################################################
## Replacing Words Matching Regular Expressions ##
##################################################

replacement_patterns = [
	(r'won\'t', 'will not'),
	(r'can\'t', 'cannot'),
	(r'i\'m', 'i am'),
	(r'ain\'t', 'is not'),
	(r'(\w+)\'ll', '\g<1> will'),
	(r'(\w+)n\'t', '\g<1> not'),
	(r'(\w+)\'ve', '\g<1> have'),
	(r'(\w+)\'s', '\g<1> is'),
	(r'(\w+)\'re', '\g<1> are'),
	(r'(\w+)\'d', '\g<1> would'),
]

class RegexpReplacer(object):
	""" Replaces regular expression in a text.
	>>> replacer = RegexpReplacer()
	>>> replacer.replace("can't is a contraction")
	'cannot is a contraction'
	>>> replacer.replace("I should've done that thing I didn't do")
	'I should have done that thing I did not do'
	"""
	def __init__(self, patterns=replacement_patterns):
		self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
	
	def replace(self, text):
		s = text
		
		for (pattern, repl) in self.patterns:
			s = re.sub(pattern, repl, s)
		
		return s

####################################
## Replacing Repeating Characters ##
####################################

class RepeatReplacer(object):
	""" Removes repeating characters until a valid word is found.
	>>> replacer = RepeatReplacer()
	>>> replacer.replace('looooove')
	'love'
	>>> replacer.replace('oooooh')
	'ooh'
	>>> replacer.replace('goose')
	'goose'
	"""
	def __init__(self):
		self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
		self.repl = r'\1\2\3'

	def replace(self, word):
		if wordnet.synsets(word):
			return word
		
		repl_word = self.repeat_regexp.sub(self.repl, word)
		
		if repl_word != word:
			return self.replace(repl_word)
		else:
			return repl_word

######################################
## Spelling Correction with Enchant ##
######################################

class SpellingReplacer(object):
	""" Replaces misspelled words with a likely suggestion based on shortest
	edit distance.
	>>> replacer = SpellingReplacer()
	>>> replacer.replace('cookbok')
	'cookbook'
	"""
	def __init__(self, dict_name='en', max_dist=2):
		self.spell_dict = enchant.Dict(dict_name)
		self.max_dist = max_dist
	
	def replace(self, word):
		if self.spell_dict.check(word):
			return word
		
		suggestions = self.spell_dict.suggest(word)
		
		if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
			return suggestions[0]
		else:
			return word

class CustomSpellingReplacer(SpellingReplacer):
	""" SpellingReplacer that allows passing a custom enchant dictionary, such
	a DictWithPWL.
	>>> d = enchant.DictWithPWL('en_US', 'mywords.txt')
	>>> replacer = CustomSpellingReplacer(d)
	>>> replacer.replace('nltk')
	'nltk'
	"""
	def __init__(self, spell_dict, max_dist=2):
		self.spell_dict = spell_dict
		self.max_dist = max_dist

########################
## Replacing Synonyms ##
########################

class WordReplacer(object):
	""" WordReplacer that replaces a given word with a word from the word_map,
	or if the word isn't found, returns the word as is.
	>>> replacer = WordReplacer({'bday': 'birthday'})
	>>> replacer.replace('bday')
	'birthday'
	>>> replacer.replace('happy')
	'happy'
	"""
	def __init__(self, word_map):
		self.word_map = word_map
	
	def replace(self, word):
		return self.word_map.get(word, word)

class CsvWordReplacer(WordReplacer):
	""" WordReplacer that reads word mappings from a csv file.
	>>> replacer = CsvWordReplacer('synonyms.csv')
	>>> replacer.replace('bday')
	'birthday'
	>>> replacer.replace('happy')
	'happy'
	"""
	def __init__(self, fname):
		word_map = {}
		
		for line in csv.reader(open(fname)):
			word, syn = line
			word_map[word] = syn
		
		super(CsvWordReplacer, self).__init__(word_map)
'''
class YamlWordReplacer(WordReplacer):
	""" WordReplacer that reads word mappings from a yaml file.
	>>> replacer = YamlWordReplacer('synonyms.yaml')
	>>> replacer.replace('bday')
	'birthday'
	>>> replacer.replace('happy')
	'happy'
	"""
	def __init__(self, fname):
		word_map = yaml.load(open(fname))
		super(YamlWordReplacer, self).__init__(word_map)

#######################################
## Replacing Negations with Antonyms ##
#######################################
'''
class AntonymReplacer(object):
	def replace(self, word, pos=None):
		""" Returns the antonym of a word, but only if there is no ambiguity.
		>>> replacer = AntonymReplacer()
		>>> replacer.replace('good')
		>>> replacer.replace('uglify')
		'beautify'
		>>> replacer.replace('beautify')
		'uglify'
		"""
		antonyms = set()
		
		for syn in wordnet.synsets(word, pos=pos):
			for lemma in syn.lemmas():
				for antonym in lemma.antonyms():
					antonyms.add(antonym.name())
		
		if len(antonyms) == 1:
			return antonyms.pop()
		else:
			return None
	
	def replace_negations(self, sent):
		""" Try to replace negations with antonyms in the tokenized sentence.
		>>> replacer = AntonymReplacer()
		>>> replacer.replace_negations(['do', 'not', 'uglify', 'our', 'code'])
		['do', 'beautify', 'our', 'code']
		>>> replacer.replace_negations(['good', 'is', 'not', 'evil'])
		['good', 'is', 'not', 'evil']
		"""
		i, l = 0, len(sent)
		words = []
		
		while i < l:
			word = sent[i]
			
			if word == 'not' and i+1 < l:
				ant = self.replace(sent[i+1])
				
				if ant:
					words.append(ant)
					i += 2
					continue
			
			words.append(word)
			i += 1
		
		return words

class AntonymWordReplacer(WordReplacer, AntonymReplacer):
	""" AntonymReplacer that uses a custom mapping instead of WordNet.
	Order of inheritance is very important, this class would not work if
	AntonymReplacer comes before WordReplacer.
	>>> replacer = AntonymWordReplacer({'evil': 'good'})
	>>> replacer.replace_negations(['good', 'is', 'not', 'evil'])
	['good', 'is', 'good']
	"""
	pass

if __name__ == '__main__':
	import doctest
	doctest.testmod()


Writing replacers.py


In [11]:
!pip install pyenchant

[33mThe directory '/home/jovyan/.cache/pip/http' or its parent directory is not owned by the current user and the cache has been disabled. Please check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.[0m
[33mThe directory '/home/jovyan/.cache/pip' or its parent directory is not owned by the current user and caching wheels has been disabled. check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.[0m
Collecting pyenchant
[33m  Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.VerifiedHTTPSConnection object at 0x7fe27db5d908>: Failed to establish a new connection: [Errno -2] Name or service not known',)': /simple/pyenchant/[0m
  Downloading https://files.pythonhosted.org/packages/9e/54/04d88a59efa33fefb88133ceb638cdf754319030c28aadc5a379d82140ed/pyenchant-2.0.0.tar.gz (64k

In [12]:
'''simple usage example:'''
import enchant
import replacers

from replacers import RegexpReplacer
replacer = RegexpReplacer()

In [13]:
print('''Using RegexpReplacer class created for "can't is a contraction"\n''',
      replacer.replace("can't is a contraction"))
print('''Using RegexpReplacer class created for "I should've done that thing I didn't do"\n''',
      replacer.replace("I should've done that thing I didn't do"))

Using RegexpReplacer class created for "can't is a contraction"
 cannot is a contraction
Using RegexpReplacer class created for "I should've done that thing I didn't do"
 I should have done that thing I did not do


### using the RegexpReplacer class as a preliminary step before tokenization

In [14]:
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer = RegexpReplacer()
print('''this is without regexReplacer \n''',
      word_tokenize("can't is a contraction"))
print('''this is with regexReplacer \n''',
      word_tokenize(replacer.replace("can't is a contraction")))

this is without regexReplacer 
 ['ca', "n't", 'is', 'a', 'contraction']
this is with regexReplacer 
 ['can', 'not', 'is', 'a', 'contraction']


### Removing repeating characters

In [15]:
#writefile replacers.py
import re

class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word

In [16]:
from replacers import RepeatReplacer
replacer = RepeatReplacer()

In [2]:
%whos

Variable         Type      Data/Info
------------------------------------
RepeatReplacer   type      <class '__main__.RepeatReplacer'>
re               module    <module 're' from '/Users<...>nda/lib/python3.5/re.py'>


In [17]:
print('''replacer.replace('looooove')\n''', replacer.replace('looooove'))
print('''replacer.replace('oooooh')\n''',replacer.replace('oooooh'))
print('''replacer.replace('goose')\n''',replacer.replace('goose'))

replacer.replace('looooove')
 love
replacer.replace('oooooh')
 ooh
replacer.replace('goose')
 goose


### Replacing synonyms Page 56
This allows for compressing vocabulary without losgin meaning to save memory and inprove frequency analysis

     store the synonyms in a CSV file or in a YAML file
     

In [18]:
''' defined mapping of a word to its synonym in the csv file.'''
from replacers import CsvWordReplacer
replacer = CsvWordReplacer('synonyms.csv')
print('replace bday with synonym in csvReplacer\n', replacer.replace('bday'))
print('when not in synonym in csvReplacer\n', replacer.replace('happy'))

FileNotFoundError: [Errno 2] No such file or directory: 'synonyms.csv'

## Replacing negations with antonyms
created an AntonymReplacer class in replacers.py as follows:

In [19]:
from replacers import AntonymReplacer
replacer = AntonymReplacer()
replacer.replace('good')
replacer.replace('uglify')
sent = ["let's", 'not', 'uglify', 'our', 'code']
print('''the original sent = "let's", 'not', 'uglify', 'our', 'code'"\n,''',
      replacer.replace_negations(sent))

the original sent = "let's", 'not', 'uglify', 'our', 'code'"
, ["let's", 'beautify', 'our', 'code']


###  new class called SpellingReplacer in replacers.py

In [20]:
from replacers import SpellingReplacer
#replacer = SpellingReplacer()
replacerSP = SpellingReplacer()
replacerSP.replace('cookbok')

'cookbook'

In [21]:
'''Here is an example showing all the suggestions for languege, a
misspelling of language:  language, all the other words have an edit distance of
three or greater. '''

import enchant
d = enchant.Dict('en')
d.suggest('languege')


['language', 'languages', 'languor', "language's"]

In [22]:
from nltk.metrics import edit_distance
print('distance ', edit_distance('language', 'languege'))
print('distance', edit_distance('language', 'languo'))

distance  1
distance 3


### Personal word lists  
This CustomSpellingReplacer class will not replace any words that you put into mywords.txt

In [23]:
import enchant
from replacers import CustomSpellingReplacer
d = enchant.DictWithPWL('en_US', 'mywords.txt')
replacer = CustomSpellingReplacer(d)
replacer.replace('nltk')

'nltk'

In [24]:
%whos

Variable                 Type                      Data/Info
------------------------------------------------------------
AntonymReplacer          type                      <class 'replacers.AntonymReplacer'>
CsvWordReplacer          type                      <class 'replacers.CsvWordReplacer'>
CustomSpellingReplacer   type                      <class 'replacers.CustomSpellingReplacer'>
LancasterStemmer         ABCMeta                   <class 'nltk.stem.lancaster.LancasterStemmer'>
PorterStemmer            ABCMeta                   <class 'nltk.stem.porter.PorterStemmer'>
RegexpReplacer           type                      <class 'replacers.RegexpReplacer'>
RegexpStemmer            ABCMeta                   <class 'nltk.stem.regexp.RegexpStemmer'>
RepeatReplacer           type                      <class 'replacers.RepeatReplacer'>
SnowballStemmer          ABCMeta                   <class 'nltk.stem.snowball.SnowballStemmer'>
SpellingReplacer         type                      <class 'r