# Processing raw text

## Import libraries

In [1]:
import nltk, re, pprint

## 3.1 Accessing Text from the Web and from Disk

In [6]:
from urllib.request import urlopen

# Plain-text Project Gutenberg ebook #2554 ("Crime and Punishment").
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# Open the response as a context manager so the underlying HTTP connection
# is closed as soon as the body has been read (the bare urlopen(url).read()
# form leaves the response object open until it is garbage-collected).
with urlopen(url) as response:
    raw = response.read()
# read() returns bytes; decode to str before any text processing.
decode_raw = raw.decode('utf-8')

In [7]:
# Peek at the first 75 characters of the decoded text.
decode_raw[:75]

'*** START OF THE PROJECT GUTENBERG EBOOK 2554 ***\n\n\n\n\nCRIME AND PUNISHMENT\n'

In [8]:
# Break the decoded text up into word and punctuation tokens.
tokens = nltk.word_tokenize(decode_raw)
# Display the type of the tokenizer's return value.
type(tokens)

list

In [9]:
# Show the first ten tokens.
tokens[:10]

['*', '*', '*', 'START', 'OF', 'THE', 'PROJECT', 'GUTENBERG', 'EBOOK', '2554']

In [10]:
# Wrap the token list in an NLTK Text object so the exploration helpers
# (collocations, concordance) can be used on it.
# `nltk.Text` is the same class as `nltk.text.Text`; the short spelling is
# the one used everywhere else in this notebook.
text = nltk.Text(tokens)

In [11]:
# Print the collocations (word pairs such as "Katerina Ivanovna") that NLTK
# finds in the text.
text.collocations()

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Porfiry Petrovitch; Amalia Ivanovna; great deal; young man;
Nikodim Fomitch; Ilya Petrovitch; Andrey Semyonovitch; Hay Market;
Dmitri Prokofitch; Good heavens; police station; head clerk


> - The `find` and `rfind` (reverse find) string methods help us get the right index values to use for slicing a string.

### Dealing with HTML

In [13]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Read inside a `with` block so the HTTP response is closed once the page
# body has been fetched, instead of being left open.
with urlopen(url) as response:
    html = response.read().decode('utf8')
# Peek at the first 60 characters of the page source.
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [17]:
# Strip the HTML markup, then tokenize and explore the remaining text.
from bs4 import BeautifulSoup

# Parse the page with the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')
# Extract the text of the parsed document.
raw = soup.get_text()
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
# Show occurrences of 'gene' in context; the 'Gene' hits in the output show
# that matching ignores case.
text.concordance('gene')

Displaying 7 of 7 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin
r 's Polio campaign launched in Iraq Gene defect explains high blood pressure 
r 's Polio campaign launched in Iraq Gene defect explains high blood pressure 


### Reading Local Files

In [18]:
# Read the whole file into memory. The `with` block closes the file handle
# even if read() raises, which the bare open()/read() pair did not do.
# NOTE(review): no encoding is given, so the platform default applies —
# confirm it matches how document.txt was saved.
with open('document.txt') as f:
    raw = f.read()

## The NLP Pipeline

HTML --> ASCII --> Text --> Vocab
- HTML
    - html = urlopen(url).read().decode('utf-8')
    - soup = BeautifulSoup(html, 'html.parser')
    - raw = soup.get_text()
- ASCII
    - tokens = nltk.wordpunct_tokenize(raw)
- Text    
    - text = nltk.Text(tokens)
- Vocab
    - words = [w.lower() for w in text]
    - vocab = sorted(set(words))


## 3.2 Useful applications of Regular Expressions

### Extracting Word pieces

In [20]:
word = "supercalifragilisticexpialidocious"
# Collect every vowel in the word, in order of appearance.
re.compile(r'[aeiou]').findall(word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [21]:
# Count the vowels by taking the length of the list of matches.
len(re.findall(r'[aeiou]', word))

16

## 3.3 Normalizing Text

### Stemmers

In [30]:
# Sample passage used as input for the stemmer and lemmatizer cells.
raw = """Dennis: Listen, strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony."""
# Split the passage into word and punctuation tokens.
tokens = nltk.word_tokenize(raw)

In [31]:
# Create one instance of each stemmer so their output can be compared on the
# same tokens.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [32]:
# Apply the Porter stemmer to every token and print the result as one list.
print(list(map(porter.stem, tokens)))

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']


In [33]:
# Apply the Lancaster stemmer to every token and print the result as one list.
print(list(map(lancaster.stem, tokens)))

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


### Lemmatization
The WordNet lemmatizer removes affixes only if the resulting word is in its dictionary.

In [34]:
wnl = nltk.WordNetLemmatizer()
# Lemmatize every token and print the result as one list. No part-of-speech
# tag is passed, so the lemmatizer's default applies to each token.
print(list(map(wnl.lemmatize, tokens)))

['Dennis', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


- The WordNet lemmatizer is a good choice if you want to compile the vocabulary of some texts and want a list of valid lemmas (or lexicon headwords).