__The goal of this chapter is to answer the following questions:__

- How can we write programs to access text from local files and from the web, in order to get hold of an unlimited range of language material?
- How can we split documents up into individual words and punctuation symbols, so we can carry out the same kinds of analysis we did with text corpora in earlier chapters?
- How can we write programs to produce formatted output and save it in a file? 

In [1]:
import nltk,re,pprint

from nltk import word_tokenize

In [2]:
from urllib import request

In [6]:
request.ProxyHandler(proxies)

<urllib.request.ProxyHandler at 0x90a2a90>

In [11]:
response = request.urlopen(url)

In [12]:
raw = response.read().decode('utf8')

In [13]:
type(raw)

str

In [14]:
len(raw)

1176965

In [15]:
raw[:75]

'\ufeffThe Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r'

In [16]:
tokens = word_tokenize(raw)

In [17]:
tokens[:10]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

In [37]:
from nltk.corpus import PlaintextCorpusReader

In [38]:
root = 'c://Users/EIEUCHH/Documents/Trace/'

In [39]:
wordlist = PlaintextCorpusReader(root,'.*')

In [40]:
trace = wordlist.words('Trace_sample.txt')

In [41]:
trace

['[', '2018', '-', '04', '-', '28', '22', ':', '20', ...]

In [42]:
type(trace)

nltk.corpus.reader.util.StreamBackedCorpusView

In [44]:
trace_str = ' '.join(trace)

In [45]:
type(trace_str)

str

In [46]:
trace_tokens = word_tokenize(trace_str)

In [48]:
print(type(trace_tokens),len(trace_tokens))

<class 'list'> 25125


In [50]:
trace_tokens[:10]

['[', '2018', '-', '04', '-', '28', '22', ':', '20', ':']

In [51]:
trace_text = nltk.Text(trace_tokens)

In [52]:
type(trace_text)

nltk.text.Text

In [53]:
trace_text.collocations()

du1 com_ericsson_triobjif; handleSignal received; 1648098 receiver;
164809b receiver; 164809e receiver; 16480a1 receiver; 1648097 sender;
164809a sender; 164809d sender; 16480a0 sender; Received
RICM_CONFIGURE_IQC_SWITCH_CFM; Received RICM_CONFIGURE_IQ_SWITCH_CFM;
Sent RICM_CONFIGURE_IQC_SWITCH_REQ; Sent RICM_CONFIGURE_IQ_SWITCH_REQ;
received RICM_CONFIGURE_IQC_SWITCH_CFM; received
RICM_CONFIGURE_IQ_SWITCH_CFM; 6993 RICM_RELEASE_IQC_SWITCH_CFM; 6993
RICM_RELEASE_IQ_SWITCH_CFM; 6993 RICM_RELEASE_IQ_SWITCH_REQ; 6993
RICM_RELEASE_IQC_SWITCH_REQ


In [54]:
import os

In [55]:
os.listdir('.')

['.ipynb_checkpoints',
 'add_num.py',
 'Ch02-Accessing Text Corpora and Lexical Resources.ipynb',
 'Ch03-Processing Raw Text.ipynb',
 'code_unusual.py',
 'print_sumab.py',
 'spell_check.py',
 '__pycache__']

In [58]:
raw_trace = open('c://Users/EIEUCHH/Documents/Trace/Trace_sample.txt').read()

In [59]:
raw_trace

'[2018-04-28 22:20:57.947056730] (+0.000071015) du1 com_ericsson_triobjif:TRACE6: { cpu_id = 11 }, { processAndObjIf = "nc_main_thread(ncDcIf)", fileAndLine = "IcDciMsgHelpers.cc:154", msg = "RICM_RELEASE_IQC_SWITCH_REQ, carrierBranchId=3, unitId=1" }\n[2018-04-28 22:20:57.947232012] (+0.000142235) du1 com_ericsson_triobjif:REC_SIG: { cpu_id = 9 }, { processAndObjIf = "Nci_control_proc(RICM_IF_DCI)", fileAndLine = "ricm_control_iq.c:1973", msg = "signo:16480a0 sender :6993 RICM_RELEASE_IQC_SWITCH_REQ: unitId: 0x1, carrierBranchId: 3" }\n[2018-04-28 22:20:57.947330730] (+0.000013484) du1 com_ericsson_triobjif:SEND_SIG: { cpu_id = 9 }, { processAndObjIf = "Nci_control_proc(RICM_IF_DCI)", fileAndLine = "ricm_control_iq.c:1183", msg = "signo:16480a1 receiver :6993 RICM_RELEASE_IQC_SWITCH_CFM: unitId: 0x1, carrierBranchId: 3" }\n[2018-04-28 22:20:57.947355855] (+0.000025125) du1 com_ericsson_triobjif:TRACE6: { cpu_id = 11 }, { processAndObjIf = "nc_main_thread(ncIcEquipmentControlIfSwU)", f

In [61]:
words = [w.lower() for w in trace_tokens]

In [62]:
vocab = sorted(set(words))

In [63]:
print(len(words),len(vocab))

25125 920


In [64]:
vocab.append('blog')

## 3.4   Regular Expressions for Detecting Word Patterns

In [67]:
import nltk
import re

In [68]:
wordlist = [ w for w in nltk.corpus.words.words('en') if w.islower()]

In [71]:
[w for w in wordlist if re.search('x$',w)]

['abox',
 'abrasax',
 'accusatrix',
 'acronyx',
 'addax',
 'adfix',
 'adieux',
 'administratrix',
 'admix',
 'admonitrix',
 'adnex',
 'advocatrix',
 'affix',
 'afflux',
 'agitatrix',
 'ampyx',
 'anatox',
 'androsphinx',
 'annex',
 'antapex',
 'antefix',
 'anthrax',
 'antianthrax',
 'anticlimax',
 'anticomplex',
 'antiflux',
 'antihelix',
 'antiorthodox',
 'antispadix',
 'antitax',
 'apex',
 'apoplex',
 'appendix',
 'arbitratrix',
 'archocystosyrinx',
 'archosyrinx',
 'arquifoux',
 'arthrosyrinx',
 'arx',
 'assertrix',
 'auspex',
 'autocratrix',
 'auxotox',
 'aviatrix',
 'ax',
 'azox',
 'bandbox',
 'bateaux',
 'beaux',
 'beeswax',
 'bemix',
 'biconvex',
 'biflex',
 'bijoux',
 'billywix',
 'bisectrix',
 'blennothorax',
 'borax',
 'bostryx',
 'breadbox',
 'breakax',
 'broadax',
 'bureaux',
 'butterbox',
 'cacanthrax',
 'cakebox',
 'calix',
 'calx',
 'calyx',
 'candlebox',
 'capax',
 'captivatrix',
 'carapax',
 'carfax',
 'carnifex',
 'casebox',
 'cashbox',
 'caudex',
 'cephalothorax',
 'c

In [77]:
[w for w in wordlist if re.search('^a..b..t..$',w)]

['alabaster', 'alabastos', 'anabiotic', 'anabrotic', 'apobiotic', 'azobacter']

In [82]:
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)]

['gold', 'golf', 'hold', 'hole']

In [84]:
[w for w in wordlist if re.search('^[ghi].[jkl].[mno]$',w)]

['gilim',
 'golem',
 'hakam',
 'hakim',
 'helio',
 'hello',
 'hilum',
 'hokum',
 'hollo',
 'igloo',
 'inken']

In [85]:
[w for w in wordlist if re.search('^[abc][def][jkl][mno]$',w)]

[]

In [87]:
[w for w in wordlist if re.search('^[a-d]b..t$',w)]

['abaft', 'abbot', 'abdat', 'abnet', 'abort', 'about', 'abret', 'absit']

In [91]:
[w for w in wordlist if re.search('^[l]?..l$',w)]

['aal',
 'ail',
 'all',
 'awl',
 'bal',
 'bel',
 'cal',
 'col',
 'dal',
 'eel',
 'ell',
 'gal',
 'gel',
 'gol',
 'gul',
 'ill',
 'kil',
 'kyl',
 'lall',
 'leal',
 'lill',
 'loll',
 'lull',
 'mal',
 'mel',
 'mil',
 'nil',
 'nul',
 'oil',
 'owl',
 'pal',
 'pol',
 'pul',
 'rel',
 'sal',
 'sil',
 'sol',
 'tal',
 'til',
 'tol',
 'ull',
 'vol',
 'zel']