### create a custom corpus (bunch of text files in directory)

To train your own model, such as a part-of-speech tagger or text classifier, you will need to
create a custom corpus to train on.   
***Create a custom nltk_data directory in our home directory.***

In [9]:
# Create the custom NLTK data directory at /notebooks/nltk_data.
import os

# Location of the new nltk_data folder. expanduser() leaves this absolute
# path unchanged; it only matters if the path is switched to a "~" form.
path = os.path.expanduser('/notebooks/nltk_data')
# makedirs(..., exist_ok=True) also creates any missing parent directory and
# is a no-op when the folder already exists, so the cell can be re-run safely
# (the previous exists()/mkdir() pair failed when the parent was missing).
os.makedirs(path, exist_ok=True)

In [10]:
# Sanity check: the nltk_data directory created in the previous step should
# now be present on disk (the cell displays True when it is).
import os
import os.path

path = os.path.expanduser('/notebooks/nltk_data')
os.path.exists(path)

True

In [11]:
"""Check whether the custom directory held in ``path`` is listed in
``nltk.data.path``, which is required to use it as a custom corpus location.

``path`` is the directory string assigned in an earlier cell. The expression
evaluates to True only when that exact string is one of the entries.
"""
import nltk.data
path in nltk.data.path

False

In [7]:
# Once you have your nltk_data directory, the convention is that corpora
# reside in a `corpora` subdirectory. Create that subdirectory so the path
# used here is ~/work/nltk_data/corpora (see p. 50).
import os

path = os.path.expanduser('~/work/nltk_data/corpora')
# makedirs(..., exist_ok=True) creates the missing parents (for example
# ~/work/nltk_data) as well, and does nothing when the directory already
# exists. The previous exists()/mkdir() pair raised FileNotFoundError when
# the parent directory had not been created yet.
os.makedirs(path, exist_ok=True)
# NOTE: the corpus examples read their files from a `cookbook` subdirectory
# of this folder; create it the same way when it is needed.


### create a simple wordlist file

In [6]:
# IPython magic: show the notebook kernel's current working directory.
%pwd

'/Users/alessandropiccolo/Google Drive/Python/1JupyterNotebook/NLTK'

In [12]:
import nltk.data

# Load the word list in 'raw' format (the unparsed file contents) to confirm
# that NLTK can read the file.
mywords_file = '/home/jovyan/work/nltk_data/corpora/cookbook/mywords.txt'
nltk.data.load(mywords_file, format='raw')

b"'nltk'\n'corpus'\n'corpora'\n'wordnet'"

### Creating a wordlist corpus  
`WordListCorpusReader` provides access to a file containing a list of words, one word per line. 

In [14]:
# Build a WordListCorpusReader over the cookbook directory and ask it for the
# list of words found in mywords.txt.
from nltk.corpus.reader import WordListCorpusReader

corpus_root = '/home/jovyan/work/nltk_data/corpora/cookbook'
wordlist_files = ['mywords.txt']
reader = WordListCorpusReader(corpus_root, wordlist_files)
reader.words()

["'nltk'", "'corpus'", "'corpora'", "'wordnet'"]

In [15]:
# List the file identifiers that make up the corpus behind `reader`.
reader.fileids()

['mywords.txt']

### creating a POS tagged word corpus
<img src="taggedCorpusReader.png" />
If you put tagged text in a file called brown.pos, you could then create a TaggedCorpusReader instance using the following code:

In [24]:
from nltk.corpus.reader import TaggedCorpusReader

# Every file under the cookbook directory whose name ends in ".pos" is
# treated as part of the tagged corpus.
cookbook_root = '/Users/alessandropiccolo/nltk_data/corpora/cookbook'
pos_file_pattern = r'.*\.pos'
reader = TaggedCorpusReader(cookbook_root, pos_file_pattern)
reader.words()

['The', 'expense', 'and', 'time', 'involved', 'are', ...]

In [25]:
"""Return the corpus as a list of tagged tokens.

A tagged token is simply a (word, tag) tuple.
"""
reader.tagged_words()

[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...]

In [26]:
"""Return the corpus as a list of sentences, where each sentence is itself
a list of words.

The tagged counterpart, tagged_sents(), yields sentences made of tagged
tokens instead of plain words.
"""
reader.sents()

[['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']]

In [27]:
"""Return the corpus as a list of tagged sentences, where each sentence is a
list of (word, tag) tuples.

Paragraph-level views (each paragraph a list of sentences) come from the
reader's paras() / tagged_paras() methods, not from this call.
"""
reader.tagged_sents()

[[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]]

### Customize the reader
Customize a corpus reader by passing in your own word tokenizer, sentence tokenizer, and paragraph block reader at initialization time (page 57).

In [28]:
from nltk.tokenize import SpaceTokenizer

# Read the .pos files again, this time supplying a SpaceTokenizer as the
# reader's word tokenizer.
space_tokenizer = SpaceTokenizer()
reader = TaggedCorpusReader(
    '/Users/alessandropiccolo/nltk_data/corpora/cookbook',
    r'.*\.pos',
    word_tokenizer=space_tokenizer,
)
reader.words()

['The', 'expense', 'and', 'time', 'involved', 'are', ...]

### create a corpus with sentences that contain chunks -- creating chunked phrases  
A chunk is a short phrase within a sentence. Chunks are subtrees within a sentence tree, and they will be covered in much more detail in Chapter 5, Extracting Chunks.  
In the example below, each bracketed span is a noun-phrase chunk. Words that are not within brackets are part of the sentence tree, but are not part of any noun phrase subtree.
>[Earlier/JJR staff-reduction/NN moves/NNS] have/VBP trimmed/VBN about/IN [300/CD jobs/NNS] ,/, [the/DT spokesman/NN] said/VBD ./.

In [3]:
from nltk.corpus.reader import ChunkedCorpusReader

# Pick up every *.chunk file in the current working directory.
reader = ChunkedCorpusReader('.', r'.*\.chunk')

# Print each chunked view under its label. The labels are reproduced
# verbatim so the printed output is unchanged. Each view is fetched right
# before it is printed.
chunk_views = (
    ('chunked words\n', reader.chunked_words),
    ('chuncked sentences\n', reader.chunked_sents),
    ('chunked paragraphs\n', reader.chunked_paras),
)
for heading, view in chunk_views:
    print(heading, view())

chunked words
 [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ...]
chuncked sentences
 [Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])]
chunked paragraphs
 [[Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])]]


### Tree leaves  
The leaves of a tree are its tagged tokens.

In [5]:
# leaves() lists the (word, tag) tokens of a tree. Show them for the first
# chunk, the first sentence, and the first sentence of the first paragraph.
first_chunk = reader.chunked_words()[0]
print('leaves', first_chunk.leaves())

first_sentence = reader.chunked_sents()[0]
print('leaves of sents\n', first_sentence.leaves())

first_paragraph = reader.chunked_paras()[0]
print('leaves of para \n', first_paragraph[0].leaves())

leaves [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]
leaves of sents
 [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')]
leaves of para 
 [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')]


###  allowing multiple different chunk phrase types, not just noun phrases. 

In [2]:
from nltk.corpus.reader import ConllChunkCorpusReader

# Chunk types the reader is told to recognise in the .iob files.
chunk_types = ('NP', 'VP', 'PP')
conllreader = ConllChunkCorpusReader(
    '/Users/alessandropiccolo/nltk_data/corpora/cookbook',
    r'.*\.iob',
    chunk_types,
)

# Print the chunked (tree) views first, then the flat IOB tuple views. The
# labels are reproduced verbatim so the printed output is unchanged.
conll_views = (
    ('conllreader chunked words\n', conllreader.chunked_words),
    ('conllreader chunked sents\n', conllreader.chunked_sents),
    ('conllreader chunked into list of tupples word\n', conllreader.iob_words),
    ('conllreader chunked into list of tupples sent\n', conllreader.iob_sents),
)
for heading, view in conll_views:
    print(heading, view())

conllreader chunked words
 [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), ...]
conllreader chunked sents
 [Tree('S', [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), Tree('NP', [('executive', 'JJ'), ('vice', 'NN'), ('president', 'NN')]), Tree('PP', [('of', 'IN')]), Tree('NP', [('Balcor', 'NNP')]), ('.', '.')])]
conllreader chunked into list of tupples word
 [('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ...]
conllreader chunked into list of tupples sent
 [[('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ('had', 'VBD', 'B-VP'), ('been', 'VBN', 'I-VP'), ('executive', 'JJ', 'B-NP'), ('vice', 'NN', 'I-NP'), ('president', 'NN', 'I-NP'), ('of', 'IN', 'B-PP'), ('Balcor', 'NNP', 'B-NP'), ('.', '.', 'O')]]
