### Whoosh Index Build Notebook

1) Import Dependencies

In [1]:
#Python 3.6.7 :: Anaconda custom (64-bit)
#Whoosh 2.7.4_py36_1

import os.path
import codecs
import pandas as pd

from whoosh import fields, index
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh import qparser
from whoosh.qparser import QueryParser
from whoosh.filedb.filestore import FileStorage

from tqdm import tqdm_notebook

import nltk
# Fetch the NLTK data packages this notebook asks for. The recorded output
# shows each one reported as already up-to-date when it is cached locally.
nltk.download('punkt')  # NOTE(review): not referenced by the visible cells - confirm it is still needed
nltk.download('brown')  # the corpus read through nltk.corpus.brown in the DataFrame cell
nltk.download('universal_tagset')  # presumably the mapping behind tagset='universal' in the DataFrame cell
from nltk.corpus import brown
from nltk import bigrams  # NOTE(review): bigrams is not used in the visible cells

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


2) Import Brown Corpus into Pandas

In [2]:
# Build a DataFrame with one row per Brown-corpus sentence:
#   file     - id of the corpus file the sentence came from
#   category - first category listed for that file
#   sentence - the sentence's words joined with single spaces
#   words    - the sentence's words as a list
fileids = brown.fileids()
file_list = []
cat_list = []
sent_list = []
word_list = []
for file in fileids:
    # The category depends only on the file, so look it up once per file
    # instead of once per sentence.
    category = brown.categories(fileids=[file])[0]
    for sentence in brown.tagged_sents(tagset='universal', fileids=[file]):
        # Keep only the first element (the word) of each tagged token.
        words = [token[0] for token in sentence]
        file_list.append(file)
        cat_list.append(category)
        sent_list.append(' '.join(words))
        word_list.append(words)

data = pd.DataFrame({'file': file_list, 'category': cat_list, 'sentence': sent_list, 'words': word_list})
# The index is named b100 so it lines up with the b100 id field used when indexing.
data.index.name = 'b100'
data.head()

Unnamed: 0_level_0,file,category,sentence,words
b100,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ca01,news,The Fulton County Grand Jury said Friday an in...,"[The, Fulton, County, Grand, Jury, said, Frida..."
1,ca01,news,The jury further said in term-end presentments...,"[The, jury, further, said, in, term-end, prese..."
2,ca01,news,The September-October term jury had been charg...,"[The, September-October, term, jury, had, been..."
3,ca01,news,`` Only a relative handful of such reports was...,"[``, Only, a, relative, handful, of, such, rep..."
4,ca01,news,The jury said it did find that many of Georgia...,"[The, jury, said, it, did, find, that, many, o..."


3) Index the Corpus

In [3]:
# Search schema: one stored field for the row id (b100) and one for each
# DataFrame column, so every value can be read back from a search hit.
schema = Schema(
    b100=ID(stored=True),
    file=TEXT(stored=True),
    category=TEXT(stored=True),
    sentence=TEXT(stored=True),
    words=KEYWORD(stored=True),
)

In [None]:
# add dataframe rows to the index
# Make sure the target directory exists; exist_ok avoids the check-then-create race.
os.makedirs("index", exist_ok=True)
ix = create_in("index", schema)
with ix.writer() as w:
    # itertuples() yields each row's values directly, so no per-row Series is
    # built and no per-field label lookup is needed.
    # total= is given explicitly because the row iterator has no length, which
    # would otherwise leave the progress bar without a known end.
    for row in tqdm_notebook(data.itertuples(), total=len(data)):
        w.add_document(b100=str(row.Index),  # row label, stored as text
                       file=row.file,
                       category=row.category,
                       sentence=row.sentence,
                       words=row.words)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
# manage index for use later
# Point the storage at "index", the directory the build cell wrote to with
# create_in("index", schema); any other path has no index to open.
storage = FileStorage("index")

# To create a new index in this storage instead of opening the existing one:
#ix = storage.create_index(schema)

# Open the existing index and keep the handle so it can actually be used
ix = storage.open_index()

4) Head over to the search notebook to use the index.  

whoosh_search_brown.ipynb