## Preparation

### Installation

We assume that the repo is cloned and that all necessary packages are installed, which includes running the script:

```./install_packages.sh```

and the code is compiled:

```./build.sh```

### Changing directory to the repo root

In [None]:
cd ../..

### Downloading demo data

In [None]:
# Fetch the demo archive into the current directory (it is unpacked in the next cell)
!wget boytsov.info/datasets/flecsneurt-demo-2020-04-07.tar.bz2 

In [None]:
# Unpack the downloaded bzip2-compressed archive
# (tar flags: j = bzip2, x = extract, v = verbose listing, f = archive file name)
!tar jxvf flecsneurt-demo-2020-04-07.tar.bz2

In [2]:
# Create a Lucene index for the "squad" collection.
# Per the log below, the script reads collections/squad/input_data, writes to
# collections/squad/lucene_index, and removes any previously created index first.
!scripts/index/create_lucene_index.sh squad

Using collection root: collections
Data directory: collections/squad/input_data
Index directory: collections/squad/lucene_index
Removing previously created index (if exists)
Checking data sub-directory: dev1
Found indexable data file: dev1/AnswerFields.jsonl
Checking data sub-directory: dev2
Found indexable data file: dev2/AnswerFields.jsonl
Checking data sub-directory: test
Found indexable data file: test/AnswerFields.jsonl
Checking data sub-directory: train
Found indexable data file: train/AnswerFields.jsonl
Checking data sub-directory: train_bitext
Found indexable data file: train_bitext/AnswerFields.jsonl
Found query file: dev1/QuestionFields.jsonl
Found query file: dev2/QuestionFields.jsonl
Found query file: test/QuestionFields.jsonl
Found query file: train/QuestionFields.jsonl
Found query file: train_bitext/QuestionFields.jsonl
Using the data input file: AnswerFields.jsonl
JAVA_OPTS=-Xms65972946k -Xmx115452655k -server
Creating a new Lucene index, maximum # of docs to process: 21

In [3]:
# Creating a forward index for two fields:
# text is a parsed text field
# text_unlemm is a raw text field that keeps the text as is
# -clean removes all previous forward indices
# NOTE(review): "mapdb" presumably selects the index storage backend — TODO confirm
!scripts/index/create_fwd_index.sh squad mapdb  \
                               "text:parsedText text_unlemm:raw" \
                               -clean

Using collection root: collections
Data directory:            collections/squad/input_data
Forward index directory:   collections/squad/forward_index/
Clean old index?:          1
Removing previously created index (if exists)
Field list definition:     text:parsedText text_unlemm:raw
Checking data sub-directory: dev1
Found indexable data file: dev1/AnswerFields.jsonl
Checking data sub-directory: dev2
Found indexable data file: dev2/AnswerFields.jsonl
Checking data sub-directory: test
Found indexable data file: test/AnswerFields.jsonl
Checking data sub-directory: train
Found indexable data file: train/AnswerFields.jsonl
Checking data sub-directory: train_bitext
Found indexable data file: train_bitext/AnswerFields.jsonl
Found query file: dev1/QuestionFields.jsonl
Found query file: dev2/QuestionFields.jsonl
Found query file: test/QuestionFields.jsonl
Found query file: train/QuestionFields.jsonl
Found query file: train_bitext/QuestionFields.jsonl
JAVA_OPTS=-Xms98959419k -Xmx115452655k -ser

## API demo

In [4]:
from scripts.py_flexneuart.setup import *

In [5]:
# Add the Java JAR to the class path.
# 'target' is the directory handed to configure_classpath; presumably this is where
# ./build.sh places the compiled JAR — TODO confirm.
configure_classpath('target')

In [6]:
# Create a resource manager pointed at the forward index directory
# (the path create_fwd_index.sh reported as its "Forward index directory").
# The same object is later passed to create_cand_provider() and get_forward_index().
resource_manager=create_featextr_resource_manager('collections/squad/forward_index')

### Retrieval

In [7]:
from scripts.py_flexneuart.cand_provider import *
# Create a candidate provider/generator of the Lucene type, pointed at the
# index directory that create_lucene_index.sh wrote to
cand_prov = create_cand_provider(resource_manager, PROVIDER_TYPE_LUCENE, 'collections/squad/lucene_index')

In [8]:
# Ask the candidate provider for 20 candidates for a whitespace-tokenized query.
# The displayed result is a pair: an integer followed by a list of
# CandidateEntry(doc_id, score) items
# (the integer is presumably the total number of matches — TODO confirm).
query_tokens = "university notre dame student run".split()
run_query(cand_prov, 20, query_tokens)

(1961,
 [CandidateEntry(doc_id='@4309', score=27.225019454956055),
  CandidateEntry(doc_id='@4323', score=27.155115127563477),
  CandidateEntry(doc_id='@2608', score=26.603111267089844),
  CandidateEntry(doc_id='@4310', score=26.364826202392578),
  CandidateEntry(doc_id='@4303', score=26.196977615356445),
  CandidateEntry(doc_id='@4350', score=25.074060440063477),
  CandidateEntry(doc_id='@4346', score=24.675006866455078),
  CandidateEntry(doc_id='@4316', score=24.361064910888672),
  CandidateEntry(doc_id='@4336', score=23.92790985107422),
  CandidateEntry(doc_id='@4345', score=23.00181007385254),
  CandidateEntry(doc_id='@4320', score=22.964710235595703),
  CandidateEntry(doc_id='@2607', score=22.55484390258789),
  CandidateEntry(doc_id='@4325', score=22.466575622558594),
  CandidateEntry(doc_id='@4330', score=21.723785400390625),
  CandidateEntry(doc_id='@4348', score=21.596769332885742),
  CandidateEntry(doc_id='@4321', score=21.47646713256836),
  CandidateEntry(doc_id='@4338', scor

### Forward index demo

In [9]:
from scripts.py_flexneuart.fwd_index import get_forward_index

#### First, let's play with a raw index that keeps only unparsed text

In [10]:
# Open the forward index of the 'text_unlemm' field (indexed as text_unlemm:raw)
raw_indx = get_forward_index(resource_manager, 'text_unlemm')

In [11]:
# The raw flag is set: 'text_unlemm' was indexed as text_unlemm:raw
raw_indx.is_raw

True

In [12]:
# Fetch the stored text of document '@4302' from the raw index (displayed as a single string)
raw_indx.get_doc_raw('@4302')

'architecturally school catholic character atop main building gold dome golden statue virgin mary immediately main building facing copper statue christ arms upraised legend venite ad omnes main building basilica sacred heart immediately basilica grotto marian place prayer reflection replica grotto lourdes france virgin mary reputedly appeared saint bernadette soubirous 1858 end main drive direct line connects 3 statues gold dome simple modern stone statue mary'

#### A parsed index has more info

In [13]:
# Open the forward index of the 'text' field (indexed as text:parsedText)
parsed_indx = get_forward_index(resource_manager, 'text')

In [14]:
# Here is_raw is False: 'text' was indexed as text:parsedText rather than raw
parsed_indx.is_raw

False

In [15]:
# Fetch the parsed entry of the same document '@4302'; as the output shows, it holds
# word_ids, word_qtys, word_id_seq, and doc_len rather than the text itself
parsed_indx.get_doc_parsed('@4302')

DocEntryParsed(word_ids=[1, 470, 480, 549, 770, 848, 857, 867, 1143, 1193, 1291, 1514, 1562, 1597, 1897, 2210, 2425, 2513, 2579, 3171, 3207, 3357, 3806, 3899, 3960, 4056, 4334, 4790, 5881, 6258, 6274, 6629, 6645, 7051, 7557, 8066, 9139, 10063, 11826, 12878, 13240, 16221, 20752, 32578, 32579, 32580, 32581, 32582, 32583], word_qtys=[1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1], word_id_seq=[10063, 1, 6274, 848, 8066, 1291, 1193, 6645, 9139, 3171, 11826, 1143, 4334, 1597, 1291, 1193, 2210, 3899, 11826, 2425, 3806, 32578, 857, 32579, 7051, 13240, 1291, 1193, 4790, 7557, 4056, 1597, 4790, 32580, 6258, 1897, 5881, 16221, 6629, 32580, 32581, 3207, 1143, 4334, 20752, 470, 2579, 32582, 32583, 12878, 1562, 1291, 3357, 867, 770, 2513, 549, 11826, 6645, 9139, 1514, 480, 3960, 11826, 4334], doc_len=65)

In [16]:
# Look up the first word of the document (10063 is the first element of
# word_id_seq in the parsed entry) together with its word entry
first_word_id = 10063
parsed_indx.get_word_by_id(first_word_id), parsed_indx.get_word_entry_by_id(first_word_id)

('architecturally', WordEntry(word_id=10063, word_freq=11))