# Brand New Concordance

A quick and dirty way of building a concordance

In [1]:
# coding: utf-8

import os

from cheshire3.baseObjects import Session
from cheshire3.document import StringDocument
from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer   

session = Session()
session.database = 'db_dickens'
serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
db = serv.get_object(session, session.database)
qf = db.get_object(session, 'defaultQueryFactory')
resultSetStore = db.get_object(session, 'resultSetStore')
idxStore = db.get_object(session, 'indexStore')

In [2]:
def build_concordance(term, context):
    
    query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/proxinfo c3.chapter-idx all/proxinfo "{}" )""".format(term))
    result_set = db.search(session, query)

    concordance = []
        
    for result in result_set:
        record = result.fetch_record(session)
        tree = record.get_dom(session)
        text_nodes = tree.xpath('//txt/text()')
        text_only = ' '.join(sentence for sentence in text_nodes)
            
        for hit in result.proxInfo:
            word_id = hit[0][1]  
            char_location = hit[0][2]
            concordance_line = text_only[char_location - context : char_location + len(term) + context]
            #NOTE in these cases record.process_xpath(session, xpath) is not faster
            #TODO check there is only one result
            #sentence = tree.xpath('/div/descendant::w[%i]/ancestor-or-self::s/@id' % int(word_id + 1))
            #paragraph = tree.xpath('/div/descendant::w[%i]/ancestor-or-self::p/@id' % int(word_id + 1))
            #concordance.append((concordance_line, sentence[0], paragraph[0]))
            concordance.append((hit, concordance_line, text_only, tree))
    
    return concordance

In [3]:
#%timeit -n6 concordance = build_concordance("fog", 25)

In [4]:
concordance = build_concordance("fog", 25)
print len(concordance)

94


In [5]:
concordance[55][:2]

([[0, 3603, 19274, 15292, 15292]],
 "pike lamp was a blur, quite out of the lamp's usual p")

In [6]:
text = concordance[55][2]

In [7]:
print text[15292:19500]
# is the fourth element the number of w nodes?



In [8]:
from lxml import etree
xmlstring = etree.tostring(concordance[55][3])

In [9]:
snippet = """w o="94">home</w><n>.</n></toks></s><s sid="194" id="GE.c15.s194" eid="456"><txt>Beyond town, we found a heavy mist out, and it fell wet and thick.</txt><toks><w o="0">Beyond</w><n> </n><w o="7">town</w><n>,</n><n> </n><w o="13">we</w><n> </n><w o="16">found</w><n> </n><w o="22">a</w><n> </n><w o="24">heavy</w><n> </n><w o="30">mist</w><n> </n><w o="35">out</w><n>,</n><n> </n><w o="40">and</w><n> </n><w o="44">it</w><n> </n><w o="47">fell</w><n> </n><w o="52">wet</w><n> </n><w o="56">and</w><n> </n><w o="60">thick</w><n>.</n></toks></s><s sid="195" id="GE.c15.s195" eid="457"><txt>The turnpike lamp was a blur, quite out of the lamp\'s usual place apparently, and its rays looked solid substance on the fog.</txt><toks><w o="0">The</w><n> </n><w o="4">turnpike</w><n> </n><w o="13">lamp</w><n> </n><w o="18">was</w><n> </n><w o="22">a</w><n> </n><w o="24">blur</w><n>,</n><n> </n><w o="30">quite</w><n> </n><w o="36">out</w><n> </n><w o="40">of</w><n> </n><w o="43">the</w><n> </n><w o="47">lamp\'s</w><n> </n><w o="54">usual</w><n> </n><w o="60">place</w><n> </n><w o="66">apparently</w><n>,</n><n> </n><w o="78">and</w><n> </n><w o="82">its</w><n> </n><w o="86">rays</w><n> </n><w o="91">looked</w><n> </n><w o="98">solid</w><n> </n><w o="104">substance</w><n> </n><w o="114">on</w><n> </n><w o="117">the</w><n> </n><w o="121">fog</w><n>.</n></toks></s><s sid="196" id="GE.c15.s196" eid="458"><txt>We were noticing this, and saying how that the mist rose with a change of wind from a certain quarter of our marshes, when we came upon a man, slouching under the lee of the turnpike house.</txt><toks><w o="0">We</w><n> </n><w o="3">were</w><n> </n><w o="8">noticing</w><n> </n><w o="17">this</w><n>,</n><n> </n><w o="23">and</w><n> </n><w o="27">saying</w><n> </n><w o="34">how</w><n> </n><w o="38">that</w><n> </n><w o="43">the</w><n> </n><w o="47">mist</w><n> </n><w o="52">rose</w><n> </n><w o="57">with</w><n> </n><w o="62">a</w><n> </n><w o="64">change</w><n> </n><w o="71">of</w><n> </n><w o="74">wind</w><n> </n><w o="79">from</w><n> </n><w o="84">a</w><n> </n><w o="86">certain</w><n> </n><w o="94">quarter</w><n> </n><w o="102">of</w><n> </n><w o="105">our</w><n> </n><w o="109">marshes</w><n>,</n><n> </n><w o="118">when</w><n> </n><w o="123">we</w><n> </n><w o="126">c"""

In [10]:
print snippet

w o="94">home</w><n>.</n></toks></s><s sid="194" id="GE.c15.s194" eid="456"><txt>Beyond town, we found a heavy mist out, and it fell wet and thick.</txt><toks><w o="0">Beyond</w><n> </n><w o="7">town</w><n>,</n><n> </n><w o="13">we</w><n> </n><w o="16">found</w><n> </n><w o="22">a</w><n> </n><w o="24">heavy</w><n> </n><w o="30">mist</w><n> </n><w o="35">out</w><n>,</n><n> </n><w o="40">and</w><n> </n><w o="44">it</w><n> </n><w o="47">fell</w><n> </n><w o="52">wet</w><n> </n><w o="56">and</w><n> </n><w o="60">thick</w><n>.</n></toks></s><s sid="195" id="GE.c15.s195" eid="457"><txt>The turnpike lamp was a blur, quite out of the lamp's usual place apparently, and its rays looked solid substance on the fog.</txt><toks><w o="0">The</w><n> </n><w o="4">turnpike</w><n> </n><w o="13">lamp</w><n> </n><w o="18">was</w><n> </n><w o="22">a</w><n> </n><w o="24">blur</w><n>,</n><n> </n><w o="30">quite</w><n> </n><w o="36">out</w><n> </n><w o="40">of</w><n> </n><w o="43">the</w><n> </n><w o="47">lamp

In [11]:
# idea: use a tokenmerger?

In [12]:
xmlstring



In [86]:
# concordance = build_concordance("lamp", 25)
print len(concordance)

94


In [88]:
for line in concordance[55:56]:
    print concordance.index(line), line[0], line[1], "########", line[2][:100]

55 [[0, 3603, 19274, 15292, 15292]] pike lamp was a blur, quite out of the lamp's usual p ######## As I was getting too big for Mr. Wopsle's great-aunt's room, my education under that preposterous fe


In [41]:
for line in concordance[0:1000]:
    print concordance.index(line)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150


In [26]:
def build_concordance_with_locations(term, context, max_hits):
    
    query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx any "{}" )""".format(term))
    result_set = db.search(session, query)

    concordance = []
    
    count = 0
    
    for result in result_set:
    
        if count < max_hits:
            record = result.fetch_record(session)
            tree = record.get_dom(session)
            text_nodes = tree.xpath('//txt/text()')
            text_only = ' '.join(sentence for sentence in text_nodes)

            for hit in result.proxInfo:
                if count < max_hits:
                    count +=1 
                
                    word_id = hit[0][1]  
                    char_location = hit[0][2]
                    concordance_line = text_only[char_location - context : char_location + len(term) + context]
                    #NOTE in these cases record.process_xpath(session, xpath) is not faster
                    #TODO check there is only one result
                    sentence = tree.xpath('/div/descendant::w[%i]/ancestor-or-self::s/@id' % int(word_id + 1))
                    paragraph = tree.xpath('/div/descendant::w[%i]/ancestor-or-self::p/@id' % int(word_id + 1))
                    concordance.append((concordance_line, sentence[0], paragraph[0]))
    
    return concordance

In [27]:
%timeit build_concordance_with_locations("fog", 25, 100)

1 loops, best of 3: 560 ms per loop


In [None]:
%timeit build_concordance_with_locations("the", 25, 100)

In [10]:
%timeit build_concordance_with_locations("the", 25, 1000)

1 loops, best of 3: 858 ms per loop


In [None]:
%prun build_concordance_with_locations("the", 25)

In [9]:
%timeit build_concordance_with_locations("the", 25, 10000)

1 loops, best of 3: 6.36 s per loop


In [None]:
%timeit build_concordance_with_locations("the", 25, 100000)

In [None]:
%timeit -n1 concordance = build_concordance_with_locations("the", 25, 1000000)

In [23]:
concordance = build_concordance_with_locations("dense fog", 25, 1000)

In [24]:
len(concordance)

3

In [25]:
for line in concordance:
    print line

('rnoon is rawest, and the dense fog is densest, and the mudd', 'BH.c1.s17', 'BH.c1.p4')
('r had been filled with a dense fog, which, clearing away in', 'BR.c26.s62', 'BR.c26.p21')
('en midnight. There was a dense fog too; as if it were a cit', 'MC.c8.s141', 'MC.c8.p60')


# Testing and learning stuff

In [None]:
# coding: utf-8

import os

from cheshire3.baseObjects import Session
from cheshire3.document import StringDocument
from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer   

session = Session()
session.database = 'db_dickens'
serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
db = serv.get_object(session, session.database)
qf = db.get_object(session, 'defaultQueryFactory')
resultSetStore = db.get_object(session, 'resultSetStore')
idxStore = db.get_object(session, 'indexStore')

In [None]:
query = qf.get_query(session, '(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx any "fog")')
result_set = db.search(session, query)

A result in a resultset refers to a recordStore
(which in the case of the chapter-idx are chapters)

In [None]:
result = result_set[0]
print result

The occurences value is wrong, from what I can gather.

In [None]:
result.occurences

The values in proxInfo are not sorted

In [None]:
result.proxInfo[:15]

From the result one can get an actual result
which we can use to get_xml (as a string)
or get_dom (an XML tree).

In [None]:
record = result.fetch_record(session)

In [None]:
record.byteCount

In [None]:
?record.fetch_proxVector(session)

In [None]:
record.get_xml(session)

In [None]:
tree = record.get_dom(session)

## Types of XPATH queries needed for the concordance

In [None]:
%timeit tree.xpath("//txt/text()")
one = tree.xpath("//txt/text()")
len(one)

In [None]:
%timeit tree.xpath("/div/p/s/txt/text()")
two = tree.xpath("/div/p/s/txt/text()")
len(two)

If one has the eid and the character ofset (which
is also part of the w elements!) one can the actual
word that was a hit, but one can also go a bit further
and get the sentence and paragraphs id's.

In [None]:
tree.xpath("/div/p/s[@eid=183]/toks/w[@o=215]/text()")

In [None]:
tree.xpath("/div/p/s[@eid=183]/toks")

In [None]:
tree.xpath("/div/p/s[@eid=183]/toks/w/text()")

In [None]:
tree.xpath("/div/p/self::s[@eid=183]/toks/w[@o=215]")

In [None]:
tree.xpath("/div/p/s/toks/w[@o=215]")

Three ways to get the book, chapter and sentence id:

In [None]:
%timeit tree.xpath("//p/s[@eid=183]/@id")
tree.xpath("//p/s[@eid=183]/@id")

In [None]:
%timeit tree.xpath("//*[@eid=183]/@id")
# see how slow this is!
tree.xpath("//*[@eid=183]/@id")

In [None]:
%timeit tree.xpath("//s[@eid=183]/@id")
tree.xpath("//s[@eid=183]/@id")

If one creates a smaller tree, the computations 
are more efficient.

In [None]:
s = tree.xpath("//p/s[@eid=%i]" % 183)

In [None]:
s

In [None]:
print s[0]

In [None]:
s[0].xpath("self::s/@id")

In [None]:
s[0].xpath("attribute::id")

In [None]:
%timeit tree.xpath("//s[@eid=183]/ancestor::p/@id")
tree.xpath("//s[@eid=183]/ancestor::p/@id")

In [None]:
%timeit s[0].xpath("ancestor::p/@id")
s[0].xpath("ancestor::p/@id")

In [None]:
sentence = tree.xpath("//p/s[@eid=%i]" % 1)    # format: ['BH.c1.s93']
sentence

# Searching for fog

In [None]:
query = qf.get_query(session, '(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx any "fog")')
result_set = db.search(session, query)

In [None]:
result = result_set[0]
proxinfo = result.proxInfo[:15]

from pprint import pprint  # to "prettry print" a nested list nicely
pprint(proxinfo)

In [None]:
record.get_xml(session)[6409:8000]

In [None]:
tree = record.get_dom(session)

In [None]:
prox1 = proxinfo[0][0][0]
prox2 = proxinfo[0][0][1]
prox3 = proxinfo[0][0][2]
prox4 = proxinfo[0][0][3]

for prox in (prox1, prox2, prox3, prox4): print prox

In [None]:
# output is a boolean
tree.xpath('/div/p/s/toks/w/@o=%i' % prox1)

In [None]:
w = tree.xpath('/div/p/s/toks/w[@o=%i]/text()' % prox1)
print w
# o = 0 in this case; this is interesting to see
# quickly what words sentences start with. 

In [None]:
w = tree.xpath('/div/descendant::w[%i]' % prox2)
print w

In [None]:
w = tree.xpath('/div/descendant::w[%i]/text()' % int(prox2 + 1))
print w

In [None]:
tree.xpath('/div/descendant::w[%i]/text()' % int(prox2 + 1))

In [None]:
tree.xpath('/div/descendant::w[%i]/ancestor-or-self::s/@id' % int(prox2 + 1))

In [None]:
tree.xpath('/div/descendant::w[%i]/ancestor-or-self::p/@id' % int(prox2 + 1))

In [None]:
#TODO time
record.process_xpath(session, '//*[@eid="%d"]/following::w[%d+1]/ancestor-or-self::s' % (prox1, prox2))

In [None]:
def build_concordance_with_location(term, context):
    
    query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx any "{}" )""".format(term))
    result_set = db.search(session, query)

    concordance = []
        
    for result in result_set:
        record = result.fetch_record(session)
        tree = record.get_dom(session)
        text_nodes = tree.xpath('//txt/text()')
        text_only = ' '.join(sentence for sentence in text_nodes)
            
        for hit in result.proxInfo:
            word_id = hit[0][1]  
            char_location = hit[0][2]
            concordance_line = text_only[char_location - context : char_location + len(term) + context]
            #NOTE in these cases record.process_xpath(session, xpath) is not faster
            #TODO check there is only one result
            sentence = tree.xpath('/div/descendant::w[%i]/ancestor-or-self::s/@id' % int(word_id + 1))
            paragraph = tree.xpath('/div/descendant::w[%i]/ancestor-or-self::p/@id' % int(word_id + 1))
            concordance.append((concordance_line, sentence[0], paragraph[0]))
    
    return concordance

In [None]:
%timeit -n1 concordance = build_concordance_with_locations("the", 25, 1000000)

In [None]:
# coding: utf-8

import os

from cheshire3.baseObjects import Session
from cheshire3.document import StringDocument
from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer   

session = Session()
session.database = 'db_dickens'
serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
db = serv.get_object(session, session.database)
qf = db.get_object(session, 'defaultQueryFactory')
resultSetStore = db.get_object(session, 'resultSetStore')
idxStore = db.get_object(session, 'indexStore')

In [None]:
%timeit build_concordance_with_locations("the", 25, 100000)

In [None]:
term = 'fog'
"""(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx any "{}" )""".format(term)


In [None]:
query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx any "{}" )""".format(term))
result_set = db.search(session, query)
len(result_set)

In [None]:
#build_concordance(result_set, 'we', 25)
# %timeit build_concordance(result_set, 'the', 25)
# option 1: get chapter as string
# option 2: get sentence as string with location info as well.

#TODO implement eid for quotes and suspensions
#TODO implement start and end result for pagination

In [None]:
the_location_dickens = []
def test():
    for result in result_set:
        proxinfo = result.proxInfo
        record = result.fetch_record(session)
        tree = record.get_dom(session)
        for hit in proxinfo:
            word_location = hit[0][1]
            char_location = hit[0][2]
            word_location = tree.xpath("/div/p/s/toks/w[@o=215]")
            # word_location = tree.xpath('//w[%i]' % word_location)
            the_location_dickens.append(word_location)
            #location = tree.xpath('//w[word_location]')
            #the_in_dickens.append([concordance_line, location])
#%timeit test()
test()
len(the_location_dickens)
# option 1: merge all txt together and then do xpath to get the location
# option 2: get txt with location info as well.

In [None]:
for result in result_set:
    proxinfo = result.proxInfo
    record = result.fetch_record(session)
    tree = record.get_dom(session)
    tree.xpath('//w[1]')

In [None]:
the_in_dickens = []
for result in result_set:
    proxinfo = result.proxInfo
    record = result.fetch_record(session)
    tree = record.get_dom(session)
    raw_tree = tree.xpath('//txt/text()')
    raw_str = ' '.join(sentence for sentence in raw_tree)
    for hit in proxinfo:
        char_location = hit[0][2]
        concordance_line = raw_str[char_location-20:char_location+len("the")+20]
        the_in_dickens.append(concordance_line)


for result in result_set:
    proxinfo = result.proxInfo
    record = result.fetch_record(session)
    tree = record.get_dom(session)
    raw_tree = tree.xpath('//txt/text()')
    raw_str = ' '.join(sentence for sentence in raw_tree)
    for hit in proxinfo:
        char_location = hit[0][2]
        print raw_str[char_location-20:char_location+len("the")+20]
        
for hit in proxinfo:
    char_location = hit[0][2]
    print raw_str[char_location-20:char_location+len("the")+20]        






Specs / Questions / Todos
-------------------------

* what if the search terms are more complex? how do you then do their len()?
* handle quotes etc.
* doing pagination
* adding query builder
* highlighting in a form if a word is frequent and it will thus take some time
* searches for more than a word (either a phrase or an or search)
* a transformer somewhere?


# In quotes

In [125]:
def build_concordance_with_locations(term, idx, context, max_hits):
    
    query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.{} any/proxinfo "{}" )""".format(idx, term))
    result_set = db.search(session, query)

    concordance = []
    
    count = 0
    
    for result in result_set:
    
        if count < max_hits:
            record = result.fetch_record(session)
            tree = record.get_dom(session)
            text_nodes = tree.xpath('//txt/text()')
            text_only = ' '.join(sentence for sentence in text_nodes)

            for hit in result.proxInfo:
                if count < max_hits:
                    count +=1 
                
                    element_id = hit[0][0]
                    word_id = hit[0][1]  
                    char_location = hit[0][2]
                    
                    ## the eid only differs from 0 for indexes other than the chapter index
                    if element_id:
                        el = tree.xpath('//*[@eid="{}"]'.format(element_id))[0]
                        el_char_offset = el.get('offset')
                        word_id = el.get('wordOffset')
                        char_location = int(el_char_offset) + char_location - 1
                    
                    concordance_line = text_only[char_location - context : char_location + len(term) + context]
                    #NOTE in these cases record.process_xpath(session, xpath) is not faster
                    #TODO check there is only one result
                    sentence = tree.xpath('/div/descendant::w[{}]/ancestor-or-self::s/@id'.format(int(word_id) + 1))
                    paragraph = tree.xpath('/div/descendant::w[{}]/ancestor-or-self::p/@id'.format(int(word_id) + 1))
                    concordance.append((concordance_line, sentence[0], paragraph[0]))
    
    return concordance

In [135]:
concordance = build_concordance_with_locations("fog", "quote-idx", 25, 100)

In [136]:
for line in concordance:
    print line[0]

eard of such a thing. "A fog, miss," said the young g
h it on my account. "The fog is very dense indeed!" s
're choking!' 'It's this fog,' returned Edwin; 'and i
ar in an hour or two. We can have dinner in from just
t is in the wind besides fog?' 'Mr. Drood,' said Bazz
nter. Mrs Quilp obeyed right willingly, and, kneeling
old, cold night, and the fog clings so.' As Miss Abbe
s summut run down in the fog, ma'am,' answered Bob. '
nd that's what makes the fog and the noise worse, don
oem--what is that name?--Fog--Perspiring Fog--ver goo
t name?--Fog--Perspiring Fog--ver good--ver good inde


In [13]:
build_concordance_with_locations("fog", "quote-idx", 25, 100)

[('', 'BH.c3.s1', 'BH.c3.p1'),
 ('', 'BH.c4.s1', 'BH.c4.p1'),
 ('', 'ED.c11.s1', 'ED.c11.p1'),
 ('e certain gabled houses some centuries of age still s',
  'ED.c11.s1',
  'ED.c11.p1'),
 ('nd the most ancient part of Holborn, London, where ce',
  'ED.c11.s1',
  'ED.c11.p1'),
 (' some accounts--an occupation to which the silence an',
  'OCS.c67.s1',
  'OCS.c67.p1'),
 ('e Jew once more came forth into Saint Mary Axe. But t',
  'OMF.c35.s1',
  'OMF.c35.p1'),
 ('he evening of this same foggy day when the yellow win',
  'OMF.c35.s1',
  'OMF.c35.p1'),
 ('e yellow window- blind of Pubsey and Co. was drawn do',
  'OMF.c35.s1',
  'OMF.c35.p1'),
 ('st of them, on the third morning after the election h',
  'PP.c15.s1',
  'PP.c15.p1'),
 ('he third morning after the election had terminated, w',
  'PP.c15.s1',
  'PP.c15.p1')]

In [14]:
build_concordance_with_locations("fog", "non-quote-idx", 25, 100)

[('to loom by husbandman and ploughboy. Most of the shop',
  'BH.c1.s14',
  'BH.c1.p2'),
 ('er weather. As much mud in the streets as if the wate',
  'BH.c1.s4',
  'BH.c1.p1'),
 ('ll. Implacable November weather. As much mud in the s',
  'BH.c1.s4',
  'BH.c1.p1'),
 (', and you must be patient with me, like a dear!" And ',
  'BH.c3.s3',
  'BH.c3.p1'),
 ('yby," said Mr. Kenge, standing with his back to the f',
  'BH.c4.s6',
  'BH.c4.p3'),
 ('iting), and with his quick abilities, his good spirit',
  'BH.c17.s1',
  'BH.c17.p1'),
 (' looked straight before him. "Mr. Vholes," said my gu',
  'BH.c45.s16',
  'BH.c45.p8'),
 ('should like to know," said my guardian, "what you thi',
  'BH.c45.s19',
  'BH.c45.p10'),
 ("ar this, Varden?' said Mr Haredale. 'Well! You and sh",
  'BR.c26.s2',
  'BR.c26.p1'),
 ('o idea of it but whistling. Peggotty had a basket of ',
  'DC.c3.s4',
  'DC.c3.p1'),
 ('to have that lofty castle to myself, and to feel, whe',
  'DC.c24.s1',
  'DC.c24.p1'),
 ('shing voices we

In [86]:
query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.{} any "{}" )""".format("quote-idx", "fog"))
result_set = db.search(session, query)

In [34]:
for result in result_set:
    print result.proxInfo

[[[636, 1, 3, 9178]]]
[[[94, 1, 5, 9178]]]
[[[89, 2, 11, 9178]], [[130, 7, 78, 9178]], [[54, 6, 29, 9178]]]
[[[106, 28, 473, 9178]]]
[[[395, 32, 154, 9178]], [[476, 6, 29, 9178]], [[517, 12, 70, 9178]]]
[[[696, 30, 191, 9178]], [[696, 32, 207, 9178]]]


In [40]:
one_result = result_set[0]

In [45]:
one_fetched_result = one_result.fetch_record(session)

In [146]:
dom = one_fetched_result.get_dom(session)

In [147]:
qs = dom.xpath('//*[@eid="1"]')

In [151]:
qs[0].attrib

{'type': 'speech', 'pid': '1', 'id': 'BH.c3.p1', 'eid': '1'}

In [113]:
dom_text_nodes = dom.xpath('//txt/text()')
dom_text_only = ' '.join(sentence for sentence in dom_text_nodes)

In [114]:
dom_text_only[30639+2:30700]

'fog, miss," said the young gentleman. "Oh, indeed!" said I.'

In [105]:
second_result = result_set[1]
print second_result.proxInfo
second_fetched_result = second_result.fetch_record(session)
second_dom = second_fetched_result.get_dom(session)
second_qs = second_dom.xpath('//qs[@eid="94"]')[0]
second_qs.attrib

[[[94, 1, 5, 9178]]]


{'wordOffset': '589', 'eid': '94', 'offset': '3259'}

In [106]:
second_dom_text_nodes = second_dom.xpath('//txt/text()')
second_dom_text_only = ' '.join(sentence for sentence in second_dom_text_nodes)
second_dom_text_only[3259+4:3400]

'fog is very dense indeed!" said I. "Not that it affects you, though, I\'m sure," said Mr. Guppy, putting up the steps. "On the contrary, i'

In [30]:
query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.{} any "{}" )""".format("chapter-idx", "fog"))
result_set = db.search(session, query)
for result in result_set:
    print result.proxInfo

[[[0, 169, 1033, 15292]], [[0, 171, 1049, 15292]], [[0, 206, 1241, 15292]], [[0, 216, 1295, 15292]], [[0, 247, 1471, 15292]], [[0, 183, 1112, 15292]], [[0, 211, 1267, 15292]], [[0, 223, 1344, 15292]], [[0, 237, 1415, 15292]], [[0, 264, 1574, 15292]], [[0, 283, 1671, 15292]], [[0, 312, 1836, 15292]], [[0, 314, 1846, 15292]], [[0, 336, 1955, 15292]], [[0, 392, 2248, 15292]], [[0, 433, 2499, 15292]], [[0, 449, 2586, 15292]], [[0, 556, 3190, 15292]], [[0, 727, 4181, 15292]], [[0, 2017, 11496, 15292]], [[0, 2365, 13596, 15292]], [[0, 2430, 13942, 15292]]]
[[[0, 5841, 30641, 15292]], [[0, 7479, 39482, 15292]]]
[[[0, 590, 3263, 15292]], [[0, 694, 3814, 15292]], [[0, 4848, 26290, 15292]]]
[[[0, 8, 47, 15292]]]
[[[0, 1988, 11147, 15292]]]
[[[0, 39, 239, 15292]]]
[[[0, 1756, 9305, 15292]], [[0, 1847, 9807, 15292]]]
[[[0, 1076, 5846, 15292]]]
[[[0, 1851, 9818, 15292]]]
[[[0, 170, 914, 15292]]]
[[[0, 149, 802, 15292]]]
[[[0, 990, 5285, 15292]]]
[[[0, 2639, 14398, 15292]]]
[[[0, 18, 87, 15292]]]
[[

In [24]:
query = qf.get_query(session, """(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.{} = "{}" )""".format("chapter-idx", "substance on the fog"))
result_set = db.search(session, query)
print len(result_set)
for result in result_set:
    print result.proxInfo
# record.get_xml(session)


1
[[[0, 3600, 19257, 37577], [0, 3601, 19267, 26551], [0, 3602, 19270, 38822], [0, 3603, 19274, 15292]]]


In [26]:
result = result_set[0]
result = result.fetch_record(session)
record = result.get_xml(session)

In [28]:
record.find('fog')

143708

In [29]:
record[143708:143800]

'fog.</txt><toks><w o="0">The</w><n> </n><w o="4">turnpike</w><n> </n><w o="13">lamp</w><n> <'

In [30]:
type(record)

str

In [31]:
dom = result.get_dom(session)

In [35]:
dom.xpath('//w[@o="19257"]')

[]

In [39]:
xml_to_clean_txt(dom)[19257:19500]

"p was a blur, quite out of the lamp's usual place apparently, and its rays looked solid substance on the fog. We were noticing this, and saying how that the mist rose with a change of wind from a certain quarter of our marshes, when we came up"

In [37]:
def xml_to_clean_txt(xmltree):
    text_nodes = xmltree.xpath('//txt/text()')
    text_only = ' '.join(sentence for sentence in text_nodes)
    return text_only  

In [40]:
xml_to_clean_txt(dom)[19000
                      :19500]

"ded I could only induce one to have the weakness to become my benefactor. It was a very dark night when it was all over, and when I set out with Mr. Wopsle on the walk home. Beyond town, we found a heavy mist out, and it fell wet and thick. The turnpike lamp was a blur, quite out of the lamp's usual place apparently, and its rays looked solid substance on the fog. We were noticing this, and saying how that the mist rose with a change of wind from a certain quarter of our marshes, when we came up"