# Cheshire queries

In [1]:
# coding: utf-8

import os

from cheshire3.baseObjects import Session
from cheshire3.document import StringDocument
from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer   

# Create a Cheshire3 session and point it at the 'db_dickens' database.
session = Session()
session.database = 'db_dickens'
# Build the server from serverConfig.xml located under <cheshire3Root>/configs.
serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
# Fetch the database object using the identifier stored on the session above.
db = serv.get_object(session, session.database)
# Query factory; try_query calls qf.get_query() on every query string.
qf = db.get_object(session, 'defaultQueryFactory')
# NOTE(review): resultSetStore and idxStore are not referenced by any visible
# cell in this notebook -- confirm whether they are still needed.
resultSetStore = db.get_object(session, 'resultSetStore')
idxStore = db.get_object(session, 'indexStore')

## Define helper functions

In [2]:
def count_total(result_set):
    """
    Return the total number of hits in ``result_set``.

    Every result contributes ``len(result.proxInfo)`` to the total,
    so an empty result set yields 0.
    """
    return sum(len(hit.proxInfo) for hit in result_set)

def try_query(query):
    """
    Run the query string ``query`` and return its total hit count.

    The string is turned into a query object by ``qf.get_query``, the
    database ``db`` is searched with it, and the resulting result set
    is passed to ``count_total``.
    """
    return count_total(
        db.search(session, qf.get_query(session, query))
    )

# Basic queries

## Or search: any/proxinfo
Matches any of the terms in the search string.

In [3]:
# OR search on the chapter index inside the "dickens" subcorpus. The relation
# carries /proxinfo; count_total sums len(result.proxInfo) for each result.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx any/proxinfo "dense fog")
           """
           )

142

Note that the `/proxinfo` bit is important. Without it the results would be wrong.

In [4]:
# Same OR search, but with a bare `any` relation (no /proxinfo modifier),
# shown for comparison.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx any "dense fog")
           """
           )

107

## Phrase search: =
gets the exact phrase

In [5]:
# Phrase search: the `=` relation with the two-word term "dense fog".
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx = "dense fog")
           """
           )

3

## And search: all/proxinfo

In [6]:
# AND search: the all/proxinfo relation with the terms "dense fog".
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx all/proxinfo "dense fog")
           """
           )

44

or in reverse order:

In [7]:
# Same all/proxinfo search with the two terms swapped ("fog dense").
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx all/proxinfo "fog dense")
           """
           )

44

Note that again the `/proxinfo` bit is important. Without it the results would be wrong.

In [8]:
# Bare `all` relation (no /proxinfo modifier), shown for comparison with the
# all/proxinfo searches.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx all "fog dense")
           """
           )

35

## Not search: not

If we want to know where *fog* occurs without *dense*:

In [42]:
# "fog" in the chapter index, with the `not` boolean removing matches for "dense".
# NOTE(review): this cell's execution count (In [42]) is out of sequence with
# its neighbours -- re-run the notebook top to bottom before relying on it.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx = "fog" 
           not c3.chapter-idx = "dense")
           """
           )

59

If we want to know where *dense* occurs without *fog*:

In [43]:
# Mirror image of the fog-without-dense query: "dense" with `not` "fog".
# NOTE(review): this cell's execution count (In [43]) is out of sequence with
# its neighbours -- re-run the notebook top to bottom before relying on it.
try_query("""
           (c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx = "dense" not c3.chapter-idx = "fog")
           """
           )

39

fog with dense:

In [10]:
# Both terms required (`all` relation).
# NOTE(review): unlike the other cells, the boolean modifier is spelled
# `and/proxinfo` (no `cql.` prefix) and the relation is a bare `all` --
# confirm this is intentional.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/proxinfo c3.chapter-idx all "fog dense")
           """
           )

35

*fog* without *dense* (59) plus *fog* with *dense* (35) adds up to the total number of occurrences of *fog* (94). Note that the search resolution is the chapter.

# Simple combinations of these operators

Search for several colors but not black or white

In [11]:
# any/proxinfo over three colour terms, then a `not` clause with
# any/proxinfo over "black white".
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.chapter-idx any/proxinfo "grey yellow brown"
           not c3.chapter-idx any/proxinfo "black white") 
           """
           )

198

**FIXME:** the query below is currently rejected as malformed (see the diagnostic in its output).
Search for EITHER several colors but not black or white OR several titles but not sir.

In [13]:
# FIXME: the server rejects this query with a "Malformed Query" diagnostic
# pointing at c3.chapter-idx.
# Each parenthesised group starts with `any/cql.proxinfo` directly before the
# index name, whereas every working query in this notebook uses the clause
# shape `index relation "terms"`.
# NOTE(review): presumably those leading `any/cql.proxinfo` tokens should be
# dropped and the modifier moved onto the `and` / `or` booleans -- confirm
# against the CQL grammar before changing the query.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           
           and (any/cql.proxinfo c3.chapter-idx any/proxinfo "grey yellow brown"
           not c3.chapter-idx any/proxinfo "black white")
           
           or (any/cql.proxinfo c3.chapter-idx any/proxinfo "gentleman gentlemen woman man boy"
           not c3.chapter-idx = "sir")
           )
           """
           )

ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 11))



Diagnostic: info:srw/diagnostic/1/10 [Malformed Query]: c3.chapter-idx

# Searching the entire corpus

In [14]:
# Occurrences of "fog" in the entire corpus (no subcorpus restriction)
try_query("""(c3.chapter-idx = "fog")""")

182

In [15]:
# Count "fog" in the "dickens" subcorpus and keep the value so it can be
# summed with the "ntc" count further down.
fog_in_dickens = try_query("""(c3.subcorpus-idx all "dickens" and/cql.proxinfo c3.chapter-idx = "fog")""")
# Parenthesised so the line also parses on Python 3; with a single argument
# it prints exactly the same text on Python 2.
print(fog_in_dickens)

94


In [16]:
# Count "fog" in the "ntc" subcorpus and keep the value so it can be
# summed with the "dickens" count further down.
fog_in_ntc = try_query("""(c3.subcorpus-idx all "ntc" and/cql.proxinfo c3.chapter-idx = "fog")""")
# Parenthesised so the line also parses on Python 3; with a single argument
# it prints exactly the same text on Python 2.
print(fog_in_ntc)

88


In [17]:
# Sanity check: the two subcorpus counts should add up to the corpus-wide
# "fog" count obtained from the unrestricted query.
fog_in_dickens + fog_in_ntc

182

## Searching in a specific book

In [18]:
# Restrict to a single book via c3.book-idx; the boolean carries /proxinfo.
try_query('c3.book-idx = "BH" and/proxinfo c3.chapter-idx = "fog"')

32

In [19]:
# Three clauses joined by plain `and` (no proxinfo modifier); chapter clause last.
try_query('c3.subcorpus-idx = "dickens" and c3.book-idx = "BH" and c3.chapter-idx = "fog"')

0

Note the subtle difference between the previous query and the next one: only the order of the book and chapter clauses changes, yet the counts differ.

In [20]:
# Same three clauses joined by plain `and`, with the chapter clause placed
# before the book clause.
try_query('c3.subcorpus-idx = "dickens" and c3.chapter-idx = "fog"  and c3.book-idx = "BH"')

32

Again the proxinfo is really important:

In [21]:
# Book and chapter clauses joined by a plain `and` (no /proxinfo modifier).
try_query('c3.book-idx = "BH" and c3.chapter-idx = "fog"')

0

In [22]:
# Same and/proxinfo shape as the book + chapter query, but searching
# c3.sentence-idx instead of c3.chapter-idx.
try_query('c3.book-idx = "BH" and/proxinfo c3.sentence-idx = "fog"')

32

# Different indexes can be combined

In [23]:
# Mixes two indexes: "dense" in the quote index, `not` "fog" in the chapter index.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.quote-idx = "dense" 
           not c3.chapter-idx = "fog")
           """
           )

2

In [24]:
# Quote index, OR search: /cql.proxinfo on both the boolean (and) and the
# relation (any).
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.quote-idx any/cql.proxinfo "dense fog")
           """
           )

14

In [25]:
# Quote index, OR search: /cql.proxinfo on the boolean only; bare `any` relation.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.quote-idx any "dense fog")
           """
           )

13

In [26]:
# Quote index, OR search: no proxinfo modifier on either the boolean or the relation.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and c3.quote-idx any "dense fog")
           """
           )

13

In [27]:
# Quote index, OR search: /cql.proxinfo on the relation only; plain `and` boolean.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and c3.quote-idx any/cql.proxinfo "dense fog")
           """
           )

14

In [28]:
# Find all instances where both a quote and a long suspension contain the word 'said'.
# These *could* be cases where one character is quoting another.
# Both clauses use a bare `any` relation and are joined with and/cql.proxinfo.

try_query("""
           (c3.subcorpus-idx all "dickens" 
           and/cql.proxinfo c3.quote-idx any "said"
           and/cql.proxinfo c3.longsus-idx any "said")
           """
           )

4520

In [29]:
# Variant of the quote + long-suspension query using the `=` relation; only
# the second boolean carries /cql.proxinfo.
try_query("""
           (c3.subcorpus-idx all "dickens" 
           and c3.quote-idx = "said"
           and/cql.proxinfo c3.longsus-idx = "said")
           """
           )

3094

In [44]:
# NOTE(review): `result` is not assigned in any visible cell, so this fails on
# Restart & Run All unless it is defined elsewhere -- looks like leftover
# debugging state (execution count In [44] is also out of sequence).
result

Ptr:recordStore/0

In [45]:
# NOTE(review): `record` is not assigned in any visible cell, so this fails on
# Restart & Run All unless it is defined elsewhere -- looks like leftover
# debugging state (execution count In [45] is also out of sequence).
record

recordStore/835