In [1]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import nltk
#from nltk.stem import *
#nltk.download("wordnet")
from whoosh import scoring


In [2]:
# Root folder of the test collection; the collection paths below are derived from it.
DATA_DIR = "government"

# Documents folder, topic file and qrels file, all located directly under DATA_DIR.
DOCUMENTS_DIR, TOPIC_FILE, QRELS_FILE = (
    os.path.join(DATA_DIR, entry)
    for entry in ("documents", "gov.topics", "gov.qrels")
)

# Path of the trec_eval executable, relative to the working directory.
# NOTE(review): the ".exe" suffix looks Windows-specific — confirm before running elsewhere.
TREC_EVAL = os.path.join("trec_eval", "trec_eval.exe")

## Question 1


### Q1 (a): Provide answer to Q1 (a) here [markdown cell]
'P_5'

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]
People searching government websites usually have a specific purpose, which makes them different from general web searchers; for them, whether the relevant documents are retrieved at all is more important than their exact ranking.

## Question 2

### Q2 (a): Write your code below

### Creating the index

In [4]:
def createIndex(schema):
    """Create a new index for `schema` inside a fresh temporary directory.

    The directory is obtained from tempfile.mkdtemp() and is not removed by
    this function. The return value is whatever index.create_in() returns
    for that directory and schema.
    """
    return index.create_in(tempfile.mkdtemp(), schema)

In [5]:
# Baseline schema: the file path is stored with each document, and the
# content field is analyzed with a bare RegexTokenizer (no further filters).
mySchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=RegexTokenizer()),
)

# Build the index for this schema; createIndex places it in a new temporary directory.
myIndex = createIndex(mySchema)

### Indexing the documents

In [6]:
def addFilesToIndex(indexObj, fileList, reportEvery=1000):
    """Add every file in `fileList` to `indexObj`, one document per file.

    Each file is read as UTF-8 text and added with the fields `file_path`
    (the path exactly as given) and `file_content` (the full text).

    Args:
        indexObj: index object handed to writing.BufferedWriter.
        fileList: iterable of file paths to read and index.
        reportEvery: print a progress line after every `reportEvery`
            documents; pass 0 or None to disable progress output.

    The writer is closed even when reading or indexing a file raises.
    """
    # open writer
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # Count from 1 so the modulo check fires on the 1000th, 2000th, ... document.
        for docCount, filePath in enumerate(fileList, start=1):
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()
            writer.add_document(file_path=filePath,
                                file_content=fileContent)

            # `docNum+1 % 1000` would bind as `docNum + (1 % 1000)` and never
            # equal 0, so the running count is kept in its own variable.
            if reportEvery and docCount % reportEvery == 0:
                print("already indexed:", docCount)
        print("done indexing.")

    finally:
        # close the writer
        writer.close()

In [7]:
# Collect, as strings, the paths of all regular files found at any depth below DOCUMENTS_DIR
filesToIndex = list(map(str, filter(Path.is_file, Path(DOCUMENTS_DIR).glob("**/*"))))

In [8]:
# Check the list
#filesToIndex[:5]

In [9]:
# Sanity check: report how many file paths were collected for indexing
print("number of files:", len(filesToIndex))

number of files: 4078


In [10]:
# Add every collected file to the baseline index (the one exported as INDEX_Q2)
addFilesToIndex(myIndex, filesToIndex)

done indexing.


### Querying

In [11]:
# Searcher over the baseline index, and a parser that builds queries against
# the "file_content" field using that index's schema
mySearcher = myIndex.searcher()
myQueryParser = QueryParser("file_content", schema=myIndex.schema)

### Evaluation using TREC_EVAL

In [14]:
# Keep the first 10 lines of the qrels file (fewer if the file is shorter) for inspection
with open(QRELS_FILE, "r") as f:
    qrels10 = [line for _, line in zip(range(10), f)]
    #print("".join(qrels10))

In [15]:
def trecEval(topicFile, qrelsFile, queryParser, searcher):
    """Run every topic through `searcher` and print the trec_eval report.

    Args:
        topicFile: path to a text file with one topic per line, formatted as
            "<topic id> <search phrase>"; blank lines are ignored.
        qrelsFile: path to the qrels file, passed to trec_eval unchanged.
        queryParser: object whose parse(phrase) builds the query for a topic.
        searcher: object whose search(query, limit=None) returns the hits.

    The hits are written to a temporary run file, one line per hit:
    "<topic id> Q0 <document file name> <rank> <score> test", with ranks
    starting at 0. TREC_EVAL is then invoked with '-q' on the qrels file and
    the run file, and its standard output is printed. The run file is deleted
    afterwards, even if searching or the evaluation raises.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation
    with open(topicFile, "r") as tf:
        # Blank lines carry no topic; skipping them avoids an unpacking error below
        topics = [line for line in tf.read().splitlines() if line.strip()]

    # mkstemp returns an already-open OS-level descriptor together with the path;
    # wrapping the descriptor with os.fdopen guarantees that it gets closed.
    fd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as outputTRECFile:
            for topic in topics:
                topic_id, topic_phrase = topic.split(" ", 1)
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)

                for rank, hit in enumerate(topicResults):
                    score = topicResults.score(rank)
                    outputTRECFile.write("%s Q0 %s %d %f test\n" % (topic_id, os.path.basename(hit["file_path"]), rank, score))

        # The run file is closed (and therefore flushed) before trec_eval reads it
        evalRun = subprocess.run([TREC_EVAL, '-q', qrelsFile, tempOutputFile], stdout=subprocess.PIPE)
        print(evalRun.stdout.decode())
    finally:
        # Do not leave one temporary run file behind per evaluation
        os.remove(tempOutputFile)

In [16]:
# Evaluate the baseline parser/searcher pair; trecEval prints the trec_eval report
trecEval(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher) 

num_ret               	1	1
num_rel               	1	5
num_rel_ret           	1	0
map                   	1	0.0000
Rprec                 	1	0.0000
bpref                 	1	0.0000
recip_rank            	1	0.0000
iprec_at_recall_0.00  	1	0.0000
iprec_at_recall_0.10  	1	0.0000
iprec_at_recall_0.20  	1	0.0000
iprec_at_recall_0.30  	1	0.0000
iprec_at_recall_0.40  	1	0.0000
iprec_at_recall_0.50  	1	0.0000
iprec_at_recall_0.60  	1	0.0000
iprec_at_recall_0.70  	1	0.0000
iprec_at_recall_0.80  	1	0.0000
iprec_at_recall_0.90  	1	0.0000
iprec_at_recall_1.00  	1	0.0000
P_5                   	1	0.0000
P_10                  	1	0.0000
P_15                  	1	0.0000
P_20                  	1	0.0000
P_30                  	1	0.0000
P_100                 	1	0.0000
P_200                 	1	0.0000
P_500                 	1	0.0000
P_1000                	1	0.0000
num_ret               	10	16
num_rel               	10	1
num_rel_ret           	10	1
map                   	10	0.1667
Rprec                 	10	0.0000


In [19]:
INDEX_Q2 = myIndex # index used for Q2 (baseline schema)
QP_Q2 = myQueryParser # query parser used for Q2
SEARCHER_Q2 = mySearcher # searcher used for Q2

### Q2 (b): Provide answer to Q2 (b) here [markdown cell]
P_5: 0.0714.

This is not good: it means that, on average, only about 0.36 relevant documents are retrieved in the top 5 results. Since there is at least one relevant document for each query, the average P_5 should have been around 0.2 (1/5).

### Q2 (c): Provide answer to Q2(c) here [markdown cell]
Bad cases: even though there is more than one relevant document, not a single one is retrieved in the top 5.

(ID,  Score):
1. 1,	0.0000 (no of rel are '5');
2. 2,	0.0000 (no of rel are '2');
3. 4,  0.0000 (no of rel are '4');
4. 16, 0.0000 (no of rel are '7');

Good cases: there is only one relevant document and it is in the top 5 results, so the maximum possible P_5 is 0.2.

(ID,  Score): 
1. 14,	0.2000 (no of rel are '1');
2. 18, 0.2000 (no of rel are '1');


## Question 3

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]
False Positive:

For example, let's consider query 2: juvenile delinquency. Six documents are retrieved but none of them is relevant. If you look into one of these documents, G00-22-3396139, you can see that it is not about juvenile crime rates but about which department is responsible for the juvenile crime rate. It is retrieved but not relevant.

False negative:
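If we consider the same example but instead look into the document G00-22-1831204, it is not retrieved and also not relevant. (Strictly speaking, a document that is neither retrieved nor relevant is a true negative; a false negative would be a relevant document that was not retrieved.)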

This means the bag-of-words model is working correctly and is not retrieving documents that do not contain the words at least once. But we could try to decrease its false positives.


### Q3 (b): Write your code below

In [18]:
# Analysis pipeline for Q3: tokenize with a regex, lowercase the tokens,
# drop stop words such as "we", "are", "with", then stem what is left
stmLwrStpAnalyzer = (
    RegexTokenizer()
    | LowercaseFilter()
    | StopFilter()
    | StemFilter()
)
#[token.text for token in stmLwrStpAnalyzer("We are going to do Text Analysis with whoosh.analysis industries industry ")]

In [20]:
# Same two fields as the baseline schema, but the content goes through stmLwrStpAnalyzer
mySchema3 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=stmLwrStpAnalyzer),
)

# Separate index for the Q3 schema, again created in a new temporary directory
myIndex3 = createIndex(mySchema3)

In [21]:
# Index the same file list again, this time into the Q3 index
addFilesToIndex(myIndex3, filesToIndex)

done indexing.


In [22]:
# Searcher over the Q3 index, and a parser for the "file_content" field built
# from the Q3 schema
mySearcher3 = myIndex3.searcher()
myQueryParser3 = QueryParser("file_content", schema=myIndex3.schema)

In [23]:
# Evaluate the Q3 parser/searcher pair on the same topics and qrels
trecEval(TOPIC_FILE, QRELS_FILE, myQueryParser3, mySearcher3) 

num_ret               	1	3
num_rel               	1	5
num_rel_ret           	1	0
map                   	1	0.0000
Rprec                 	1	0.0000
bpref                 	1	0.0000
recip_rank            	1	0.0000
iprec_at_recall_0.00  	1	0.0000
iprec_at_recall_0.10  	1	0.0000
iprec_at_recall_0.20  	1	0.0000
iprec_at_recall_0.30  	1	0.0000
iprec_at_recall_0.40  	1	0.0000
iprec_at_recall_0.50  	1	0.0000
iprec_at_recall_0.60  	1	0.0000
iprec_at_recall_0.70  	1	0.0000
iprec_at_recall_0.80  	1	0.0000
iprec_at_recall_0.90  	1	0.0000
iprec_at_recall_1.00  	1	0.0000
P_5                   	1	0.0000
P_10                  	1	0.0000
P_15                  	1	0.0000
P_20                  	1	0.0000
P_30                  	1	0.0000
P_100                 	1	0.0000
P_200                 	1	0.0000
P_500                 	1	0.0000
P_1000                	1	0.0000
num_ret               	10	42
num_rel               	10	1
num_rel_ret           	10	1
map                   	10	0.2500
Rprec                 	10	0.0000


In [24]:
INDEX_Q3 = myIndex3 # index used for Q3 (stmLwrStpAnalyzer schema)
QP_Q3 = myQueryParser3 # query parser used for Q3
SEARCHER_Q3 = mySearcher3 # searcher used for Q3

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]

I changed the default RegexTokenizer in the baseline system to "stmLwrStpAnalyzer".
stmLwrStpAnalyzer applies the following modifications to both the query and the document content before ranking:
1. It converts all the content to lower case, which makes the words "Nuclear" and "nuclear" equivalent.
2. It removes stop words like "we" and "are", which makes the ranking more relevant.
3. StemFilter stems all the words, so the words "industries" and "industry" both become "industri".

The following are the results after modification

1. num_q                 	all	15 
2. num_ret               	all	453
3. num_rel               	all	35
4. num_rel_ret           	all	15
5. map                   	all	0.3363
6. Rprec                 	all	0.3000
7. P_5                   	all	0.1067

The following are the results before modification

1. num_q                 	all	14
2. num_ret               	all	151
3. num_rel               	all	33
4. num_rel_ret           	all	7
5. map                   	all	0.1971
6. Rprec                 	all	0.1667
7. P_5                   	all	0.0714

The overall improvements have been in areas of Rprec and P_5. 

For Rprec, we can clearly see that the improvement is almost double, from 0.1667 to 0.3. This means that, assuming each query has 5 relevant documents in total, we would on average get 1.5 relevant documents in the top 5 results now, compared to about 0.8 before.

P_5 also increased from 0.0714 to 0.1067. This is a good increase too: not many queries have more than 1 relevant document, so a value of 0.2 for P_5 can be considered excellent, and 0.1067 is a good precision.

MAP also increased from 0.1971 to 0.3363, indicating that the relevant documents also had better ranks; this can be attributed partially to the removal of stop words.

Although there was an overall improvement, if we go back to the same false positive example, the document G00-22-3396139 is again retrieved for topic id 2 but is not relevant. This is a drawback of the bag-of-words approach and could only be addressed by better query phrasing or positional indices.

### Q3 (d): Provide answer to Q3 (d) here [markdown cell]
yes


### Q3 (e): Provide answer to Q3 (e) here [markdown cell]
yes (P_5 for query ids 2 and 4 got better, whereas 26 got worse)



### Q3 (f): Provide answer to Q3 (f) here [markdown cell]

The drawback of the bag-of-words model is that term positions and phrases are not considered; this leads to retrieving a huge number of non-relevant documents (the total retrieved went from 151 to 453) when we do stemming and remove stop words. Considering the bag-of-words model, the above increase has been good. And it is always better to convert all of the content to lower case and remove stop words.

## Question 4 (Graduate Students)

In [27]:
GRAD_STUDENT = True # set to True by grad students (Question 4 is the graduate-student question)

### Q4 (a): Provide answer to Q4 (a) here [markdown cell]

### Q4 (b): Write your code below

In [25]:
# Q4 schema: stored file path plus content analyzed with stmLwrStpAnalyzer.
# NOTE(review): this matches the Q3 schema field for field; if only the scoring
# differs in Q4, the Q3 index could presumably be reused — confirm.
mySchema4 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=stmLwrStpAnalyzer),
)

# Separate index for Q4, created in a new temporary directory
myIndex4 = createIndex(mySchema4)

In [28]:
# Index the same file list into the Q4 index
addFilesToIndex(myIndex4, filesToIndex)

done indexing.


In [31]:
# define a query parser for the field "file_content" in the index
myQueryParser4 = QueryParser("file_content", schema=myIndex4.schema)
#mySearcher4 = myIndex4.searcher(weighting=scoring.TF_IDF())
mySearcher4 = myIndex4.searcher(weighting=scoring.BM25F(B=0.50, K1=0.6))

In [32]:
# Evaluate the Q4 parser/searcher pair on the same topics and qrels
trecEval(TOPIC_FILE, QRELS_FILE, myQueryParser4, mySearcher4) 

num_ret               	1	3
num_rel               	1	5
num_rel_ret           	1	0
map                   	1	0.0000
Rprec                 	1	0.0000
bpref                 	1	0.0000
recip_rank            	1	0.0000
iprec_at_recall_0.00  	1	0.0000
iprec_at_recall_0.10  	1	0.0000
iprec_at_recall_0.20  	1	0.0000
iprec_at_recall_0.30  	1	0.0000
iprec_at_recall_0.40  	1	0.0000
iprec_at_recall_0.50  	1	0.0000
iprec_at_recall_0.60  	1	0.0000
iprec_at_recall_0.70  	1	0.0000
iprec_at_recall_0.80  	1	0.0000
iprec_at_recall_0.90  	1	0.0000
iprec_at_recall_1.00  	1	0.0000
P_5                   	1	0.0000
P_10                  	1	0.0000
P_15                  	1	0.0000
P_20                  	1	0.0000
P_30                  	1	0.0000
P_100                 	1	0.0000
P_200                 	1	0.0000
P_500                 	1	0.0000
P_1000                	1	0.0000
num_ret               	10	42
num_rel               	10	1
num_rel_ret           	10	1
map                   	10	0.3333
Rprec                 	10	0.0000


In [33]:
INDEX_Q4 = myIndex4 # index used for Q4
QP_Q4 = myQueryParser4 # query parser used for Q4
SEARCHER_Q4 = mySearcher4 # searcher used for Q4 (BM25F with B=0.50, K1=0.6)

### Q4 (c): Provide answer to Q4 (c) here [markdown cell]

Before
1. map                   	all	0.3363
2. P_5                   	all	0.1067
3. num_q                 	all	15
4. num_ret               	all	453
5. num_rel               	all	35
6. num_rel_ret           	all	15

After
1. map                   	all	0.3557
2. P_5                   	all	0.1200
3. num_q                 	all	15
4. num_ret               	all	453
5. num_rel               	all	35
6. num_rel_ret           	all	15

I tried an alternative scoring method, TF-IDF, and found no improvement, so instead I tuned the default BM25F by changing its parameters. I could increase both MAP and P_5 by decreasing both B and K1.

### Q4 (d): Provide answer to Q4 (d) here [markdown cell]
yes

### Q4 (e): Provide answer to Q4 (e) here [markdown cell]
No (wrt to question 3) Yes (question 2)

### Q4 (f): Provide answer to Q4 (f) here [markdown cell]

This is a decent improvement: although the overall retrieval was the same, there is a relative improvement of about 12.5% in P_5 (from 0.1067 to 0.1200), and this is definitely useful for the users. Because the queries are distinct and I am considering the average over all of them, we cannot say it is overfitting.