# Assignment 2: IR

## Preparations
* Put all your imports, and path constants in the next cells
* Make sure all your path constants are **relative to** ***DATA_DIR*** and **NOT hard-coded** in your code.

In [0]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget



In [0]:
import os
import wget

# wget.download does not overwrite an existing file: re-running this cell used
# to save a second copy such as "government (2).zip". Only fetch the archive
# when it is not already present.
if not os.path.exists("government.zip"):
    wget.download("https://github.com/MIE451-1513-2019/course-datasets/raw/master/government.zip", "government.zip")

'government (2).zip'

In [0]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [0]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import nltk
from nltk.stem import *
from whoosh.analysis import Filter
from whoosh import scoring
from whoosh import qparser

In [0]:
# Root folder of the extracted dataset; every other path is derived from it.
DATA_DIR = "government"

# Document folder, topic file and relevance-judgement file, all relative to DATA_DIR.
DOCUMENTS_DIR, TOPIC_FILE, QRELS_FILE = (
    os.path.join(DATA_DIR, leaf)
    for leaf in ("documents", "gov.topics", "gov.qrels")
)

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [markdown cell]

Mean Average precision. (map)

### Q1 (b): Provide answer to Q1 (b) here [markdown cell]

MAP is the mean of the average precision over multiple queries. It evaluates search quality across recall levels. Because users of the website want to find as many relevant documents as possible, MAP is a suitable measure.

## Question 2

### Q2 (a): Write your code below

In [0]:
def createIndex(schema):
    """Create an index for ``schema`` inside a newly made temporary directory.

    Returns the object produced by ``index.create_in`` for that directory.
    """
    storageDir = tempfile.mkdtemp()
    newIndex = index.create_in(storageDir, schema)
    return newIndex

In [0]:
# Baseline schema for Q2: the file path is stored so that hits can be reported
# by name; the content is analyzed with a bare RegexTokenizer and nothing else.
mySchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=RegexTokenizer()),
)

In [0]:
def addFilesToIndex(indexObj, fileList, reportEvery=1000):
    """Add every file in ``fileList`` to ``indexObj``, one document per file.

    Each file is read as UTF-8 text and written with its path in the
    ``file_path`` field and its text in the ``file_content`` field.

    Args:
        indexObj: index to write to; it is handed to ``writing.BufferedWriter``.
        fileList: iterable of paths of the files to index.
        reportEvery: print a progress line each time this many documents have
            been added. ``None``, 0 or a negative value disables progress output.
    """
    # open writer
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # count documents from 1 so the counter is directly the number indexed
        for docCount, filePath in enumerate(fileList, start=1):
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()
            writer.add_document(file_path=filePath,
                                file_content=fileContent)

            # The previous check `docNum+1 % 1000 == 0` was parsed as
            # `docNum + (1 % 1000) == 0` because % binds tighter than +,
            # so the status line was never printed.
            if reportEvery and reportEvery > 0 and docCount % reportEvery == 0:
                print("already indexed:", docCount)
        print("done indexing.")

    finally:
        # always close the writer, even if reading or adding a file failed
        writer.close()

In [0]:
# Collect, as strings, the paths of all regular files found anywhere below
# DOCUMENTS_DIR (directories themselves are filtered out).
filesToIndex = list(
    map(str, filter(Path.is_file, Path(DOCUMENTS_DIR).glob("**/*")))
)

In [0]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q2, your query parser in QP_Q2, and your searcher in SEARCHER_Q2

In [0]:
# Q2 baseline: build the index from mySchema, add every file in filesToIndex,
# then create the query parser and searcher that the evaluation cells use.
INDEX_Q2 = createIndex(mySchema) # index for Q2
addFilesToIndex(INDEX_Q2, filesToIndex)
QP_Q2 = QueryParser("file_content", schema=INDEX_Q2.schema) # query parser for Q2, targeting the file_content field
SEARCHER_Q2 = INDEX_Q2.searcher() # searcher for Q2, created without an explicit weighting argument

done indexing.


### Q2 (b): Provide answer to Q2 (b) here [markdown cell]

In [0]:
def _writeTrecRun(topics, queryParser, searcher, outputTRECFile):
    """Search every topic and write its ranked hits to ``outputTRECFile`` in TREC run format."""
    for topic in topics:
        # A blank line (e.g. a trailing newline in the topic file) cannot be
        # unpacked into "<id> <phrase>", so skip it instead of raising.
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        topicQuery = queryParser.parse(topic_phrase)
        topicResults = searcher.search(topicQuery, limit=None)
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))


def _printTrecResults(results):
    """Print every per-query measure, then one aggregated 'all' line per measure."""
    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in results.items():
        for measure, value in query_measures.items():
            if measure == "runid":
                continue
            print_line(measure, query_id, value)

    # With no evaluated query there is nothing to aggregate (the former code
    # hit a NameError here because it reused the per-query loop variable).
    if not results:
        return

    # Take the measure names from the last evaluated query, as before.
    measureNames = list(list(results.values())[-1].keys())
    for measure in measureNames:
        if measure == "runid":
            continue
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [perQuery[measure] for perQuery in results.values()]))


def pyTrecEval(topicFile, qrelsFile, queryParser, searcher):
    """Evaluate a query parser / searcher pair with pytrec_eval and print the measures.

    Args:
        topicFile: path of a text file with one "<topic_id> <topic phrase>" per line.
        qrelsFile: path of the relevance-judgement file read by ``pytrec_eval.parse_qrel``.
        queryParser: object whose ``parse(phrase)`` builds the query for a topic.
        searcher: object whose ``search(query, limit=None)`` returns the ranked
            results; each result must expose ``result["file_path"]``.

    Returns:
        None. All measures are printed to standard output.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    # mkstemp returns an OPEN descriptor plus the path: wrap the descriptor so it
    # gets closed, and delete the file once the run has been parsed.
    runFd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(runFd, "w") as outputTRECFile:
            _writeTrecRun(topics, queryParser, searcher, outputTRECFile)

        with open(qrelsFile, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(tempOutputFile, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)
    finally:
        os.remove(tempOutputFile)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)
    _printTrecResults(results)

In [0]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2) 

num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

For mean average precision, the number is 0.1971 for all queries.


### Q2 (c): Provide answer to Q2(c) here [markdown cell]

Topics that did well: Topics 18 and 24 are the best, with MAP = 1. Topics 14 and 22 are also good, with MAP values of 0.25 and 0.2 respectively.


Topics that did badly: Topics 1, 2, 6, 7, 9, 16 and 28 are the worst, with a MAP of 0. Topic 4 also performed poorly, with a MAP of only 0.03.

## Question 3

### Q3 (a): Provide answer to Q3 (a) here [markdown cell]

For query 14, the mean average precision is 0.25. There are 7 documents retrieved but only one is relevant, which is G00-89-0000000. Since there is only one query, the average precision is also 0.25. By the formula, we can infer that the only relevant document ranks 4th, that is, 1*(1/4)=0.25. Since it is the only relevant document, it should be ranked first, while the three irrelevant documents G00-79-4144643, G00-09-1193469 and G00-45-0809730 should not be ranked above it, as they are in the results above. The topic of the 14th query is Agricultural biotechnology. Let us look at two examples.

False negative: G00-89-0000000. The focus of the document is agricultural technology and the term appears three times. "Agricultural" appears only once on its own, while "biotechnology" appears six times on its own. However, "agriculture" is very frequent in the article, and it might not be counted when calculating the rank score. That might be the reason why the only relevant article ranks only fourth.

False positive: G00-79-4144643. This article ranks the first. First, it is a much longer article compared to the previous one. Second, although the "agricultural biotechnology" also occurs three times, the frequency of "biotechnology" is much lower than the first article. There are also a lot of stop words in the two articles which might influence the calculation of the rank. 



Hence, we can summarize a few tips for further improvement:
1. Use lemmatization to Reduce inflectional/variant forms to base form

2. Remove stop words.

3. Change all words to lower cases so that the search is not case-sensitive.

4. Stem words to help merge tokens that have similar meaning, but differ in spelling based on tense, plurality and so on.

In [0]:
def printRelName(topicFile, qrelsFile, queryParser, searcher, id):
    """Print, for the topic whose id equals ``id``, its hits and its relevant documents.

    For each matching line of ``topicFile`` this prints the topic, then every
    result returned by ``searcher`` in TREC run format, then every line of
    ``qrelsFile`` that belongs to the topic and has relevance "1".
    """
    with open(topicFile, "r") as topicHandle:
        topicLines = topicHandle.read().splitlines()

    for topicLine in topicLines:
        topic_id, topic_phrase = topicLine.split(" ", 1)
        if topic_id != id:
            continue

        print("---------------------------Topic_id and Topic_phrase----------------------------------")
        print(topic_id, topic_phrase)

        hits = searcher.search(queryParser.parse(topic_phrase), limit=None)

        print("---------------------------Return documents----------------------------------")
        for rank, hit in enumerate(hits):
            hitScore = hits.score(rank)
            docName = os.path.basename(hit["file_path"])
            print("%s Q0 %s %d %lf test" % (topic_id, docName, rank, hitScore))

        print("---------------------------Relevant documents----------------------------------")
        with open(qrelsFile, 'r') as qrelHandle:
            for rawLine in qrelHandle.readlines():
                judgement = rawLine.rstrip()
                # qrels line layout: <topic id> <unused> <document> <relevance>
                qid, _, doc, rel = judgement.split(" ")
                if qid == id and rel == "1":
                    print(judgement)

In [0]:
printRelName(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2, "14")

In [0]:
## The content of document G00-89-0000000
# Take the first file with this name anywhere below DOCUMENTS_DIR.
fp_path = list(Path(DOCUMENTS_DIR).glob("**/G00-89-0000000"))[0]
# Decode as UTF-8, the same encoding used when the files were indexed, so the
# output does not depend on the platform's default encoding.
with open(fp_path, encoding="utf-8") as fp:
  fp_content = fp.read()
  print(fp_content)

In [0]:
## The content of document G00-79-4144643
# Take the first file with this name anywhere below DOCUMENTS_DIR.
fn_path = list(Path(DOCUMENTS_DIR).glob("**/G00-79-4144643"))[0]
# Decode as UTF-8, the same encoding used when the files were indexed, so the
# output does not depend on the platform's default encoding.
with open(fn_path, encoding="utf-8") as fn:
  fn_content = fn.read()
  print(fn_content)

### Q3 (b): Write your code below

In [0]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q3, your query parser in QP_Q3, and your searcher in SEARCHER_Q3

In [0]:
nltk.download("wordnet")

In [0]:
class CustomFilter(Filter):
    """Analyzer filter that rewrites each token's text with an arbitrary function.

    ``filterFunc`` is called as ``filterFunc(token.text, *args, **kwargs)`` and
    its return value replaces ``token.text``. Extra positional and keyword
    arguments given to the constructor are forwarded on every call.
    """
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # The previous signature had no `other` parameter, so every `==`
        # comparison involving this filter raised TypeError. Two filters are
        # equal only when they wrap the same function with the same arguments;
        # otherwise e.g. a stemming filter and a lemmatizing filter would
        # compare equal.
        return (other is not None
                and self.__class__ is other.__class__
                and self.customFilter == other.customFilter
                and self.args == other.args
                and self.kwargs == other.kwargs)

    def __call__(self, tokens):
        # Query-mode and index-mode tokens were transformed by two identical
        # branches; a single code path applies the function to every token.
        for t in tokens:
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [0]:
# Q3 analyzer chain, applied left to right: regex tokenization, NLTK Lancaster
# stemming and WordNet lemmatization (both wrapped in CustomFilter), then
# lowercasing and stop-word removal.
filter1 = RegexTokenizer() | CustomFilter(LancasterStemmer().stem) | CustomFilter(WordNetLemmatizer().lemmatize) | LowercaseFilter() | StopFilter()
# Same two fields as the Q2 schema; only the analyzer of file_content differs.
SCHEMA_Q3=Schema(file_path = ID(stored=True),file_content = TEXT(analyzer = filter1))
INDEX_Q3 = createIndex(SCHEMA_Q3) # index for Q3
addFilesToIndex(INDEX_Q3, filesToIndex)
QP_Q3 = QueryParser("file_content", schema=INDEX_Q3.schema) # query parser for Q3, targeting the file_content field
SEARCHER_Q3 = INDEX_Q3.searcher() # searcher for Q3, created without an explicit weighting argument


In [0]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3) 

### Q3 (c): Provide answer to Q3 (c) here [markdown cell]

Instead of using only a RegexTokenizer, I extended the analyzer to include a lowercase filter and a stop-word filter, as well as stemming and lemmatizing filters from NLTK. The overall performance improves greatly, from a mean average precision of 0.19 to 0.3458. If we look at query 14, we find that its MAP is now 1, which means that the false positive cases have been eliminated and the only relevant document ranks first.



### Q3 (d): Provide answer to Q3 (d) here [markdown cell]

Yes

### Q3 (e): Provide answer to Q3 (e) here [markdown cell]

Yes

### Q3 (f): Provide answer to Q3 (f) here [markdown cell]

Since the overall MAP score has improved greatly and the performance of most queries has remained the same or improved, the method adopted proves to be effective. However, the performance of query 22 becomes worse. We can improve it further in the next step.




## Question 4

In [0]:
# Put your code for creating the index here (you can add more cells).
# Make sure you save the final index in the variable INDEX_Q4, your query parser in QP_Q4, and your searcher in SEARCHER_Q4

### Step 1: Modify filters
Add IntraWordFilter() to remove the effect of punctuation as well as StemFilter(). The order of filters is also tuned.

In [0]:
#filter2 = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter() | CustomFilter(LancasterStemmer().stem) | CustomFilter(WordNetLemmatizer().lemmatize, 'v') 
#SCHEMA_Q4=Schema(file_path = ID(stored=True),file_content = TEXT(analyzer = filter2))
#INDEX_Q4 = createIndex(SCHEMA_Q4)
#addFilesToIndex(INDEX_Q4, filesToIndex)
#QP_Q4 = QueryParser("file_content", schema=INDEX_Q4.schema) 
#SEARCHER_Q4 = INDEX_Q4.searcher() 

In [0]:
#pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4) 

After modifying the filters, the MAP value has increased to 0.3567.

### Step 2: Change scoring method. 
The default scoring method for Whoosh is BM25. Now we change it to TF-IDF to see if there is any improvement. 

In [0]:
#tfidfSearcher = INDEX_Q4.searcher(weighting=scoring.TF_IDF())


In [0]:
#pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q3, tfidfSearcher) 

As we can see from the result, map is 0.1370 which is pretty bad. Thus, we will stick to the previous BM25 method. 

### Step 3: Modify QueryParser, change AndGroup to OrGroup

In [0]:
#QP_or = QueryParser("file_content", schema=INDEX_Q4.schema, group=qparser.OrGroup)
#pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_or, SEARCHER_Q4)

After changing the AndGroup to OrGroup, the MAP has improved to 0.3891. So we will use this QP_or query parser for the next step.

### Step 4: Tune Parameter k1 and B for BM25 scoring method

#### keep k1 and tune B

In [0]:
#Searcher_B_5_0 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.5, K1=1.2))

In [0]:
#pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_or, Searcher_B_5_0)

The default B value is 0.75 (K1 is kept at 1.2). After changing B to 0.5, the MAP value improves to 0.4043.

#### keep B=0.5 and tune k1

In [0]:
#Searcher_K1_4_5 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.5, K1=4.5))
#pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_or, Searcher_K1_4_5)

After trying several values, with K1 = 4.5 the MAP value improves further to 0.4103.

### Please answer the following questions here
(a) A clear list of all final modifications made.  
(b)  Why each modification was made – how did it help?  
(c)  The  final  MAP  performance  that  these  modifications  attained.

In summary, the final modifications include:
1. Add more filters. The final filters include LowercaseFilter(),  IntraWordFilter(), StopFilter(), StemFilter(), LancasterStemmer() and WordNetLemmatizer().
2. Optimize QueryParser with grouping method changed from AND to OR.
3. Tune K1 and B value for Searcher

Adding filters proves to improve performance because they pre-process the text and thus make the search engine more effective. OrGroup makes terms optional by default, so the search engine can find more relevant documents, which improves the mean average precision. As for the BM25F algorithm, the default value for K1 is 1.2 and for B is 0.75; the tuned values are K1 = 4.5 and B = 0.5. K1 is a smoothing parameter and B controls the effect of field-length normalization. Tuning the two values by trial and error helps to improve the result.

The final MAP performance after the modifications is 0.4103.

In [0]:
# Q4 analyzer chain, applied left to right: regex tokenization, lowercasing,
# IntraWordFilter, stop-word removal, StemFilter, NLTK Lancaster stemming and
# WordNet lemmatization called with the extra argument 'v'.
filter2 = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter() | CustomFilter(LancasterStemmer().stem) | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
# Same two fields as before; only the analyzer of file_content differs.
SCHEMA_Q4=Schema(file_path = ID(stored=True),file_content = TEXT(analyzer = filter2))
INDEX_Q4 = createIndex(SCHEMA_Q4) # index for Q4
addFilesToIndex(INDEX_Q4, filesToIndex)
QP_Q4 = QueryParser("file_content", schema=INDEX_Q4.schema, group=qparser.OrGroup) # query parser for Q4, grouping terms with OrGroup
SEARCHER_Q4 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.5, K1=4.5)) # searcher for Q4, BM25F weighting with B=0.5 and K1=4.5
# Print the evaluation measures for the final Q4 configuration.
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4)

done indexing.
num_q                    1       1.0000
num_ret                  1       469.0000
num_rel                  1       5.0000
num_rel_ret              1       5.0000
map                      1       0.0671
gm_map                   1       -2.7019
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0556
iprec_at_recall_0.00     1       0.0909
iprec_at_recall_0.10     1       0.0909
iprec_at_recall_0.20     1       0.0909
iprec_at_recall_0.30     1       0.0909
iprec_at_recall_0.40     1       0.0909
iprec_at_recall_0.50     1       0.0909
iprec_at_recall_0.60     1       0.0909
iprec_at_recall_0.70     1       0.0580
iprec_at_recall_0.80     1       0.0580
iprec_at_recall_0.90     1       0.0476
iprec_at_recall_1.00     1       0.0476
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0500
P_30                  

## Validation

In [0]:
# Run the following cells to make sure your code returns the correct value types

In [0]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [0]:
# Type checks for the three Q2 objects; the message names the object that failed.
assert isinstance(INDEX_Q2, FileIndex), "Index Type"
assert isinstance(QP_Q2, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q2, Searcher), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [0]:
# Type checks for the three Q3 objects; the message names the object that failed.
assert isinstance(INDEX_Q3, FileIndex), "Index Type"
assert isinstance(QP_Q3, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q3, Searcher), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation

In [0]:
# Type checks for the three Q4 objects; the message names the object that failed.
assert isinstance(INDEX_Q4, FileIndex), "Index Type"
assert isinstance(QP_Q4, QueryParser), "Query Parser Type"
assert isinstance(SEARCHER_Q4, Searcher), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
