# Assignment: Information Retrieval (IR)

## Preparations
* Put all your imports, and path constants in the next cells

In [25]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
import wget
wget.download("https://github.com/MIE1513HS-2022/course-datasets/raw/main/government.zip", "government.zip")

'government (1).zip'

In [27]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [28]:
# imports
# Put all your imports here
from whoosh import index, writing, qparser,scoring
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter

In [29]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract class which is inherited by other IR system
    """

    def __init__(self, data_dir):
        # DON'T change the following names,topic_file, qrels_file, document_dir, file_list
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents") 
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.add_files()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        pass

    @abstractmethod
    def add_files(self):
        pass

    @abstractmethod
    def create_parser_searcher(self):
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        pass

    @staticmethod
    def post_process_score(score):
        return score

    @staticmethod
    def print_trec_eval_result(results):
        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        for measure in query_measures.keys():
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()]))
            
    def print_rel_name(self, q_id):
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()
        for topic in topics:
            topic_id, topic_phrase = tuple(topic.split(" ", 1))
            if topic_id == q_id:
                print("---------------------------Topic_id and Topic_phrase----------------------------------")
                print(topic_id, topic_phrase)
                 # get search result
                topic_results = self.perform_search(topic_phrase)
                print("---------------------------Return documents----------------------------------")
                for (docnum, result) in enumerate(topic_results):
                    score = topic_results.score(docnum)
                    score = self.post_process_score(score)
                    print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
                print("---------------------------Relevant documents----------------------------------")
                with open(self.qrels_file, 'r') as f_qrel:
                    qrels = f_qrel.readlines()
                    for i in qrels:
                        qid, _, doc, rel = i.rstrip().split(" ")
                        if qid == q_id and rel == "1":
                            print(i.rstrip())

    def py_trec_eval(self):
        # Load topic file - a list of topics(search phrases) used for evalutation
        with open(self.topic_file, "r") as tf:
            topics = tf.read().splitlines()

            # create an output file to which we'll write our results
        temp_output_file = tempfile.mkstemp()[1]
        with open(temp_output_file, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = tuple(topic.split(" ", 1))
                # get search result
                topic_results = self.perform_search(topic_phrase)
                # format the result
                for (docnum, result) in enumerate(topic_results):
                    score = topic_results.score(docnum)
                    outputTRECFile.write(
                        "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(self.qrels_file, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(temp_output_file, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        self.print_trec_eval_result(results)


In [30]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
class CustomFilter(Filter):
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self):
        return (other
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): 
MAP(Mean Average Precision) might be appropriate.


### Q1 (b): 
Since there are tons of documents in the government websites and the user might want to find all the documents that are relevant to their requirments not just a few, an appropriate measure is to measure the accuracy of information retrieval models, which means the measure should focusing on both the number of retrieved relevant documents and the ranking of these documents.
Hence Mean Average Precision might be an appropriate measure.

## Question 2

### Q2 (a): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [31]:
class IRQ2(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        
        # Generate a temporary directory for the index
        indexDir = tempfile.mkdtemp()

        mySchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))
        
        self.index_sys = index.create_in(indexDir, mySchema)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Add buffer to self.index_sys
        """
       # open writer
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                    writer.add_document(file_path = filePath,
                                        file_content = fileContent)

                    # print status every 1000 documents
                    if (docNum+1) % 1000 == 0:
                        print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively 
        """
         # DON't change the names of 'query_parser' and 'searcher'

        # define a query parser for the field "file_content" in the index
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topic_results: whoosh.searching.Results
        
        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """

        # run a sample query for the topic phrase
        sampleQuery = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(sampleQuery, limit=None)

        return topic_results

In [32]:
q2 = IRQ2("government")

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [33]:
q2.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [34]:
q2.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 26.645398 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1


### Q2 (b): 
The MAP measure of Whoosh system for all queries is 0.1971. 

### Q2 (c): 
The performance for topic 1,2,6,7,9,16,28 are poor since they got 0 on MAP measure. 

The performance for topic 18 and 24 are good since they got 1 on MAP measure.

## Question 3

In [35]:
q2.print_rel_name("9")

---------------------------Topic_id and Topic_phrase----------------------------------
9 genealogy searches
---------------------------Return documents----------------------------------
9 Q0 G00-26-1048210 0 12.268873 test
9 Q0 G00-59-3622783 1 5.132722 test
---------------------------Relevant documents----------------------------------
9 0 G00-91-3181951 1


### Q3 (a): 
For query 9: 

*   G00-26-1048210 file is false positive(FP) because it is not relevant but retrieved and highly ranked. 

  In this file, both the word "genealogy" and the word "searches" appeared once. However the positions of those two words are very different. Besides, the word "genealogical" and "reseach" appeared frequently in the file, which are very similar to the word "genealogy" and "searches" but with different meanings.

*   G00-91-3181951 file is false negative(FN) because it is relevant but not retrived and ranked.

  In this file, the word "genealogy" appeared many times but most the appeared words contained uppercases somewhere, while the words in the query are all lowercases. Also the words appeared infrequently relative to the length of the file.


Therefore, adjusting all documents not sensitive to uppercases and using stemming would help improve the overall performance.

### Q3 (b): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [36]:
class IRQ3(IRSystem):
    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'
        self.index_sys = None

        #changed analyzer
        stmLwrAnalyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()

        indexDir = tempfile.mkdtemp()

        #define a new schema with changed analyzer
        mySchema2 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = stmLwrAnalyzer))
        
        self.index_sys = index.create_in(indexDir, mySchema2)

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Add buffer to self.index_sys
        """
        # open writer
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                    writer.add_document(file_path = filePath,
                                        file_content = fileContent)

                    # print status every 1000 documents
                    if (docNum+1) % 1000 == 0:
                        print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively 
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = None
        self.searcher = None

        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results
        
        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        sampleQuery = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(sampleQuery, limit=None)
        
        return topic_results

In [37]:
q3 = IRQ3("government")

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [38]:
q3.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [39]:
q3.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 25.321761 test
1 Q0 G00-55-3817584 1 14.643225 test
1 Q0 G00-69-2353421 2 7.818525 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1


### Q3 (c): 
I made the analyzer with lowercases and used stemming on it.

The overall performance of MAP measure has been imporved from 0.1971 to 0.3372. There are three topics(14, 18 and 24) performed well with MAP score 1 and only four topics(1, 6, 7, 16) performed badly with MAP score 0. Compared with the previous analyzer, the new analyzer has more performed well topics and fewer performed badly topics.

Besides, for query 9, the False Negative(relevant but not retrieved and ranked) is 0. All relevant documents have been retrieved. However, the number of False Positive(non relevant but retrieved and ranked) increased, which would give user more non-relevant documents.

In [40]:
q3.print_rel_name("9")

---------------------------Topic_id and Topic_phrase----------------------------------
9 genealogy searches
---------------------------Return documents----------------------------------
9 Q0 G00-30-0221651 0 14.031603 test
9 Q0 G00-79-2892445 1 13.679620 test
9 Q0 G00-26-1048210 2 12.292053 test
9 Q0 G00-01-2134408 3 10.727103 test
9 Q0 G00-59-0523165 4 10.708617 test
9 Q0 G00-95-3755341 5 10.708617 test
9 Q0 G00-06-1975174 6 10.708617 test
9 Q0 G00-24-0016657 7 10.648241 test
9 Q0 G00-95-3337324 8 10.648241 test
9 Q0 G00-88-2629440 9 10.640403 test
9 Q0 G00-33-1729611 10 10.561911 test
9 Q0 G00-01-2898660 11 10.525253 test
9 Q0 G00-43-3812747 12 10.170332 test
9 Q0 G00-91-3181951 13 9.645810 test
9 Q0 G00-21-1529615 14 9.473688 test
9 Q0 G00-55-0643570 15 9.473688 test
9 Q0 G00-49-2630728 16 9.290957 test
9 Q0 G00-67-1176122 17 9.087153 test
9 Q0 G00-00-2016453 18 8.893623 test
9 Q0 G00-08-3780534 19 8.846570 test
9 Q0 G00-02-1372443 20 8.743332 test
9 Q0 G00-08-1314254 21 8.743332 te

### Q3 (d): 
Yes, the overall Measures were increased. 

### Q3 (e): 
Yes. For example the MAP score of the query 14 increased from 0.25 to 1 while the query 26 has MAP score decreased from 0.1111 to 0.0778.

### Q3 (f): 
I think the overall performance is increasing since the MAP score of overall queries is increasing and the false negative is decreasing for some queries. But this modification also leads the number of false positive increased. Hence, I think the overall performance is improved but not too much. Compared with the previous analyzer, each has its own advantages and disadvantages.

## Question 4


### Q4 (a):
I found that for most index without NLTK filters always lead to a low performance on accuracy.

Since every English words may differ its morphology depending on the contexts, this may cause the words in documents may be in a different form or derivation of words in queries. Hence the seach engine might not search to the corresponding words in query and lead to a false negative.


### Q4 (b): Write your code below

**1. The auto-grader will extract and use the following variables, DON'T change the their names:**

      self.topic_file  
      self.qrels_file  
      self.document_dir   
      self.file_list  
      self.index_sys  
      self.query_parser  
      self.searcher   



**2. DON'T change the names of the already defined funtions**  
**3. DON'T change the py_trec_eval function**  
**4. DON'T change the class names including CustomFilter, IRSystem, IRQ2, IRQ3, IRQ4**  
**5. DON'T change the CustomFilter class and DON'T create any new custom filter class that is used to define Whoosh schema**

In [44]:
import nltk
nltk.download('omw-1.4')
from nltk.stem import *
nltk.download("wordnet")

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [68]:
class IRQ4(IRSystem):

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.index_sys which should have type whoosh.index.FileIndex
        """
        # DON't change the name of 'index_sys'

        indexDir = tempfile.mkdtemp()

        # Whoosh filter for NLTK's LancasterStemmer and StopWords Filter.
        # NLTK contains packages for lemmatizing and tokenizing words. This would help us on useful information extraction on words.
        # apply NLTK and StopFilter to index and query.
        myFilter = RegexTokenizer() | LowercaseFilter() | CustomFilter(LancasterStemmer().stem) |StopFilter()


        # new schema with Whoosh filter for NLTK.
        mySchema3 = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = myFilter))
        
        self.index_sys = index.create_in(indexDir, mySchema3)
        

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Add buffer to self.index_sys
        """
        # open writer
        writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            # write each file to index
            for docNum, filePath in enumerate(self.file_list):
                with open(filePath, "r", encoding="utf-8") as f:
                    fileContent = f.read()
                    writer.add_document(file_path = filePath,
                                        file_content = fileContent)

                    # print status every 1000 documents
                    if (docNum+1) % 1000 == 0:
                        print("already indexed:", docNum+1)
            print("done indexing.")

        finally:
            # close the index
            writer.close()

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        
        NOTE: Please update self.query_parser and self.searcher which should have type whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively 
        """
         # DON't change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results
        
        NOTE: Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        # run a sample query for the topic phrase
        sampleQuery = self.query_parser.parse(topic_phrase)
        topic_results = self.searcher.search(sampleQuery, limit=None)

        return topic_results

In [69]:
q4 = IRQ4("government")

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [70]:
q4.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [52]:
q4.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 28.132322 test
1 Q0 G00-69-2353421 1 5.610845 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1


### Q4 (b): 
I modified the schema with a changed filter and applied it on IRQ4.

In order to improve our model performance, I tried to use different NLTK Stemmers and Lemmatizers on the analyzer of the schema. Since NLTK contains packages for lemmatizing and tokenizing words, this would help us on useful information extraction on words. Eventually, I found that our model performed best when I applied Whoosh filter for NLTK's LancasterStemmer. Besides, I also added the StopFilter to filter the stop words.

### Q4 (c):

Yes, the overall performance is improved compared to Q3. The MAP score for the average of all queries increased from 0.3372 to 0.3458. However, the number of topics which performed well and badly remained the same.(3 topics got MAP score 1 and 4 topics got 0). Also compared with the retrieved result of query 9 on Q3, the rank of true relevant document decreased from 13 to 17, the model in Q3 performed better than Q4.

In general, the improvement was very limited, it only improved some overall measurements of trec_eval such as MAP and recip_rank.


In [71]:
q4.print_rel_name('9')

---------------------------Topic_id and Topic_phrase----------------------------------
9 genealogy searches
---------------------------Return documents----------------------------------
9 Q0 G00-30-0221651 0 14.329938 test
9 Q0 G00-79-2892445 1 13.414634 test
9 Q0 G00-26-1048210 2 12.385714 test
9 Q0 G00-55-0643570 3 11.485766 test
9 Q0 G00-02-1372443 4 10.760187 test
9 Q0 G00-08-1314254 5 10.760187 test
9 Q0 G00-08-0900666 6 10.760187 test
9 Q0 G00-01-2134408 7 10.529720 test
9 Q0 G00-24-0016657 8 10.438614 test
9 Q0 G00-95-3337324 9 10.438614 test
9 Q0 G00-88-2629440 10 10.398881 test
9 Q0 G00-59-0523165 11 10.398881 test
9 Q0 G00-95-3755341 12 10.398881 test
9 Q0 G00-06-1975174 13 10.398881 test
9 Q0 G00-33-1729611 14 10.156320 test
9 Q0 G00-01-2898660 15 10.050268 test
9 Q0 G00-43-3812747 16 9.720648 test
9 Q0 G00-91-3181951 17 9.616079 test
9 Q0 G00-21-1529615 18 9.419615 test
9 Q0 G00-67-1176122 19 9.129177 test
9 Q0 G00-08-3780534 20 9.076797 test
9 Q0 G00-49-2630728 21 8.697394

## Validation

#### Run the following cells to make sure your code returns the correct value types

In [46]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [47]:
assert(isinstance(q2.index_sys, FileIndex)), "Index Type"
assert(isinstance(q2.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q2.searcher, Searcher)), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [48]:
assert(isinstance(q3.index_sys, FileIndex)), "Index Type"
assert(isinstance(q3.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q3.searcher, Searcher)), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation

In [72]:
assert(isinstance(q4.index_sys, FileIndex)), "Index Type"
assert(isinstance(q4.query_parser, QueryParser)), "Query Parser Type"
assert(isinstance(q4.searcher, Searcher)), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
