# Installation

In [1]:
!pip install whoosh

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


# Preparing the data

In [2]:
!kaggle datasets download -d stackoverflow/stacksample
!unzip stacksample.zip

Dataset URL: https://www.kaggle.com/datasets/stackoverflow/stacksample
License(s): other
Downloading stacksample.zip to /kaggle/working
 99%|█████████████████████████████████████▌| 1.10G/1.11G [00:11<00:00, 83.6MB/s]
100%|██████████████████████████████████████| 1.11G/1.11G [00:11<00:00, 99.9MB/s]
Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [3]:
import pandas as pd
questions = pd.read_csv("Questions.csv", nrows=20000)
questions

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
19995,1114470,82266.0,2009-07-11T19:37:06Z,,0,"Trim all chars off file name after first ""_""",<p>I'd like to trim these purchase order file ...
19996,1114540,2288585.0,2009-07-11T20:16:06Z,,7,Xcode question: Quickly jump to a particular s...,<p>What is the quickest way to jump to a parti...
19997,1114550,131128.0,2009-07-11T20:20:11Z,,3,Serializing a generic collection with XMLSeria...,<p>Why won't XMLSerializer process my generic ...
19998,1114580,87271.0,2009-07-11T20:35:46Z,,1,Using Yahoo Fire Eagle on Grails / Java,<p>Has anyone implemented the Yahoo Fire Eagle...


# The Index and Schema objects

In [5]:
from whoosh.fields import Schema, TEXT, ID

# Defining index schema
schema = Schema(Id=ID(stored=True), Title=TEXT(stored=True), Body=TEXT(stored=True))

In [6]:
import os.path

index_dir = "indexdir"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

In [7]:
from whoosh.index import create_in
from whoosh.index import open_dir

# Creating the index
ix = create_in(index_dir, schema)

# Open the index writer
writer = ix.writer()

# Iterate over the DataFrame and add documents to the index
# we have indexed title, title_body and doc_id
for index, row in questions.iterrows():
    writer.add_document(Id=str(row['Id']), Title = row['Title'], Body=row['Body'])

# Commit and close the writer
writer.commit()

# How to search

In [19]:
from whoosh.qparser import QueryParser
from whoosh.scoring import TF_IDF
from whoosh import scoring

# create the query parser
qp = QueryParser("Title", schema=schema)

# parse the query
query_sentence = "How to install"
query = qp.parse(query_sentence)

# create a searcher object
searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())

# search documents and store them
# we are returing top 3 documents
results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

# print the documents
for hit in results_tfidf:
    print(hit["Id"])
    print('\n')
    print(hit["Title"])
    print('\n')
    print('------------------\n')

102850


How can I install CPAN modules locally without root access (DynaLoader.pm line 229 error)?


------------------

145900


How can I determine that Windows Installer is performing an upgrade rather than a first time install?


------------------

351640


How to install Hibernate Tools in Eclipse?


------------------



# Task 1

Test the previous search code with different queries. For each one check how
many matched results are returned.

In [20]:
queries = ['runtime error', 'database']

for q in queries:
    print(f'Processing query = {q}')
    # create the query parser
    qp = QueryParser("Title", schema=schema)

    # parse the query
    query_sentence = q
    query = qp.parse(query_sentence)

    # create a searcher object
    searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())

    # search documents and store them
    # we are returing top 3 documents
    results_tfidf2 = searcher_tfidf.search(query, limit=3, scored=True)

    # print the documents
    for hit in results_tfidf2:
        print(hit["Id"])
        print('\n')
        print(hit["Title"])
        print('\n')
        print('------------------\n')
        
    print('----------------------------------------------------------\n')

Processing query = runtime error
370730


Runtime error 1004: Application-defined or object-defined error


------------------

269090


.NET Runtime 2.0 Error in a service


------------------

368160


Loop through PivotItems: runtime error 91


------------------

----------------------------------------------------------

Processing query = database
44780


What's the best way to implement a SQL script that will grant permissions to a database role on all the user tables in a database?


------------------

606500


How to validate the clients database against my database schema?


------------------

839050


Database msi installer using Team System Database Edition


------------------

----------------------------------------------------------



# Task 2

Repeat the previous search using the BM25F scoring algorithm, which is used in probabilistic retrieval model. Do you see any difference in the returned results?

In [21]:
from whoosh.scoring import BM25F
from whoosh import scoring

# create the query parser
qp = QueryParser("Title", schema=schema)

# parse the query
query_sentence = "How to install"
query = qp.parse(query_sentence)

# create a searcher object with BM25F scoring
searcher_bm25f = ix.searcher(weighting=scoring.BM25F())

# search documents and store them
# we are returning top 3 documents
results_bm25f = searcher_bm25f.search(query, limit=3, scored=True)

# print the documents
for hit in results_bm25f:
    print(hit["Id"])
    print('\n')
    print(hit["Title"])
    print('\n')
    print('------------------\n')

921780


How to install ImageMagick on MAMP?


------------------

998260


How do you install JDK?


------------------

351640


How to install Hibernate Tools in Eclipse?


------------------



TF-IDF tends to give higher weights to terms that are rare in the entire document collection but common in the query, whereas BM25F considers the frequency of terms within individual documents and the length of documents.

# Query expansion

In [22]:
more_results = results_tfidf[0].more_like_this("Title")

for hit in more_results:
    print(hit["Id"])
    print('\n')
    print(hit["Title"])
    print('\n')
    print('------------------\n')

459590


What is the difference betwen including modules and embedding modules?


------------------

423330


Why can't DynaLoader.pm load SSleay.dll for Net::SSLeay and Crypt::SSLeay?


------------------

540640


How can I install a CPAN module into a local directory?


------------------

172040


How do you develop against OpenID locally


------------------

566290


Silverlight Development - Service URL while developing locally


------------------

766830


How can I locally manage C manuals?


------------------

799860


Using Mercurial locally, only with Subversion server


------------------

852280


Ubuntu: "Could not find rails locally or in a repository"


------------------

78900


How to check for memory leaks in Guile extension modules?


------------------

199180


Is there any way to get python omnicomplete to work with non-system modules in vim?


------------------



In [23]:
keywords = [keyword for keyword, score in results_tfidf.key_terms("Title", docs=10, numterms=5)]
keywords

['install', '229', 'cpan', 'dynaloader.pm', 'locally']

# Evaluating IR systems

In [24]:
queries = {
    'q1': "machine learning",
    'q2':"AI algorithms"
}

relevance = {
    'q1' : ["doc1", "doc2", "doc3"],
    'q2' : ["doc1", "doc2", "doc3", "doc4", "doc5"]
}

documents = {
    'doc1': "Artificial Intelligence (AI) is transforming various industries through automation and advanced algorithms. Machine learning, a subset of AI, enables computers to learn from data and make predictions. Algorithms are at the core of AI systems, guiding decision-making and problem-solving processes. AI-powered systems are increasingly used in healthcare for diagnosis and treatment planning. The ethical implications of AI algorithms, such as bias and fairness, are important considerations in their development.",
    'doc2': "Deep learning, a branch of machine learning, uses neural networks to process complex data. AI algorithms are capable of analyzing large datasets to extract meaningful insights. Natural Language Processing (NLP) algorithms enable computers to understand and generate human language. AI-driven recommendation algorithms personalize user experiences in e-commerce and content platforms. Ensuring the transparency and accountability of AI algorithms is essential for building trust in AI technologies.",
    'doc3': "Reinforcement learning algorithms enable AI agents to learn through trial and error interactions with their environment. AI algorithms are used in financial markets for high-frequency trading and risk management. Computer vision algorithms enable machines to interpret and analyze visual information. AI algorithms can enhance cybersecurity by detecting and mitigating cyber threats in real-time. Continuous research and development are essential for advancing AI algorithms and overcoming their limitations.",
    'doc4': "Evolutionary algorithms, inspired by natural selection, are used to optimize complex systems and processes. AI algorithms play a crucial role in autonomous vehicles for navigation and decision-making. Quantum computing algorithms have the potential to revolutionize AI by solving complex problems exponentially faster. AI algorithms are employed in predictive maintenance to anticipate equipment failures and reduce downtime. Ethical guidelines and regulations are needed to govern the development and deployment of AI algorithms.",
    'doc5': "Genetic algorithms are used to evolve solutions to optimization and search problems inspired by natural selection. AI algorithms enable personalized content recommendations in streaming services and social media platforms. Swarm intelligence algorithms mimic the collective behavior of social insects to solve optimization problems. AI algorithms are used in drug discovery to accelerate the identification of potential treatments. Collaborative efforts are essential for advancing AI algorithms and harnessing their full potential for societal benefit."
}

In [25]:
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.index import open_dir
import os.path

# Defining index schema
schema = Schema(Id=ID(stored=True), Body=TEXT(stored=True))

index_dir = "indexdir_toy"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
    
# Creating the index
ix = create_in(index_dir, schema)

# Open the index writer
writer = ix.writer()
for doc in documents:
    writer.add_document(Id=doc, Body=documents[doc])
    
# Commit and close the writer
writer.commit()

In [28]:
from whoosh.qparser import QueryParser
from whoosh.scoring import TF_IDF
from whoosh import scoring

# create the query parser
qp = QueryParser("Body", schema=schema)

# parse the query
query_sentence = queries['q1']
print(f'Query = {query_sentence}\n\n')
query = qp.parse(query_sentence)

# create a searcher object
searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())

# search documents and store them
# we are returing top 3 documents
results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

# print the documents
for hit in results_tfidf:
    print(hit["Id"])
    print('\n')
    print(hit["Body"])
    print('\n')
    print('------------------\n')

Query = machine learning


doc2


Deep learning, a branch of machine learning, uses neural networks to process complex data. AI algorithms are capable of analyzing large datasets to extract meaningful insights. Natural Language Processing (NLP) algorithms enable computers to understand and generate human language. AI-driven recommendation algorithms personalize user experiences in e-commerce and content platforms. Ensuring the transparency and accountability of AI algorithms is essential for building trust in AI technologies.


------------------

doc1


Artificial Intelligence (AI) is transforming various industries through automation and advanced algorithms. Machine learning, a subset of AI, enables computers to learn from data and make predictions. Algorithms are at the core of AI systems, guiding decision-making and problem-solving processes. AI-powered systems are increasingly used in healthcare for diagnosis and treatment planning. The ethical implications of AI algorithms, such 

# Task 3

Compute the precision and recall for the retrieved documents in the previous example.

In [29]:
# Relevant documents for the query 'machine learning'
relevant_docs_q1 = set(relevance['q1'])

# Retrieved documents for the query 'machine learning'
retrieved_docs_q1 = set(hit["Id"] for hit in results_tfidf)

# Relevant and retrieved documents
intersection = relevant_docs_q1.intersection(retrieved_docs_q1)

# Precision
precision = len(intersection) / len(retrieved_docs_q1) if len(retrieved_docs_q1) > 0 else 0

# Recall
recall = len(intersection) / len(relevant_docs_q1) if len(relevant_docs_q1) > 0 else 0

print("Precision:", precision)
print("Recall:", recall)

Precision: 1.0
Recall: 0.6666666666666666


# Task 4

Modify the last code to test all queries and then report the precision and recall.

In [34]:
for query_key, query_sentence in queries.items():
    print(f'Query = {query_sentence}\n\n')
    
    # parse the query
    query = qp.parse(query_sentence)

    # search documents and store them
    # we are returning top 3 documents
    results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)
    
    # print the documents
    for hit in results_tfidf:
        print(hit["Id"])

    # Relevant documents for the current query
    relevant_docs = set(relevance[query_key])

    # Retrieved documents for the current query
    retrieved_docs = set(hit["Id"] for hit in results_tfidf)

    # Relevant and retrieved documents
    intersection = relevant_docs.intersection(retrieved_docs)

    # Precision
    precision = len(intersection) / len(retrieved_docs) if len(retrieved_docs) > 0 else 0

    # Recall
    recall = len(intersection) / len(relevant_docs) if len(relevant_docs) > 0 else 0

    print("Precision:", precision)
    print("Recall:", recall)
    print("------------------\n")

Query = machine learning


doc2
doc1
Precision: 1.0
Recall: 0.6666666666666666
------------------

Query = AI algorithms


doc3
doc4
doc1
Precision: 1.0
Recall: 0.6
------------------

