# WHOOSH LIBRARY

### To install, run `pip install Whoosh`

## Creating a Schema

In [15]:
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import ID, KEYWORD, STORED, TEXT, Schema  # field types are explained later

# Each keyword argument of Schema declares one field of the index.
# Fields created with stored=True are the ones shown in search results.
schema = Schema(
    path=ID(stored=True),
    title=TEXT(stored=True),
    content=TEXT(analyzer=StemmingAnalyzer()),  # analyzed with stemming; no stored=True here
    tags=KEYWORD,
)

In [16]:
# Display the schema's repr to check which field names were declared.
schema

<Schema: ['content', 'path', 'tags', 'title']>

## Creating an index and adding documents

In [17]:
import os
from whoosh import index

# Make sure the index directory exists. exist_ok=True makes this a no-op when the
# directory is already there, without a separate (racy) existence check.
os.makedirs("indexdir", exist_ok=True)

# Create a new index inside "indexdir" that uses the schema defined above.
ix = index.create_in("indexdir", schema)

In [18]:
from whoosh import index

# Open the index that lives in the "indexdir" directory.
ix = index.open_dir("indexdir")

In [6]:
writer = ix.writer()  # Open a writer; documents are added to the index through it

In [7]:
# Add two documents that share the same path and title; only the content differs.
for body in (u"Hello there", u"Deja vu!"):
    writer.add_document(path=u"/a", title=u"A", content=body)

In [8]:
# commit() finishes (closes) this writer. To add more documents afterwards,
# open the index directory and create a new writer first, then add them.
writer.commit()

## Practice

In [19]:
from whoosh.fields import Schema, TEXT, ID
from whoosh import index

In [20]:
import os

# Create the index directory if it is missing. exist_ok=True makes the call
# idempotent and avoids the race between checking for the directory and creating it.
os.makedirs("indexdir", exist_ok=True)

In [21]:
# Build the schema.
#   TEXT -> a field holding document text
#   ID   -> a fixed value that does not change, such as a URL or path
# All three fields use stored=True.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True))

# Build the index in "indexdir" with this schema.
ix = index.create_in("indexdir", schema)

# Use the writer as a context manager: it commits when the block finishes and
# cancels if an exception is raised, so a failed add_document() does not leave
# the index write-locked for the next run of this cell.
with ix.writer() as writer:
    # writer.remove_field('title')
    writer.add_document(
        title=u"My document",
        content=u"This is my python document! hello big world",
        path=u"/a",
    )
    writer.add_document(
        title=u"Second try",
        content=u"This is the second example hello world.",
        path=u"/b",
    )
    writer.add_document(
        title=u"Third time's the charm",
        content=u"More examples. Examples are many.",
        path=u"/c",
    )

In [22]:
from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    # Parse the query string against the "content" field of the index schema.
    query = QueryParser("content", ix.schema).parse("This python")
    # terms=True asks the searcher to record which terms matched.
    results = searcher.search(query, terms=True)

    # Was this results object created with terms=True? The answer does not
    # change between hits, so ask once instead of on every iteration.
    terms_recorded = results.has_matched_terms()
    for r in results:
        print(r, r.score)
        if terms_recorded:
            # What terms matched in the results as a whole?
            print(results.matched_terms())

    # What terms matched in each individual hit?
    print("matched terms")
    for hit in results:
        print(hit.matched_terms())

    print("more_results")
    if results.scored_length() > 0:
        # Look for documents whose "content" resembles the top hit.
        first_hit = results[0]
        more_results = first_hit.more_like_this("content")
        print(more_results)
    else:
        # results[0] would raise IndexError when nothing matched.
        print("No hits, so there is nothing to compare with more_like_this")
    

<Hit {'content': 'This is my python document! hello big world', 'path': '/a', 'title': 'My document'}> 1.2583815502828914
{('content', b'python')}
matched terms
[('content', b'python')]
more_results
<Top 1 Results for Or([Term('content', 'big', boost=0.6588835188105945), Term('content', 'document', boost=0.6588835188105945), Term('content', 'my', boost=0.6588835188105945), Term('content', 'python', boost=0.6588835188105945), Term('content', 'hello', boost=0.5617184491361429)]) runtime=0.001674799999818788>


In [23]:
# Report how many hits were scored compared with how many documents matched.
found = results.scored_length()
if results.has_exact_length():
    # Print the value of `found`, not the literal word 'found'.
    print("Scored", found, "of exactly", len(results), "documents")
else:
    # Only estimated bounds on the number of matching documents are available.
    low = results.estimated_min_length()
    high = results.estimated_length()

    print("Scored", found, "of between", low, "and", high, "documents")

Scored found of exactly 1 documents


## Practical Example Coursera Search

In [24]:
import pandas as pd
import numpy as np

In [25]:
import os

# Path of the Coursera course spreadsheet. The default is the original author's
# local file; set the COURSERA_XLSX environment variable to point at your own
# copy so the notebook can run on another machine without editing this cell.
COURSES_XLSX = os.environ.get(
    "COURSERA_XLSX",
    r'C:\Users\parth\Desktop\AlphaDynamic/Book1_final.xlsx',
)

df = pd.read_excel(COURSES_XLSX)

In [26]:
# Add a 'text' column: course title and organization joined by a single space.
df['text']=df['course_title']+' '+df['course_organization']

In [27]:
# Optional: look only at the rows whose category is 1 ("ai" per the column name).
# df1=df[df['Category(1 ai, 2 comps, 3 business)']==1]
# df1.tail(50)
# Display the frame to check the columns, including the new 'text' column.
df

Unnamed: 0,course_title,"Category(1 ai, 2 comps, 3 business)",course_organization,course_Certificate_type,course_rating,course_difficulty,course_students_enrolled,course_link,text
0,IBM Data Science,1,IBM,PROFESSIONAL CERTIFICATE,4.8,Beginner,550k,https://www.coursera.org/professional-certific...,IBM Data Science IBM
1,Python for Everybody,2,University of Michigan,SPECIALIZATION,4.8,Beginner,1.7m,https://www.coursera.org/specializations/python,Python for Everybody University of Michigan
2,Introduction to Data Science,1,IBM,SPECIALIZATION,4.8,Beginner,350k,https://www.coursera.org/specializations/intro...,Introduction to Data Science IBM
3,Google IT Support,2,Google,PROFESSIONAL CERTIFICATE,4.8,Beginner,420k,https://www.coursera.org/professional-certific...,Google IT Support Google
4,Deep Learning,1,deeplearning.ai,SPECIALIZATION,4.8,Intermediate,770k,https://www.coursera.org/specializations/deep-...,Deep Learning deeplearning.ai
...,...,...,...,...,...,...,...,...,...
980,Advertising and Society,3,Duke University,COURSE,4.7,Mixed,16k,https://www.coursera.org/learn/role-of-adverti...,Advertising and Society Duke University
981,Music Business Foundations,3,Berklee College of Music,COURSE,4.8,Mixed,36k,https://www.coursera.org/learn/music-business-...,Music Business Foundations Berklee College of ...
982,Design: Creation of Artifacts in Society,4,University of Pennsylvania,COURSE,4.7,Beginner,44k,https://www.coursera.org/learn/design,Design: Creation of Artifacts in Society Unive...
983,Introduction to Big Data,1,University of California San Diego,COURSE,4.6,Mixed,180k,https://www.coursera.org/learn/big-data-introd...,Introduction to Big Data University of Califor...


In [28]:
import os

from whoosh import index
from whoosh.analysis import RegexTokenizer
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED

# Schema for the Coursera course index.
schema = Schema(
    url=ID(stored=True),
    title=TEXT(analyzer=StemmingAnalyzer(), stored=True),  # stem every word of the title
    organization=TEXT(stored=True, field_boost=2.0),  # field_boost=2.0 gives this field more priority
    tags=KEYWORD,
)

# Create the index directory if needed. exist_ok=True avoids a separate
# check-then-create step and makes re-running the cell safe.
os.makedirs("indexdir10", exist_ok=True)

# Create the course index in "indexdir10" with the schema above.
idx = index.create_in("indexdir10", schema)

In [29]:
# Open the index stored in the "indexdir10" directory.
idx = index.open_dir("indexdir10")

In [30]:
# Index one document per course row.
# - The writer is used as a context manager: it commits when the block ends and
#   cancels on an exception, so a bad row does not leave the index write-locked.
# - The columns are iterated together by position, which does not depend on the
#   frame having 0..n-1 index labels and avoids building a row Series four times
#   per document.
with idx.writer() as writer:
    for link, course_title, organization, difficulty in zip(
        df['course_link'],
        df['course_title'],
        df['course_organization'],
        df['course_difficulty'],
    ):
        writer.add_document(
            url=link,
            title=course_title,
            organization=organization,
            tags=difficulty,
        )

In [31]:
from whoosh import scoring
from whoosh.qparser import QueryParser

# Search the "title" field for one word, using TF_IDF as the searcher's weighting.
with idx.searcher(weighting=scoring.TF_IDF()) as searcher:
    title_parser = QueryParser("title", idx.schema)
    query = title_parser.parse(u"Intelligence")

    # limit=None: do not cap the number of hits returned.
    results = searcher.search(query, limit=None)

    # Number of hits first, then every hit with its score.
    print(len(results))
    for r in results:
        print(r, r.score)

3
<Hit {'organization': 'deeplearning.ai', 'title': 'Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning', 'url': 'https://www.coursera.org/learn/introduction-tensorflow'}> 6.506347280052198
<Hit {'organization': 'University of Colorado System', 'title': 'Data Warehousing for Business Intelligence', 'url': 'https://www.coursera.org/specializations/data-warehousing'}> 6.506347280052198
<Hit {'organization': 'IBM', 'title': 'Introduction to Artificial Intelligence (AI)', 'url': 'https://www.coursera.org/learn/introduction-to-ai'}> 6.506347280052198


In [32]:
from whoosh.qparser import MultifieldParser
from whoosh import scoring

# Search the "title" and "organization" fields for "learn", using TF_IDF as
# the searcher's weighting.
with idx.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = MultifieldParser(["title", "organization"], idx.schema).parse("learn")

    # limit=None: do not cap the number of hits returned.
    results = searcher.search(query, limit=None)
    print(len(results))

    # Print every hit together with its score.
    for r in results:
        print(r, r.score)

46
<Hit {'organization': 'University of California San Diego', 'title': 'Learning How to Learn: Powerful mental tools to help you master tough subjects', 'url': 'https://www.coursera.org/learn/learning-how-to-learn'}> 8.08498807892406
<Hit {'organization': 'deeplearning.ai', 'title': 'Introduction to TensorFlow for Artificial Intelligence, Machine Learning, and Deep Learning', 'url': 'https://www.coursera.org/learn/introduction-tensorflow'}> 8.08498807892406
<Hit {'organization': 'New York University', 'title': 'Machine Learning and Reinforcement Learning in Finance', 'url': 'https://www.coursera.org/specializations/machine-learning-reinforcement-finance'}> 8.08498807892406
<Hit {'organization': 'University of Illinois at Urbana-Champaign', 'title': 'e-Learning Ecologies: Innovative Approaches to Teaching and Learning for the Digital Age', 'url': 'https://www.coursera.org/learn/elearning'}> 8.08498807892406
<Hit {'organization': 'University of California San Diego', 'title': 'Aprendien

In [34]:
from whoosh import scoring
from whoosh.qparser import MultifieldParser  # parses one query across several fields

# Fields that the query is matched against.
search_fields = ["title", "organization"]

with idx.searcher(weighting=scoring.TF_IDF()) as searcher:
    parser = MultifieldParser(search_fields, idx.schema)
    query = parser.parse(u"robot")

    # limit=None: do not cap the number of hits returned.
    results = searcher.search(query, limit=None)

    print(len(results))
    for hit in results:
        print(hit, hit.score)

3
<Hit {'organization': 'University of Pennsylvania', 'title': 'Robotics', 'url': 'https://www.coursera.org/specializations/robotics'}> 6.506347280052198
<Hit {'organization': 'Northwestern University', 'title': 'Modern Robotics:  Mechanics, Planning, and Control', 'url': 'https://www.coursera.org/specializations/modernrobotics'}> 6.506347280052198
<Hit {'organization': 'Georgia Institute of Technology', 'title': 'Control of Mobile Robots', 'url': 'https://www.coursera.org/learn/mobile-robot'}> 6.506347280052198
