<a href="https://colab.research.google.com/github/ninad-1234/Web_Minning/blob/main/DA1_Web_Minning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Package Installation

In [2]:
!pip install Whoosh
# Installed Whoosh Library

Collecting Whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[?25l[K     |▊                               | 10 kB 25.5 MB/s eta 0:00:01[K     |█▍                              | 20 kB 27.3 MB/s eta 0:00:01[K     |██                              | 30 kB 18.8 MB/s eta 0:00:01[K     |██▉                             | 40 kB 15.7 MB/s eta 0:00:01[K     |███▌                            | 51 kB 5.7 MB/s eta 0:00:01[K     |████▏                           | 61 kB 6.0 MB/s eta 0:00:01[K     |█████                           | 71 kB 5.4 MB/s eta 0:00:01[K     |█████▋                          | 81 kB 6.1 MB/s eta 0:00:01[K     |██████▎                         | 92 kB 6.4 MB/s eta 0:00:01[K     |███████                         | 102 kB 5.4 MB/s eta 0:00:01[K     |███████▊                        | 112 kB 5.4 MB/s eta 0:00:01[K     |████████▍                       | 122 kB 5.4 MB/s eta 0:00:01[K     |█████████                       | 133 kB 5.4 MB/s eta 0:00:01

Importing Required Libraries and Packages 

In [40]:
from typing import Dict, List, Sequence
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import MultifieldParser
from whoosh.filedb.filestore import RamStorage
from whoosh.analysis import StemmingAnalyzer
import json
from collections import OrderedDict

Defining the Search Engine

In [41]:
class SearchEngine:
    """Small in-memory full-text search engine backed by a Whoosh index.

    The index is created in a ``RamStorage``. In addition to the fields of
    the caller's schema, every indexed document carries a stored ``raw``
    field holding the JSON dump of the complete original document, so query
    results can return keys that are not part of the schema.
    """

    def __init__(self, schema):
        """Create an empty in-memory index for ``schema``.

        Args:
            schema: Whoosh ``Schema`` describing the searchable fields. It is
                modified in place: a stored ``raw`` field is added to it
                unless a field of that name is already present.
        """
        self.schema = schema
        # Adding a field name twice is rejected by the schema, so only add
        # ``raw`` when it is missing. This keeps construction working when
        # the same schema object is reused for a second engine.
        if 'raw' not in schema.names():
            schema.add('raw', TEXT(stored=True))
        self.ix = RamStorage().create_index(self.schema)

    def index_documents(self, docs: Sequence):
        """Add ``docs`` to the index and commit them in one transaction.

        Args:
            docs: Dict-like documents. Only keys that are stored fields of
                the schema are passed to the index; the whole document is
                additionally serialised to JSON into the ``raw`` field, so
                every document must be JSON-serialisable.

        Raises:
            Whatever ``json.dumps`` or the index writer raises. In that case
            the pending write is cancelled and nothing is committed.
        """
        # Loop-invariant: compute the stored field names once, not per key.
        stored_names = set(self.schema.stored_names())
        writer = self.ix.writer()
        try:
            for doc in docs:
                fields = {k: v for k, v in doc.items() if k in stored_names}
                fields['raw'] = json.dumps(doc)  # raw version of all of doc
                writer.add_document(**fields)
        except BaseException:
            # Release the writer (and its lock) instead of leaving a
            # half-finished write open; also covers an interrupted cell.
            writer.cancel()
            raise
        writer.commit(optimize=True)

    def get_index_size(self) -> int:
        """Return the document count reported by the index's ``doc_count_all()``."""
        return self.ix.doc_count_all()

    def query(self, q: str, fields: Sequence, highlight: bool=True) -> List[Dict]:
        """Search ``fields`` for ``q`` and return the matching documents.

        Args:
            q: Query string, parsed with ``MultifieldParser``.
            fields: Names of the schema fields to search in.
            highlight: When true, each searched field whose stored value is a
                non-empty string is replaced by its highlighted snippet (the
                stored value is kept when no snippet is produced).

        Returns:
            One dict per hit, rebuilt from the hit's ``raw`` JSON and
            therefore containing every key of the original document.
            NOTE(review): ``search()`` is called without a ``limit``, so
            Whoosh's default cap on the number of hits applies (10 in
            Whoosh 2.7 -- confirm for the installed version).
        """
        search_results = []
        with self.ix.searcher() as searcher:
            parsed = MultifieldParser(fields, schema=self.schema).parse(q)
            for hit in searcher.search(parsed):
                stored = hit.fields()
                doc = json.loads(stored['raw'])
                if highlight:
                    for name in fields:
                        # A document indexed without this field has no stored
                        # value for it; .get() avoids a KeyError on such hits.
                        value = stored.get(name)
                        if value and isinstance(value, str):
                            doc[name] = hit.highlights(name) or value

                search_results.append(doc)

        return search_results

Running the Search Engine and Building an Inverted Index

In [51]:
def build_inverted_index(docs, fields=("id", "title", "description", "extra")):
    """Build a whitespace-token inverted index over ``docs``.

    Args:
        docs: Sequence of dict documents.
        fields: Keys whose values are concatenated (space-separated, via
            ``str()``) to form the text of each document. Keys missing from a
            document are skipped instead of raising ``KeyError``.

    Returns:
        Dict mapping each term (split on whitespace only: case and
        punctuation are kept as-is) to the list of ids of the documents that
        contain it, in document order and without duplicates. The id is the
        document's ``"id"`` value; a document without one falls back to its
        1-based position in ``docs``.
    """
    postings = {}
    for position, doc in enumerate(docs, start=1):
        doc_id = doc.get("id", position)
        text = " ".join(str(doc[name]) for name in fields if name in doc)
        for term in text.split():
            # A dict serves as an insertion-ordered set of document ids.
            postings.setdefault(term, {})[doc_id] = None
    return {term: list(doc_ids) for term, doc_ids in postings.items()}


if __name__ == '__main__':

    # Three small sample documents; "extra" is deliberately not part of the
    # schema, so it is only reachable through the engine's raw JSON copy.
    docs = [
        {
            "id": "1",
            "title": "First document banana",
            "description": "This is the first document we've added in San Francisco!",
            "tags": ['foo', 'bar'],
            "extra": "kittens and cats"
        },
        {
            "id": "2",
            "title": "Second document hatstand",
            "description": "The second one is even more interesting!",
            "tags": ['alice'],
            "extra": "kittens  foals and horses"
        },
        {
            "id": "3",
            "title": "Third document slug",
            "description": "The third one is less interesting!",
            "tags": ['bob'],
            "extra": "bunny and rabbit"
        },
    ]
    schema = Schema(
        id=ID(stored=True),
        title=TEXT(stored=True),
        description=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        tags=KEYWORD(stored=True)
    )

    engine = SearchEngine(schema)
    engine.index_documents(docs)

    print(f"indexed {engine.get_index_size()} documents")

    fields_to_search = ["title", "description", "tags"]

    for query_text in ["hatstand", "banana", "first", "second", "alice", "bob", "san francisco"]:
        print(f"Query:: {query_text}")
        print("\t", engine.query(query_text, fields_to_search, highlight=True))

    # Hand-built inverted index over the same documents. Postings hold the
    # documents' real ids (not their list positions), so the table stays
    # correct if ids are not simply 1..n in list order.
    inverted_index = build_inverted_index(docs)

    print('{} : {} : {} '.format("Word","Frequency","Available in Document ID"))
    print()
    for term in sorted(inverted_index):
        doc_ids = inverted_index[term]
        # "Frequency" is the number of documents containing the term.
        print('{} : {} : {} '.format(term, len(doc_ids), doc_ids))

indexed 3 documents
Query:: hatstand
	 [{'id': '2', 'title': 'Second document <b class="match term0">hatstand</b>', 'description': 'The second one is even more interesting!', 'tags': ['alice'], 'extra': 'kittens  foals and horses'}]
Query:: banana
	 [{'id': '1', 'title': 'First document <b class="match term0">banana</b>', 'description': "This is the first document we've added in San Francisco!", 'tags': ['foo', 'bar'], 'extra': 'kittens and cats'}]
Query:: first
	 [{'id': '1', 'title': '<b class="match term0">First</b> document banana', 'description': 'This is the <b class="match term1">first</b> document we\'ve added', 'tags': ['foo', 'bar'], 'extra': 'kittens and cats'}]
Query:: second
	 [{'id': '2', 'title': '<b class="match term0">Second</b> document hatstand', 'description': 'The <b class="match term1">second</b> one is even more interesting', 'tags': ['alice'], 'extra': 'kittens  foals and horses'}]
Query:: alice
	 [{'id': '2', 'title': 'Second document hatstand', 'description': 