# Boolean Query


In [None]:
!pip install elasticsearch

In [None]:
from elasticsearch import Elasticsearch, helpers
import os, json, time
from datetime import datetime
from tqdm import tqdm 

In [None]:
import warnings

# Install a catch-all "ignore" filter so no warning is shown from here on
# (presumably to keep client warnings out of the cell outputs).
warnings.filterwarnings(action='ignore')

## Load Processed Data

In [None]:
import json

# Path of the JSON file holding the processed documents.
dataset_path = 'IR_data_news_12k.json'

# JSON text is UTF-8; naming the encoding explicitly keeps the load from
# depending on the platform's locale default (the sample query further down
# suggests the corpus holds non-ASCII text).
with open(dataset_path, encoding='utf-8') as f:
    data = json.load(f)

# `data` maps string ids ('0', '1', ...) to documents; show one document's fields.
print(data['0'].keys())

dict_keys(['title', 'content', 'tags', 'date', 'url', 'category'])


## Connect to the Elasticsearch Cluster and Create an Index


In [None]:
# Name of the index used by the indexing loop, the count check and the search.
index_name = 'my_index_name'

# Client for the Elasticsearch node reachable at localhost:9200.
es = Elasticsearch(hosts="http://localhost:9200")

# Create the index with default settings; the response is the cell's output.
es.indices.create(index=index_name)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index_name'})

### Check the Cluster Status

In [None]:
# Ask the cluster for its basic information and convert the response to a
# plain dict; as the cell's last expression the dict is what gets displayed.
dict(es.info())

## Indexing Documents 

In [None]:
# Index the documents one request per document, with a progress bar.
# Iterating over the items uses each document's own key as its id, so it no
# longer assumes the keys are exactly "0".."n-1"; the manual `i = i + 1` was
# dead code because the `for` statement rebinds the loop variable each pass.
for doc_id, doc in tqdm(data.items()):
    es.index(index=index_name, id=doc_id, document=doc)

100%|█████████████████████████████████████| 12202/12202 [03:41<00:00, 55.02it/s]


###  Bulk API

In [None]:
from elasticsearch.helpers import bulk

def bulk_sync(index='ta3_index'):
    """Index every document in ``data`` through the bulk helper.

    Args:
        index: Name of the target index. Defaults to ``'ta3_index'``, the
            value that was previously hard-coded, so existing calls behave
            the same.
            NOTE(review): the other cells work on ``index_name``; call
            ``bulk_sync(index_name)`` if the same index is intended - confirm.

    Returns:
        The result of ``elasticsearch.helpers.bulk`` (per its documentation,
        a ``(success_count, errors)`` tuple).
    """
    # A generator avoids building the full list of actions in memory; the
    # helper accepts any iterable of actions.
    actions = (
        {
            '_index': index,
            '_id': doc_id,
            '_source': doc,
        }
        for doc_id, doc in data.items()
    )
    return bulk(es, actions)


In [None]:
# Measure how long the bulk indexing takes. perf_counter() is a monotonic,
# high-resolution clock meant for intervals; time.time() follows the wall
# clock and can jump if the system time is adjusted.
start = time.perf_counter()
bulk_sync()
end = time.perf_counter()
print("Indexing all documents took about {:.2f} seconds".format(end - start))

Indexing all documents took about 5.69 seconds


### Check the Index

In [None]:
# Ask Elasticsearch how many documents the index named by `index_name` holds.
es.count(index = index_name)

ObjectApiResponse({'count': 12202, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

## Boolean Retrieval 

In [None]:
# sample query: تحریم هسته‌ای ً آمریکا !ایران
# (roughly: sanctions nuclear "America" !Iran - presumably the plain words go
# into `match`, the quoted phrase into `match_phrase` and the `!`-prefixed
# word into `must_not`; confirm against the assignment.)
#
# Template of a bool query on the `content` field. Every "query" string is
# left empty on purpose and has to be filled in before searching.
query = {
    "bool": {
        # Clauses a document should match.
        "should": [
            {"match": {"content": {"query": ""}}},         # add query word
            {"match_phrase": {"content": {"query": ""}}},  # add query word
        ],
        # Clauses a document must not match.
        "must_not": [
            {"match": {"content": {"query": ""}}},         # add query word
        ],
    },
}

### Search query


In [None]:
# Run the bool query against the index, requesting only the `url` field of
# each hit's source, and keep the response as a plain dict.
res = dict(es.search(index=index_name, query=query, _source=["url"]))

### Results

In [None]:
# Summary line: total number of matches and the reported `took` value
# divided by 1000 (printed with an "s" unit).
hits = res['hits']
total_matches = hits['total']['value']
elapsed = res['took']/1000
print("{} results in {} s: ".format(total_matches, elapsed))

# One URL per returned hit.
for hit in hits['hits']:
    print(hit['_source']['url'])