In [1]:
import elasticsearch
import elasticsearch.helpers
import json

In [4]:
# Address of the Elasticsearch node on this machine; the `es` client created
# here is the one used by the later cells.
es_host = "http://localhost:9200"
es = elasticsearch.Elasticsearch(hosts=[es_host])

In [5]:
# Fetch basic cluster information (name, version, ...) to confirm that the
# client can actually reach the server.
es.info()

ObjectApiResponse({'name': 'a67b25461fc0', 'cluster_name': 'docker-cluster', 'cluster_uuid': '6UTfLDoPTOC9l1kunEtm8w', 'version': {'number': '8.9.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'e8179018838f55b8820685f92e245abef3bddc0f', 'build_date': '2023-08-31T02:43:14.210479707Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
import elasticsearch
import elasticsearch.helpers
import json

# Name of the Elasticsearch index that the cells in this notebook write to and query.
index_name = 'laptops'

def read_documents(file_name, index=None):
    """
    Returns a generator of documents to be indexed by elastic, read from file_name.

    Parameters
    ----------
    file_name : str or path-like
        Path to a UTF-8 encoded JSON file whose top-level value is iterated
        to obtain the documents (normally a list of objects).
    index : str, optional
        Index name written into each action's "_index" entry. When omitted,
        the notebook-level ``index_name`` is used (the original behaviour).

    Yields
    ------
    dict
        One bulk action per document, shaped as
        {"_index": <index>, "_source": <document>}.
    """
    # JSON is UTF-8 by specification; without an explicit encoding open()
    # falls back to the platform locale and can mis-decode non-ASCII text.
    with open(file_name, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    # The file is fully parsed at this point, so it is closed before any
    # document is handed to the consumer instead of staying open while the
    # generator is being drained.
    target_index = index_name if index is None else index
    for laptop in data:
        yield {"_index": target_index, "_source": laptop}

def create_index(es, index_name, body=None):
    """
    (Re)create an index from scratch.

    An existing index called ``index_name`` is deleted first, then a new one
    is created.

    Parameters
    ----------
    es : elasticsearch.Elasticsearch
        Client to use. Must provide ``options()`` (elasticsearch-py 8.x API).
    index_name : str
        Name of the index to drop and create again.
    body : dict, optional
        Passed as ``body`` to ``es.indices.create``. ``None`` (the default)
        is sent as an empty dict.
    """
    # A None default avoids sharing one mutable dict between calls.
    if body is None:
        body = {}
    # delete index when it already exists; responses with status 400 or 404
    # are tolerated instead of raising. The per-request `ignore=` argument is
    # deprecated in the 8.x client, so the setting goes through options().
    es.options(ignore_status=[400, 404]).indices.delete(index=index_name)
    # create the index
    es.indices.create(index=index_name, body=body)
                
def index_documents(es, collection_file_name, index_name, body=None):
    """
    Recreate an index and bulk-load it with the documents of a JSON file.

    Parameters
    ----------
    es : elasticsearch.Elasticsearch
        Client to use. Must provide ``options()`` (elasticsearch-py 8.x API).
    collection_file_name : str
        Path of the JSON file handed to ``read_documents``.
    index_name : str
        Index to (re)create and to load the documents into.
    body : dict, optional
        Index creation options forwarded to ``create_index``. ``None`` (the
        default) is forwarded as an empty dict.

    Returns
    -------
    The value returned by ``elasticsearch.helpers.bulk``.
    """
    create_index(es, index_name, {} if body is None else body)
    # read_documents stamps every action with the notebook-level index name,
    # and an action's own "_index" takes precedence over the `index=` default
    # given to bulk. Re-stamp each action so the documents land in the index
    # that was just created, whatever name the caller passed.
    actions = (
        {**doc, "_index": index_name}
        for doc in read_documents(collection_file_name)
    )
    # bulk index the documents from file_name; the timeout is set through
    # options() because the per-request `request_timeout=` argument is
    # deprecated in the 8.x client.
    return elasticsearch.helpers.bulk(
        es.options(request_timeout=30),
        actions,
        index=index_name,
        chunk_size=2000,
    )

In [16]:
# (Re)build the index from the dataset file. index_documents goes through
# create_index, which deletes an existing index of that name first.
body = dict()  # no indexing options (leave default)
index_documents(
    es=es,
    collection_file_name='data/6-laptops-dataset.json',
    index_name=index_name,
    body=body,
)

  es.indices.delete(index=index_name, ignore=[400, 404])
  return elasticsearch.helpers.bulk(


(4835, [])

In [7]:
# simple_query_string request body for the text "mac os".
# NOTE(review): the name mentions "molecule" although this notebook is about
# laptops, and no visible cell uses it - possibly a leftover; confirm.
count_molecule_query = {"query": {"simple_query_string": {"query": "mac os"}}}

In [8]:
term = 'mac os'
# query_string search where every term is required (default operator AND);
# track_total_hits asks for the total number of matches to be tracked.
body = {
    "track_total_hits": True,
    "query": {
        "query_string": {
            "query": term,
            "default_operator": "AND",
            "auto_generate_synonyms_phrase_query": True,
        }
    },
}
# Use the notebook-level index_name, like the other search/count cells,
# instead of repeating the literal index name here.
result = es.search(index=index_name, body=body)

In [9]:
# Display the list of hits returned by the last search (each entry carries
# its _index, _id, _score and the stored _source document).
result["hits"]["hits"]

[{'_index': 'laptops',
  '_id': 'JXpb7YsBJ46_XTzAkxDS',
  '_score': 12.182031,
  '_ignored': ['reviews.reviewText.keyword'],
  '_source': {'source': 'Amazon',
   'imageURLs': ['https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX355_.jpg',
    'https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX450_.jpg',
    'https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX425_.jpg',
    'https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX466_.jpg',
    'https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX522_.jpg',
    'https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX569_.jpg',
    'https://m.media-amazon.com/images/I/81VbWDN53oL._AC_SX679_.jpg'],
   'productURL': 'https://www.amazon.com/Apple-MacBook-ME294LL-15-4-Inch-Refurbished/dp/B079C8TPRX',
   'reviewURL': 'https://www.amazon.com/Apple-MacBook-ME294LL-15-4-Inch-Refurbished/product-reviews/B079C8TPRX/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&amp;reviewerType=all_reviews',
   'sku': 'B079C8TPRX',
   'brand': 'Apple',
   'model': '

In [85]:
# Minimal single-field example (defined here, but not the query sent below).
lucene_query = "brand:Dell"

# A longer example of the field:value syntax, kept for reference:
# operatingSystem:Linux AND systemMemoryRam:16gb AND totalStorageCapacity:500gb AND screenSize:16 AND batteryLife:long AND productWeight:[1 TO 100]
test_query = 'systemMemoryRam:16~ AND totalStorageCapacity:500gb~'

# Request body: a query_string query over test_query, with "brand" as the
# default field to search.
search_body = {
    "query": {"query_string": {"query": test_query, "default_field": "brand"}}
}

# Run the search against the notebook's index.
result = es.search(index=index_name, body=search_body)

# Print the stored document of every returned hit, one per line.
for hit in result["hits"]["hits"]:
    print(hit["_source"])

{'source': 'Amazon', 'imageURLs': ['https://m.media-amazon.com/images/I/51RourAB39L._AC_SY355_.jpg', 'https://m.media-amazon.com/images/I/51RourAB39L._AC_SY450_.jpg', 'https://m.media-amazon.com/images/I/51RourAB39L._AC_SX425_.jpg', 'https://m.media-amazon.com/images/I/51RourAB39L._AC_SX466_.jpg', 'https://m.media-amazon.com/images/I/51RourAB39L._AC_SX522_.jpg', 'https://m.media-amazon.com/images/I/51RourAB39L._AC_SX569_.jpg', 'https://m.media-amazon.com/images/I/51RourAB39L._AC_SX679_.jpg'], 'productURL': 'https://www.amazon.com/Dell-Latitude-5430-Laptop-i5-1245U/dp/B00D98CY0O', 'reviewURL': 'https://www.amazon.com/Dell-Latitude-5430-Laptop-i5-1245U/product-reviews/B00D98CY0O/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&amp;reviewerType=all_reviews', 'sku': 'B00D98CY0O', 'brand': 'Dell', 'model': 'Latitude', 'modelNumber': 'Latitude 5430', 'title': 'Dell Latitude 5430 Laptop - 14&#34; FHD IPS Display - 3.3 GHz Intel Core i5-1245U 10-Core (12th Gen) - 16GB - 512GB SSD - Windows 11 Pro', 'price'

In [10]:
# match_all selects every document, so the count covers the whole index.
count_query = {"query": {"match_all": {}}}

# The count response is indexed by "count" to get the number of documents.
document_count = es.count(index=index_name, body=count_query)
num_documents = document_count["count"]

print(f"Number of documents in index '{index_name}': {num_documents}")

Number of documents in index 'laptops': 4835
