In [1]:
import json
import uuid

import pandas as pd



In [5]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# make it importable, so the `src` package imported below can be found.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Now you can import your module
from src.api.repository.search import SearchRepo

In [6]:
# Instantiate the project's search repository. The captured output below shows
# it reporting a connection to an Elasticsearch cluster on construction.
es = SearchRepo()

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': '9I9q3mFYSISbvBg75cK23A',
 'name': '5eeda383872f',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


In [7]:
# Name of the index that the load step below passes to the search repository.
idx = "vehicle"

# 1. Extract Data

In [8]:
# Load the pre-processed records; each JSON record becomes one DataFrame row.
df = pd.read_json("./data/processed_vehicle_machine_data.json", orient="records")


In [9]:
# Display the loaded frame as a quick sanity check of the extract step.
df

Unnamed: 0,description,facets
0,Experience the thrill of driving with this 200...,"{'type': 'Vehicle', 'category': 'Coupe', 'bran..."
1,Experience the future of driving with the 2014...,"{'type': 'Vehicle', 'category': 'Hatchback', '..."
2,Discover the power and reliability of the JCB ...,"{'type': 'Machine', 'category': 'Hydraulic Exc..."
3,"Discover the Ezee-On 5537-56-13-10, a robust c...","{'type': 'Machine', 'category': 'Cultivator', ..."
4,Discover the power and reliability of the Cate...,"{'type': 'Machine', 'category': 'Shovel', 'bra..."
...,...,...
4995,Experience the thrill of the road with this 20...,"{'type': 'Vehicle', 'category': 'Sedan', 'bran..."
4996,Experience the power and efficiency of the New...,"{'type': 'Machine', 'category': 'Combine', 'br..."
4997,Discover the rugged reliability of the Case IH...,"{'type': 'Machine', 'category': '4WD Tractor',..."
4998,Experience the thrill of driving with this sty...,"{'type': 'Vehicle', 'category': 'Coupe, Conver..."


In [10]:
# Look at the facet dictionary of the first record to see the raw schema.
df["facets"].iloc[0]

{'type': 'Vehicle',
 'category': 'Coupe',
 'brand': 'Lotus',
 'model': 'Exige S',
 'year': '2008',
 'engine_type': 'I4',
 'engine_size': '1.8L',
 'transmission': 'Manual',
 'drive_type': 'RWD',
 'fuel_type': 'Gasoline',
 'exterior_color': 'Red',
 'interior_color': 'Standard',
 'trim_level': 'Standard',
 'notable_features': ['Leather seats', 'Navigation', 'Bluetooth'],
 'safety_features': ['ABS', 'Airbags', 'Stability control'],
 'price': '5700'}

# 2. Transform Data

In [11]:
def process_hierarchy(hierarchy, field_name="type"):
    """Expand an ordered hierarchy into cumulative, slash-separated path facets.

    Args:
        hierarchy: Ordered iterable of dicts, outermost level first. Each dict
            must provide "name" and "type"; "id" is optional.
        field_name: Value stored under "name" in every produced facet.

    Returns:
        One dict per hierarchy level with the keys "id", "name", "value" and
        "type", where "value" is the path from the first level to that level.
    """
    facets = []
    path = ""
    for node in hierarchy:
        # Only prefix with "/" once a non-empty path has been accumulated.
        if path:
            path = f"{path}/{node['name']}"
        else:
            path = node["name"]
        facets.append(
            dict(id=node.get("id"), name=field_name, value=path, type=node["type"])
        )
    return facets

In [12]:
def flatten_list(nested_list):
    """Return a flat list of the non-list items found in ``nested_list``.

    Nested ``list`` instances are expanded depth-first, preserving order.
    Anything that is not a ``list`` (strings, tuples, numbers, ...) is kept
    as a single item.
    """

    def _walk(items):
        for element in items:
            if isinstance(element, list):
                # Descend into sub-lists and emit their leaves in place.
                yield from _walk(element)
            else:
                yield element

    return list(_walk(nested_list))

In [13]:
def transform(input: dict):
    """Convert one raw record into the document structure used for indexing.

    Args:
        input: Mapping with a "description" text and a "facets" mapping. The
            facets must contain "type", "category", "brand" and "model".

    Returns:
        A dict with the keys "termFacetData", "numberFacetData",
        "pathFacetData", "search_data", "completion_terms",
        "suggestion_terms" and "id" (a freshly generated UUID4 string).
    """
    numeric_keys = ("price",)
    path_keys = ("type", "category", "brand", "model")

    text = input["description"]
    attrs = input["facets"]

    # Facets that are neither numeric nor part of a path hierarchy are terms.
    terms = []
    for key, value in attrs.items():
        if key in numeric_keys or key in path_keys:
            continue
        terms.append({"name": key, "value": value})

    numbers = []
    for key, value in attrs.items():
        if key in numeric_keys:
            numbers.append({"name": key, "value": value})

    # Two hierarchies, each rooted at a fixed label: Type/<type>/<category>
    # and Brand/<brand>/<model>.
    type_levels = [
        {"name": "Type", "type": "root"},
        {"name": attrs["type"], "type": "type"},
        {"name": attrs["category"], "type": "category"},
    ]
    brand_levels = [
        {"name": "Brand", "type": "root"},
        {"name": attrs["brand"], "type": "brand"},
        {"name": attrs["model"], "type": "model"},
    ]
    paths = [
        process_hierarchy(type_levels, field_name="type"),
        process_hierarchy(brand_levels, field_name="brand"),
    ]

    # Every non-numeric facet value (list values flattened) feeds the boosted
    # text field as well as the completion and suggestion terms.
    boosted_terms = flatten_list(
        [value for key, value in attrs.items() if key not in numeric_keys]
    )
    boosted_text = " ".join(boosted_terms)

    doc_id = str(uuid.uuid4())

    return {
        "termFacetData": terms,
        "numberFacetData": numbers,
        "pathFacetData": paths,
        "search_data": {
            "full_text": text,
            "full_text_boosted": boosted_text,
        },
        "completion_terms": [{"tag": term} for term in boosted_terms],
        "suggestion_terms": boosted_terms,  # to improve
        "id": doc_id,
    }


In [14]:
# Spot-check the transformation on the first row.
transform(df.iloc[0])

{'termFacetData': [{'name': 'year', 'value': '2008'},
  {'name': 'engine_type', 'value': 'I4'},
  {'name': 'engine_size', 'value': '1.8L'},
  {'name': 'transmission', 'value': 'Manual'},
  {'name': 'drive_type', 'value': 'RWD'},
  {'name': 'fuel_type', 'value': 'Gasoline'},
  {'name': 'exterior_color', 'value': 'Red'},
  {'name': 'interior_color', 'value': 'Standard'},
  {'name': 'trim_level', 'value': 'Standard'},
  {'name': 'notable_features',
   'value': ['Leather seats', 'Navigation', 'Bluetooth']},
  {'name': 'safety_features',
   'value': ['ABS', 'Airbags', 'Stability control']}],
 'numberFacetData': [{'name': 'price', 'value': '5700'}],
 'pathFacetData': [[{'id': None,
    'name': 'type',
    'value': 'Type',
    'type': 'root'},
   {'id': None, 'name': 'type', 'value': 'Type/Vehicle', 'type': 'type'},
   {'id': None,
    'name': 'type',
    'value': 'Type/Vehicle/Coupe',
    'type': 'category'}],
  [{'id': None, 'name': 'brand', 'value': 'Brand', 'type': 'root'},
   {'id': None

In [15]:
# Transform every record. `to_dict(orient="records")` yields one plain dict per
# row, which matches `transform`'s `dict` parameter and avoids building a
# pandas Series per row (and an unused index) as `iterrows()` does.
documents = [transform(record) for record in df.to_dict(orient="records")]


In [16]:
# Show only the first transformed document; echoing the whole list would dump
# every document into the notebook output.
documents[:1]

[{'termFacetData': [{'name': 'year', 'value': '2008'},
   {'name': 'engine_type', 'value': 'I4'},
   {'name': 'engine_size', 'value': '1.8L'},
   {'name': 'transmission', 'value': 'Manual'},
   {'name': 'drive_type', 'value': 'RWD'},
   {'name': 'fuel_type', 'value': 'Gasoline'},
   {'name': 'exterior_color', 'value': 'Red'},
   {'name': 'interior_color', 'value': 'Standard'},
   {'name': 'trim_level', 'value': 'Standard'},
   {'name': 'notable_features',
    'value': ['Leather seats', 'Navigation', 'Bluetooth']},
   {'name': 'safety_features',
    'value': ['ABS', 'Airbags', 'Stability control']}],
  'numberFacetData': [{'name': 'price', 'value': '5700'}],
  'pathFacetData': [[{'id': None,
     'name': 'type',
     'value': 'Type',
     'type': 'root'},
    {'id': None, 'name': 'type', 'value': 'Type/Vehicle', 'type': 'type'},
    {'id': None,
     'name': 'type',
     'value': 'Type/Vehicle/Coupe',
     'type': 'category'}],
   [{'id': None, 'name': 'brand', 'value': 'Brand', 'type':

# 3. Load Data

In [17]:
# Read the index settings and mappings. The encoding is passed explicitly so
# the JSON files are decoded as UTF-8 regardless of the platform's default
# locale encoding.
with open("mapping/settings.json", "r", encoding="utf-8") as file:
    settings = json.load(file)["settings"]

with open("mapping/mappings.json", "r", encoding="utf-8") as file:
    mappings = json.load(file)["mappings"]


In [18]:
# Send the transformed documents, together with the index settings and
# mappings, to the repository's reindex call.
resp = es.reindex(
    index=idx,
    documents=documents,
    mappings=mappings,
    settings=settings,
    # pipeline="ingest",
)

# Report how many per-document results came back and the reported duration.
indexed_count = len(resp["items"])
elapsed_ms = resp["took"]
print(f"Index with {indexed_count} documents created in {elapsed_ms} milliseconds.")

Index 'vehicle' created successfully.
5000 documents inserted into 'vehicle'
Index with 5000 documents created in 239017 milliseconds.


In [19]:
# Inspect the raw body of the reindex response.
resp.body

{'errors': False,
 'took': 239017,
 'items': [{'index': {'_index': 'vehicle',
    '_id': 'SRvIvZMBPskx_3jzO4fF',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'vehicle',
    '_id': 'ShvIvZMBPskx_3jzO4fF',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'vehicle',
    '_id': 'SxvIvZMBPskx_3jzO4fF',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'vehicle',
    '_id': 'TBvIvZMBPskx_3jzO4fF',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 3,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_i

In [20]:
# Smoke test: fetch a single document from the index that was just built.
# Reuse `idx` rather than repeating the literal name so this query always
# targets the same index as the reindex step.
resp = es.search(index=idx, size=1)

Search results from 'vehicle' retrieved successfully in 60 ms.


In [21]:
# Inspect the raw body of the search response.
resp.body

{'took': 60,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3258, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'vehicle',
    '_id': 'SRvIvZMBPskx_3jzO4fF',
    '_score': 1.0,
    '_source': {'termFacetData': [{'name': 'year', 'value': '2008'},
      {'name': 'engine_type', 'value': 'I4'},
      {'name': 'engine_size', 'value': '1.8L'},
      {'name': 'transmission', 'value': 'Manual'},
      {'name': 'drive_type', 'value': 'RWD'},
      {'name': 'fuel_type', 'value': 'Gasoline'},
      {'name': 'exterior_color', 'value': 'Red'},
      {'name': 'interior_color', 'value': 'Standard'},
      {'name': 'trim_level', 'value': 'Standard'},
      {'name': 'notable_features',
       'value': ['Leather seats', 'Navigation', 'Bluetooth']},
      {'name': 'safety_features',
       'value': ['ABS', 'Airbags', 'Stability control']}],
     'numberFacetData': [{'name': 'price', 'value': '5700'}],
     'pathFacetD