In [1]:
import json
import os
import uuid
import pandas as pd
import numpy as np
import opensearch
from opensearch import OpenSearch
from opensearch import helpers
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from dotenv import load_dotenv

# Load variables from secret.env into the process environment; the OpenSearch
# client classes below read ENDPOINT, USERNAME and PASSWORD via os.getenv.
# NOTE(review): python-dotenv does not override variables that are already set
# unless override=True is passed -- confirm an OS-provided USERNAME (set by
# default on Windows) cannot shadow the value from secret.env.
load_dotenv("secret.env")

class Reader(object):
    """Load the leading rows of a CSV file into a DataFrame.

    Args:
        file_name: Path of the CSV file; handed unchanged to
            ``pandas.read_csv``.
        max_rows: Maximum number of rows to load (size of the first chunk
            read from the file). Defaults to 3000.
    """

    def __init__(self, file_name, max_rows=3000):
        self.file_name = file_name
        self.max_rows = max_rows

    def run(self):
        """Read the first ``max_rows`` rows and blank out missing values.

        Returns:
            pandas.DataFrame: The first chunk of the file with every missing
            value replaced by an empty string.
        """
        # Only the first chunk is wanted. Using the chunked reader as a
        # context manager closes the underlying file handle as soon as that
        # chunk has been read instead of leaving it open.
        with pd.read_csv(self.file_name, chunksize=self.max_rows) as chunks:
            first_chunk = next(chunks)
        return first_chunk.fillna("")

class Tokenizer(object):
    """Turn a piece of text into a sentence-embedding vector.

    The ``all-MiniLM-L6-v2`` SentenceTransformer model is loaded when the
    instance is created.
    """

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def get_token(self, documents):
        """Return the embedding of ``documents`` as a flat Python list.

        Args:
            documents: The text to embed. It is wrapped in a one-element list
                before being passed to ``model.encode``.

        Returns:
            list: The flattened encoder output converted to built-in Python
            numbers.
        """
        embedding = self.model.encode([documents])
        # ndarray.tolist() already yields built-in Python numbers, so the
        # earlier list -> ndarray -> list round trip added nothing.
        return embedding.flatten().tolist()

class OpenSearchImports(object):
    """Index the rows of a DataFrame into OpenSearch with an embedding each.

    The connection is configured from the ``ENDPOINT``, ``USERNAME`` and
    ``PASSWORD`` environment variables.

    Args:
        df: DataFrame whose rows are indexed. Every row must provide a
            ``jobpost`` value, which is the text that gets embedded.
        index_name: Name of the target index. Defaults to ``'posting'``.
    """

    def __init__(self, df, index_name='posting'):
        # certifi was used below without ever being imported, so building the
        # client failed with NameError. Imported locally to keep this class
        # self-contained.
        import certifi

        self.df = df
        self.index_name = index_name
        self._tokenizer = None  # created on first use by _get_tokenizer()
        self.client = OpenSearch(
            [os.getenv("ENDPOINT")],
            http_auth=(os.getenv("USERNAME"), os.getenv("PASSWORD")),
            use_ssl=True,
            verify_certs=True,
            ca_certs=certifi.where())

    def _get_tokenizer(self):
        """Return the Tokenizer for this instance, creating it on first use."""
        if self._tokenizer is None:
            self._tokenizer = Tokenizer()
        return self._tokenizer

    def run(self):
        """Embed each row's ``jobpost`` text and index the row.

        Each row is sent as one document with the embedding stored under the
        ``vectors`` key.

        Returns:
            bool: True when every row was indexed (or there were no rows).
            False as soon as any step raises; the error is printed and the
            remaining rows are skipped.
        """
        records = self.df.to_dict("records")
        if not records:
            return True
        try:
            # Constructing a Tokenizer loads the embedding model, so it is
            # obtained once for the whole batch rather than once per row.
            tokenizer = self._get_tokenizer()
            for job in records:
                job["vectors"] = tokenizer.get_token(job["jobpost"])
                self.client.index(index=self.index_name, body=job)
        except Exception as e:
            print(f"Error: {e}")
            return False
        return True

class OpenSearchQuery(object):
    """Run k-NN searches against an OpenSearch index using text embeddings.

    Credentials are read from the ``USERNAME`` and ``PASSWORD`` environment
    variables.

    Args:
        index_name: Name of the index to search. Defaults to ``'posting'``.
        endpoint: OpenSearch endpoint. When omitted, the ``ENDPOINT``
            environment variable is read at construction time.
    """

    def __init__(self, index_name='posting', endpoint=None):
        # certifi was used below without ever being imported, so building the
        # client failed with NameError. Imported locally to keep this class
        # self-contained.
        import certifi

        self.index_name = index_name
        # Resolved here rather than in the signature: a default of
        # os.getenv(...) is evaluated once, when the class is defined, and
        # would miss any later change to the environment.
        self.endpoint = os.getenv("ENDPOINT") if endpoint is None else endpoint
        self._tokenizer = None  # created on first use by _get_tokenizer()
        self.client = OpenSearch(
            [self.endpoint],
            http_auth=(os.getenv("USERNAME"), os.getenv("PASSWORD")),
            use_ssl=True,
            verify_certs=True,
            ca_certs=certifi.where())

    def _get_tokenizer(self):
        """Return the Tokenizer for this instance, creating it on first use."""
        if self._tokenizer is None:
            self._tokenizer = Tokenizer()
        return self._tokenizer

    def run(self, query):
        """Embed ``query`` and return the matching documents.

        Args:
            query: Free-text search string.

        Returns:
            list[dict]: One dict per hit with the keys ``title``,
            ``log_file_path`` and ``line_number``. A value is None when the
            hit's ``_source`` does not contain the corresponding field.
        """
        # The tokenizer is kept on the instance so repeated searches do not
        # reload the embedding model.
        embeddings = self._get_tokenizer().get_token(query)
        search_query = {
            "size": 50,
            "_source": ["Title", "log_file_path", "line_number"],
            "query": {
                "knn": {
                    "vectors": {
                        "vector": embeddings,
                        "k": 20}}}}

        # Perform the search
        res = self.client.search(index=self.index_name, body=search_query, request_timeout=55)

        results = []
        for hit in res["hits"]["hits"]:
            # Tolerate hits that lack some of the requested fields instead
            # of failing the whole search with KeyError.
            source = hit.get("_source") or {}
            results.append({
                "title": source.get("Title"),
                "log_file_path": source.get("log_file_path"),
                "line_number": source.get("line_number"),
            })
        return results

# Import the log data into the OpenSearch index with embeddings.
# Reader hands file_name straight to pandas.read_csv, which expects a path.
# The script used to read the file itself and pass the file's *contents* as
# the file name, so the path is now given directly and pandas does the reading.
helper = Reader(file_name="log_file.txt")
df = helper.run()
helper_elk = OpenSearchImports(df=df)
# run() reports failure through its return value; surface it rather than
# silently continuing as if the whole file had been indexed.
if not helper_elk.run():
    print("Import did not complete; searching whatever is already indexed.")

# Perform a semantic search on the OpenSearch index
INPUT = input("Enter the Input Query ")
helper_query = OpenSearchQuery()
results = helper_query.run(INPUT)
print(results)

ModuleNotFoundError: No module named 'description'