<a href="https://colab.research.google.com/github/nxxk23/AI-Engineer/blob/main/sample/elasticsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install elasticsearch pandas transformers torch sentence_transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/524.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.6/524.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/245.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from getpass import getpass

import pandas as pd
import torch
import torch.nn.functional as F
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

In [6]:
# Read the job spreadsheet into a DataFrame and display it.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably has to be
# mounted before this cell runs - no mount call is visible in this notebook.
jobdb_xlsx_path = '/content/drive/MyDrive/AIEngineer/elasticsearch/jobdb.xlsx'
df = pd.read_excel(jobdb_xlsx_path)
df

Unnamed: 0,No,Position,Description,Responsibilities,Qualification
0,1,Network Engineer,Responsible for managing and maintaining the c...,"Design, implement, and maintain network system...","Bachelor’s degree in Computer Science, Informa..."
1,2,Software Developer,Develop and maintain web-based applications an...,"Write clean, scalable code for web application...","Bachelor’s degree in Computer Science, Softwar..."
2,3,IT Support Specialist,Provide technical assistance and support to co...,"Diagnose hardware, software, and network probl...",Diploma or Bachelor’s degree in Information Te...
3,4,Digital Marketing Specialist,"Plan, execute, and optimize digital marketing ...",Develop and manage online marketing campaigns ...,"Bachelor’s degree in Marketing, Communications..."
4,5,Project Manager,Oversee and coordinate the successful executio...,"Develop project plans, timelines, and budgets....","Bachelor’s degree in Business, IT, or a relate..."
5,6,System Administrator,Manage and maintain the company's server infra...,"Install, configure, and maintain server hardwa...","Bachelor’s degree in Information Technology, C..."
6,7,Business Analyst,Analyze business processes and data to provide...,Gather and document business requirements from...,"Bachelor’s degree in Business, Economics, or r..."
7,8,Cybersecurity Specialist,Ensure the company’s digital assets and networ...,Implement security measures and protocols.\nMo...,"Bachelor’s degree in Cybersecurity, Informatio..."
8,9,Data Scientist,Use advanced data analytics and machine learni...,"Collect, process, and analyze large datasets.\...","Bachelor’s or Master’s degree in Data Science,..."
9,10,Sales Manager,Lead the sales team and drive revenue growth b...,Develop and implement sales strategies.\nLead ...,"Bachelor’s degree in Business, Marketing, or r..."


In [9]:
# Show the column labels; these are the keys read from each row when documents are built for indexing.
df.columns

Index(['No', 'Position', 'Description', 'Responsibilities', 'Qualification'], dtype='object')

In [10]:
# Index definition passed to es.indices.create: an integer key, four full-text
# fields and one dense-vector field. `dims` is the vector length declared for the
# "embedding" field, which is filled from rag_encode() when rows are indexed.
mappings = {
    "mappings": {
        "properties": {
            "No": {"type": "integer"},
            # The four free-text columns all share the same field definition.
            **{
                field: {"type": "text"}
                for field in ("Position", "Description", "Responsibilities", "Qualification")
            },
            "embedding": {"type": "dense_vector", "dims": 384},
        }
    }
}

In [16]:
# Elasticsearch credentials.
# Read from the environment (ES_USER / ES_PASSWORD) and fall back to an interactive
# prompt, so that no secret is stored in the notebook source or its saved outputs.
# NOTE(review): the password that used to be written here in clear text should be
# treated as leaked and rotated.
user = os.environ.get("ES_USER") or input("Elasticsearch user: ")
passwords = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")

# Elasticsearch setup with basic authentication and HTTPS
es = Elasticsearch(
    hosts=[{'host': 'elk.manageai.co.th', 'port': 443, 'scheme': 'https'}],
    basic_auth=(user, passwords)  # single source of truth: the variables above
)

# Load model from HuggingFace Hub (tokenizer and model must come from the same checkpoint)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')




In [19]:
def test_elasticsearch_connection():
    """Ping the notebook-level ``es`` client and print whether the cluster answered.

    Prints "Connection successful!" only when ``es.ping()`` returns a truthy value.
    A falsy return value or any raised exception is reported as an "Error: ..." line.
    Returns None in every case.
    """
    try:
        # ping() reports most failures by returning False instead of raising,
        # so the return value has to be inspected explicitly.
        reachable = es.ping()
    except Exception as e:
        print(f"Error: {e}")
        return

    if reachable:
        print("Connection successful!")
    else:
        print("Error: ping() returned False - host unreachable or credentials rejected")

test_elasticsearch_connection()


Connection successful!


In [11]:
# Mean pooling that honours the attention mask, so masked-out positions do not skew the average
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings in ``model_output[0]`` weighted by ``attention_mask``.

    The mask gets a trailing axis and is expanded to the embeddings' size, the
    weighted embeddings are summed over dimension 1 and divided by the summed mask.
    The divisor is clamped to at least 1e-9 so an all-zero mask does not divide by zero.
    (Presumably the tensors are laid out as (batch, tokens, hidden) - confirm with the model.)
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / weight_total

# Turn text into a normalised embedding using the notebook-level tokenizer and model
def rag_encode(text):
    """Embed ``text`` and return the result as a plain Python list.

    Pipeline: tokenize (padding, truncation, PyTorch tensors) -> forward pass with
    gradients disabled -> mask-aware mean pooling -> L2 normalisation along dim 1.

    Note: the final ``squeeze()`` removes every size-1 dimension. For a single
    string this is expected to give a flat list of floats; for other inputs the
    nesting depends on how many rows the tokenizer produced.
    """
    batch = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only, no autograd graph needed
        outputs = model(**batch)
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1).squeeze().tolist()

## Embed rows and index them into Elasticsearch

In [23]:
# Make sure the target index exists, creating it from `mappings` when it does not
def create_index_if_not_exists(index_name):
    """Create ``index_name`` with the notebook-level ``mappings`` unless it already exists."""
    if es.indices.exists(index=index_name):
        return  # already present: leave the existing index untouched
    es.indices.create(index=index_name, body=mappings)

def index_row_by_row(df, index_name, id_column='No'):
    """Embed and index every row of ``df`` into ``index_name``, one request per row.

    The index is created first if it does not exist. Each document carries the
    ``No``, ``Position``, ``Description``, ``Responsibilities`` and ``Qualification``
    values of the row plus an ``embedding`` computed by ``rag_encode`` over the four
    text columns joined with single spaces.

    Args:
        df: DataFrame providing the columns listed above.
        index_name: Name of the target Elasticsearch index.
        id_column: Column whose value (as a string) becomes the document ``_id``.
            With a stable id, running the cell again overwrites the same documents
            instead of adding a second copy of every row. Rows that share a value
            in this column overwrite each other. Pass ``None`` to let Elasticsearch
            generate ids, as this function did before.

    Returns:
        None. A row that fails is reported with ``print`` and the loop moves on.
    """
    create_index_if_not_exists(index_name)

    # Index each row individually
    for _, row in df.iterrows():
        try:
            # Prepare document for indexing
            document = {
                "No": row['No'],
                "Position": row['Position'],
                "Description": row['Description'],
                "Responsibilities": row['Responsibilities'],
                "Qualification": row['Qualification'],
                "embedding": rag_encode(f"{row['Position']} {row['Description']} {row['Responsibilities']} {row['Qualification']}")
            }

            # id=None keeps the auto-generated-id behaviour
            doc_id = None if id_column is None else str(row[id_column])

            # Index the document
            es.index(index=index_name, id=doc_id, body=document)
        except Exception as e:
            # Best effort: report the failing row and continue with the next one.
            print(f"Error indexing row: {row} - {str(e)}")

# Process and index the DataFrame row by row.
# Sends every row of `df` (with its embedding) to the 'internal-manageai-jobdb-index' index;
# the index is created from `mappings` first if it does not exist yet.
index_row_by_row(df, 'internal-manageai-jobdb-index')

## Search

In [35]:
# Run a search request and hand back only the list of hits
def query_elasticsearch(index_name, query_body):
    """Search ``index_name`` with ``query_body`` and return ``response['hits']['hits']``.

    Any exception (from the request or from reading the response) is printed
    and turned into an empty list, so callers can always iterate the result.
    """
    try:
        hits = es.search(index=index_name, body=query_body)['hits']['hits']
    except Exception as e:
        print(f"Error querying index: {str(e)}")
        hits = []
    return hits

# Example: look documents up by job title using a `match` query on "Position"
def search_by_position(position, index_name):
    """Print the ``_source`` of every hit whose ``Position`` field matches ``position``."""
    match_query = {"query": {"match": {"Position": position}}}
    for hit in query_elasticsearch(index_name, match_query):
        print(f"Found document: {hit['_source']}")


In [40]:
# Run a `match` query for "Business" on the Position field of the job index and print each hit.
search_by_position("Business", 'internal-manageai-jobdb-index')

Found document: {'No': 7, 'Position': 'Business Analyst', 'Description': 'Analyze business processes and data to provide insights and support decision-making for operational improvements.', 'Responsibilities': 'Gather and document business requirements from stakeholders.\nAnalyze data and produce detailed reports.\nIdentify areas for process improvement and recommend solutions.\nCollaborate with IT teams to ensure technical solutions meet business needs.\nPrepare project documentation and presentations for management.', 'Qualification': 'Bachelor’s degree in Business, Economics, or related field.\n2-4 years of experience as a business analyst or in a similar role.\nStrong analytical skills and experience with data analysis tools (Excel, SQL).\nExcellent communication and presentation skills.\nExperience with process modeling is an advantage.', 'embedding': [0.03277675062417984, 0.015531940385699272, -0.05410613864660263, 0.0005425480776466429, -0.08528139442205429, 0.01453491859138012,

In [42]:
# Check the index mapping
# Fetches the mapping currently stored for the job index and prints it, so it can be
# compared with the `mappings` dict that was passed when the index was created.
mapping = es.indices.get_mapping(index="internal-manageai-jobdb-index")
print(mapping)


{'internal-manageai-jobdb-index': {'mappings': {'properties': {'Description': {'type': 'text'}, 'No': {'type': 'integer'}, 'Position': {'type': 'text'}, 'Qualification': {'type': 'text'}, 'Responsibilities': {'type': 'text'}, 'embedding': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine'}}}}}


In [43]:
# Quick look at what is stored in an index
def check_documents(index_name):
    """Print the ``_source`` of each hit a ``match_all`` query returns for ``index_name``.

    No ``size`` is set in the query, so the number of documents shown is whatever
    the server returns by default.
    """
    everything = {"query": {"match_all": {}}}
    for hit in query_elasticsearch(index_name, everything):
        print(f"Document: {hit['_source']}")

# Check documents in your index: print what a match_all query returns for the job index.
check_documents("internal-manageai-jobdb-index")


Document: {'No': 1, 'Position': 'Network Engineer', 'Description': 'Responsible for managing and maintaining the company’s network infrastructure, ensuring seamless connectivity and optimal performance.', 'Responsibilities': 'Design, implement, and maintain network systems (LAN, WAN, Internet).\nMonitor network performance and ensure availability and reliability.\nTroubleshoot network issues and provide timely solutions.\nWork with other teams to support infrastructure needs.\nConduct regular network security assessments and audits.', 'Qualification': 'Bachelor’s degree in Computer Science, Information Technology, or related field.\n2-5 years of experience in network management and support.\nStrong knowledge of routing, switching, and firewall configurations.\nFamiliarity with Cisco, Juniper, or similar hardware.\nRelevant certifications (CCNA, CCNP) are a plus.', 'embedding': [-0.009150550700724125, -0.001943079405464232, 0.02472006157040596, 0.028129635378718376, -0.04223489388823509

In [45]:
# Reuse check_documents (defined in the preceding cell) to inspect a second index.
# NOTE(review): the printed documents of this index contain customer and sales-person
# names - clear the cell output before sharing the notebook.
check_documents("internal-manageai-rag-costsheet")


Document: {'type': 'SO', 'Sonumber': 'SO13-20240400933', 'CS': 'CS-202404094555', 'ContractStartDate': '01/05/2024', 'ContractEndDate': '30/04/2025', 'Customer_id': '18565', 'Customer_name': 'ไอเน็ต แมเนจด์ เซอร์วิสเซส', 'Sale_id': '57005', 'Sale_name': 'นางสาวพัชราภรณ์ แนบเนียน', 'total': '6,000.00', 'Original': 'Cloud-VMware', 'Service': 'Cloud-VMware', 'Internal': '6,000.00', 'External': '0', 'embedding': [-0.004548392724245787, -0.03803795948624611, -0.04036878049373627, -0.005431054625660181, -0.009913560934364796, 0.006202440708875656, 0.030498908832669258, 0.024065623059868813, 0.0037603636737912893, -0.018169982358813286, 0.0675744041800499, -0.045115210115909576, 0.003096480155363679, 0.008769542910158634, -0.04550839960575104, 0.03064529411494732, -0.004642002750188112, -0.008562698028981686, -0.033708494156599045, 0.019700434058904648, 0.06096542626619339, -0.07426564395427704, 0.05289080739021301, -0.05119723826646805, -0.006225053686648607, 0.00915919803082943, -0.04037896