In [2]:
import json
import csv
import re
import pandas as pd

In [4]:
def get_headers_from_csv(csv_file):
    """Lấy danh sách tiêu đề từ file CSV mẫu."""
    data = pd.read_csv(csv_file, nrows=0)
    headers = data.columns.tolist()
    return headers

def split_log_line(headers, log_line, line_id):
    """Tách dòng log và chuyển thành JSON theo định dạng yêu cầu"""
    parts = re.split(r'\s+', log_line.strip())
    log_block = {
        headers[0]: line_id,
        **{headers[i]: parts[i-1] for i in range(1, len(headers)-3)},
        headers[-3]: " ".join(parts[len(headers)-3:len(parts)]),
    }
    return log_block

def convert_log_to_json(log_file, csv_template, output_json):
    """Chuyển đổi file log thành JSON theo tiêu đề từ CSV mẫu."""
    headers = get_headers_from_csv(csv_template)  # Lấy tiêu đề từ CSV
    log_data = []

    with open(log_file, "r", encoding="utf-8") as file:
        for i,line in enumerate(file):
            try:
                log_entry = split_log_line(headers,line, i)
                log_data.append(log_entry)

            except Exception as e:
                print(f"Lỗi xử lý dòng {i+1}: {e}")

    with open(output_json, "w", encoding="utf-8") as json_out:
        json.dump(log_data, json_out, indent=4, ensure_ascii=False)

    print(f"Chuyển đổi {log_file} thành {output_json} thành công!")

In [27]:
print(get_headers_from_csv("data/BGL/BGL_2k.log_structured.csv"))

['LineId', 'Label', 'Timestamp', 'Date', 'Node', 'Time', 'NodeRepeat', 'Type', 'Component', 'Level', 'Content', 'EventId', 'EventTemplate']


In [90]:
print(split_log_line(
    get_headers_from_csv("data/Thunderbird/Thunderbird_2k.log_structured.csv"),
    "- 1131566461 2005.11.09 dn228 Nov 9 12:01:01 dn228/dn228 crond(pam_unix)[2915]: session closed for user root",
    0
))

{'LineId': 0, 'Label': '-', 'Timestamp': '1131566461', 'Date': '2005.11.09', 'User': 'dn228', 'Month': 'Nov', 'Day': '9', 'Time': '12:01:01', 'Location': 'dn228/dn228', 'Component': 'crond(pam_unix)[2915]:', 'PID': 'session', 'Content': 'for user root'}


In [88]:
convert_log_to_json("data/BGL/BGL.log", "data/BGL/BGL_2k.log_structured.csv", "data/BGL/BGL.json")


Chuyển đổi data/BGL/BGL.log thành data/BGL/BGL.json thành công!


In [89]:
convert_log_to_json("data/Thunderbird/Thunderbird_2k.log", "data/Thunderbird/Thunderbird_2k.log_structured.csv", "data/Thunderbird/Thunderbird_2k.json")


Chuyển đổi data/Thunderbird/Thunderbird_2k.log thành data/Thunderbird/Thunderbird_2k.json thành công!


In [17]:
from elasticsearch import Elasticsearch

# Kết nối đến Elasticsearch
es = Elasticsearch(["http://localhost:9200"])  # Đảm bảo Elasticsearch đang chạy

# Kiểm tra kết nối
if es.ping():
    print("Kết nối thành công đến Elasticsearch!")
else:
    print("Không thể kết nối đến Elasticsearch.")


Kết nối thành công đến Elasticsearch!


  if es.ping():


In [7]:
# Tên index
index_name = "bgl"

config = {
    "template" : {
        "settings" : {
          "index" : {
            "number_of_shards" : "3",
            "number_of_replicas" : "0",
            "routing" : {
              "allocation" : {
                "include" : {
                  "_tier_preference" : "data_content"
                }
              }
            }
          }
        },
        "mappings": {
            "dynamic": False,
            "properties": {
                "LineId": { "type": "integer" },
                "Label": { "type": "keyword" },
                "Timestamp": { "type": "long" },
                "Date": { "type": "date", "format": "yyyy.MM.dd" },
                "Node": { "type": "keyword" },
                "Time": { "type": "date", "format": "yyyy-MM-dd-HH.mm.ss.SSSSSS" },
                "NodeRepeat": { "type": "keyword" },
                "Type": { "type": "keyword" },
                "Component": { "type": "keyword" },
                "Level": { "type": "keyword" },
                "Content": { "type": "text" }
            }
        },
        "aliases" : { }
  }
}


In [8]:
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Deleted existing index: {index_name}")
# Tạo index mới với mapping
es.indices.create(
    index=index_name,
)
print(f"Index '{index_name}' created successfully!")



  if es.indices.exists(index=index_name):
  es.indices.delete(index=index_name)


Deleted existing index: bgl
Index 'bgl' created successfully!


  es.indices.create(


In [11]:
template_name = "template-config"
if es.cluster.exists_component_template(name=template_name):
    es.cluster.delete_component_template(name=template_name)
    print(f"Deleted existing template: {template_name}")
#create a general template
es.cluster.put_component_template(
    name=template_name,
    template={
        "settings" : {
          "index" : {
            "number_of_replicas" : "0",
          }
        },
        "mappings": {
            "dynamic": False,
            "properties": {
                "LineId": { "type": "integer" },
                "Label": { "type": "keyword" },
                "Timestamp": { "type": "long" },
                "Date": { "type": "date", "format": "yyyy.MM.dd" },
                "Node": { "type": "keyword" },
                "Time": { "type": "date", "format": "yyyy-MM-dd-HH.mm.ss.SSSSSS" },
                "NodeRepeat": { "type": "keyword" },
                "Type": { "type": "keyword" },
                "Component": { "type": "keyword" },
                "Level": { "type": "keyword" },
                "Content": { "type": "text" }
            }
        }
    },
)
print(f"Template '{template_name}' created successfully!")

  if es.cluster.exists_component_template(name=template_name):


BadRequestError: BadRequestError(400, 'illegal_argument_exception', 'component templates [template-config] cannot be removed as they are still in use by index templates [bgl-template]')

In [12]:
es.cluster.get_component_template(name="template-config")

  es.cluster.get_component_template(name="template-config")


ObjectApiResponse({'component_templates': [{'name': 'template-config', 'component_template': {'template': {'settings': {'index': {'number_of_replicas': '0'}}, 'mappings': {'dynamic': False, 'properties': {'Type': {'type': 'keyword'}, 'LineId': {'type': 'integer'}, 'NodeRepeat': {'type': 'keyword'}, 'Content': {'type': 'text'}, 'Node': {'type': 'keyword'}, 'Label': {'type': 'keyword'}, 'Time': {'format': 'yyyy-MM-dd-HH.mm.ss.SSSSSS', 'type': 'date'}, 'Level': {'type': 'keyword'}, 'Component': {'type': 'keyword'}, 'Timestamp': {'type': 'long'}, 'Date': {'format': 'yyyy.MM.dd', 'type': 'date'}}}}}}]})

In [15]:
req= es.indices.simulate_template(
    index_patterns=["bgl"],
    composed_of=["template-config"],
    template={
        "settings": {
            "index.number_of_shards": 3
        }
    },
)


  req= es.indices.simulate_template(


In [16]:
print(es.indices.get_index_template())

{'index_templates': [{'name': '.ml-state', 'index_template': {'index_patterns': ['.ml-state*'], 'template': {'settings': {'index': {'hidden': 'true', 'lifecycle': {'name': 'ml-size-based-ilm-policy', 'rollover_alias': '.ml-state-write'}, 'auto_expand_replicas': '0-1'}}, 'mappings': {'_meta': {'version': '7172899'}, 'enabled': False}, 'aliases': {}}, 'composed_of': [], 'priority': 2147483647, 'version': 7172899, '_meta': {'managed': True, 'description': 'index template for ML state indices'}}}, {'name': 'ilm-history', 'index_template': {'index_patterns': ['ilm-history-5*'], 'template': {'settings': {'index': {'lifecycle': {'name': 'ilm-history-ilm-policy'}, 'number_of_shards': '1', 'auto_expand_replicas': '0-1', 'number_of_replicas': '0'}}, 'mappings': {'dynamic': False, 'properties': {'index_age': {'type': 'long'}, '@timestamp': {'format': 'epoch_millis', 'type': 'date'}, 'error_details': {'type': 'text'}, 'success': {'type': 'boolean'}, 'index': {'type': 'keyword'}, 'state': {'dynamic

  print(es.indices.get_index_template())


In [13]:
index_template_name = "bgl-template"
es.indices.put_index_template(
    name=index_template_name,
    index_patterns=["bgl*"],  # Matches index names like "logs-2024", "logs-xyz", etc.
    composed_of=[template_name],  # Links to the component template
    priority=100,
)

  es.indices.put_index_template(


ObjectApiResponse({'acknowledged': True})

In [127]:
# Convert the response to a dictionary
req_dict = req.body
# Export the configuration to a file
with open("data/BGL/bgl-template.json", "w") as file:
    json.dump(req_dict, file, indent=4)

In [14]:
# Get the settings of the index
settings = es.indices.get_settings(index=index_name)
print(settings)

# Get the mappings of the index
mappings = es.indices.get_mapping(index=index_name)
print(mappings)

# Get the applied template (Elasticsearch 7+)
# template_response = es.indices.get_index_template(name=index_name)
# print(template_response)

  settings = es.indices.get_settings(index=index_name)
  mappings = es.indices.get_mapping(index=index_name)


{'bgl': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'bgl', 'creation_date': '1740922234178', 'number_of_replicas': '0', 'uuid': 'CZ4eyot2RsaQOlgUnHTSTg', 'version': {'created': '7172899'}}}}}
{'bgl': {'mappings': {'dynamic': 'false', 'properties': {'Component': {'type': 'keyword'}, 'Content': {'type': 'text'}, 'Date': {'type': 'date', 'format': 'yyyy.MM.dd'}, 'Label': {'type': 'keyword'}, 'Level': {'type': 'keyword'}, 'LineId': {'type': 'integer'}, 'Node': {'type': 'keyword'}, 'NodeRepeat': {'type': 'keyword'}, 'Time': {'type': 'date', 'format': 'yyyy-MM-dd-HH.mm.ss.SSSSSS'}, 'Timestamp': {'type': 'long'}, 'Type': {'type': 'keyword'}}}}}


NotFoundError: NotFoundError(404, 'resource_not_found_exception', 'index template matching [bgl] not found')

In [18]:
# Load data from JSON file
with open("data/BGL/BGL_2k.json", "r") as file:
    data = json.load(file)  # Assumes the JSON file contains a list of objects

# Insert data into Elasticsearch
for i, doc in enumerate(data):
    es.index(index=index_name, id=i+1, body=doc)

print("Data inserted successfully!")

  es.index(index=index_name, id=i+1, body=doc)


Data inserted successfully!


In [19]:
# Fetch all documents
response = es.search(index=index_name, body={"query": {"match_all": {}}}, size=10)  # Fetch first 10 docs

# Print results
for hit in response['hits']['hits']:
    print(hit["_source"])  # Print document content

  response = es.search(index=index_name, body={"query": {"match_all": {}}}, size=10)  # Fetch first 10 docs


{'LineId': 0, 'Label': '-', 'Timestamp': '1117838570', 'Date': '2005.06.03', 'Node': 'R02-M1-N0-C:J12-U11', 'Time': '2005-06-03-15.42.50.675872', 'NodeRepeat': 'R02-M1-N0-C:J12-U11', 'Type': 'RAS', 'Component': 'KERNEL', 'Level': 'INFO', 'Content': 'cache parity error corrected'}
{'LineId': 1, 'Label': '-', 'Timestamp': '1117838573', 'Date': '2005.06.03', 'Node': 'R02-M1-N0-C:J12-U11', 'Time': '2005-06-03-15.42.53.276129', 'NodeRepeat': 'R02-M1-N0-C:J12-U11', 'Type': 'RAS', 'Component': 'KERNEL', 'Level': 'INFO', 'Content': 'cache parity error corrected'}
{'LineId': 2, 'Label': '-', 'Timestamp': '1117838976', 'Date': '2005.06.03', 'Node': 'R02-M1-N0-C:J12-U11', 'Time': '2005-06-03-15.49.36.156884', 'NodeRepeat': 'R02-M1-N0-C:J12-U11', 'Type': 'RAS', 'Component': 'KERNEL', 'Level': 'INFO', 'Content': 'cache parity error corrected'}
{'LineId': 3, 'Label': '-', 'Timestamp': '1117838978', 'Date': '2005.06.03', 'Node': 'R02-M1-N0-C:J12-U11', 'Time': '2005-06-03-15.49.38.026704', 'NodeRepeat

  response = es.search(index=index_name, body={"query": {"match_all": {}}}, size=10)  # Fetch first 10 docs
