In [1]:
import findspark
findspark.init() # find spark 

import datetime as dt
from pyspark.sql import SparkSession
import re
import findspark
import pandas as pd 
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [None]:
# ES 적재 함수 

def bulk_insert(host, port, df, index):
    es = Elasticsearch(host = host, port = port)

    data = [
      {
        "_index": index,
        "_source": {
            "datetime": x[0],
            "log-level": x[1],
            "message":x[2]}
      }
        for x in zip(df['Datetime'],df['Status'],df['Message'])
    ]

    helpers.bulk(es, data)

In [81]:
# log 파싱 함수 

def parsing(logs):
    import re

    # 정규표현식 
    regex = r"\[(\d+-\d+-\d+\s\d+:\d+:\d+,\d+)\] \{\S+\} ([N|n]ote|NOTE|[A|a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)([^][]*)|(.*)“# ([^][]*)|([^][]*)([^][]*)(\’\S+)"

    # match 데이터 찾기 
    matches = re.finditer(regex, logs, re.MULTILINE)

    dict_list = []

    for matchNum, match in enumerate(matches):

        # Timestamp or Status or Message에 하나라도 값이 없으면 제거 
        if match.group(1) and match.group(3) and  match.group(4):
            row_ = dict()
            row_["Datetime"]= match.group(1)
            row_["Status"] = match.group(3)
            row_["Message"] = match.group(4)

            dict_list.append(row_)
    
    return dict_list

In [90]:
# Main 함수

def main():
    
    # Spark Session 생성 
    spark = SparkSession.builder.master('local[2]').appName('airflow log test').getOrCreate()    
    
    # Data 읽기 
    data = spark.read.text("error_log_ex.txt") # change path 
    
    # value 열에서 데이터 추출 후 string으로 형변환 
    logs = str([item['value'] for item in data.collect()])

    # 로그 파싱 
    parseData = parsing(logs)

    # pandas DataFrame 변경 
    df = pd.DataFrame(parseData)   
    
    # datetime 형식 지정 
    df["Datetime"] = pd.to_datetime(df["Datetime"],format="%Y-%m-%d %H:%M:%S", errors = 'coerce')  
    print(df.head()) # 데이터 확인 
    
    # ES 적재 
    # bulk_insert("localhost", "9200", df, "airflow_log_2022-02-06") # host, port, data, index


In [91]:
if __name__ == "__main__":
    main()

    Datetime Status                                            Message
0 2022-01-29   INFO  Dependencies all met for <TaskInstance: MySQLT...
1 2022-01-29   INFO  Dependencies all met for <TaskInstance: MySQLT...
2 2022-01-29   INFO  ', '------------------------------------------...
3 2022-01-29   INFO                        Starting attempt 1 of 1', '
4 2022-01-29   INFO  ', '------------------------------------------...
