In [1]:
import pandas as pd
import os
import re
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
# Data extraction pipeline

# Three-letter English month abbreviations -> zero-padded month number.
_MONTH_NUMBERS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

# Leading part of a bracketed timestamp token: "[DD/Mon/YYYY:".
_BRACKETED_DATE = re.compile(r'\[([0-9]{2})/([A-Za-z]{3})/([0-9]{4}):')


def format_datetime(datetime):
    """Rewrite a bracketed log timestamp token as ``YYYY-MM-DD HH:MM:SS``.

    Parameters
    ----------
    datetime : str
        Timestamp token with its opening bracket still attached; expected to
        look like ``"[05/Sep/2019:12:34:56"``.

    Returns
    -------
    str
        ``year-month-day`` followed by a space and everything that came after
        the first ``:`` of the token, e.g. ``"2019-09-05 12:34:56"``.

    Raises
    ------
    IndexError
        If the token contains no ``:``.

    Notes
    -----
    Month and year are read from the token whenever it has the
    ``[DD/Mon/YYYY:`` shape. A token in any other shape is handled the way
    this function always handled it: characters 1-2 are taken as the day and
    the date is assumed to be in September 2019.
    """
    match = _BRACKETED_DATE.match(datetime)

    # Everything after the first colon is the time of day.
    time = datetime.split(':', 1)[1]

    month = _MONTH_NUMBERS.get(match.group(2).title()) if match else None
    if month is not None:
        day = match.group(1)
        year = match.group(3)
    else:
        # Unrecognised layout (or month name): keep the legacy assumption.
        day = datetime[1:3]
        month = "09"
        year = "2019"

    return year + "-" + month + "-" + day + " " + time

def clean_log(log, con_dict):
    """Build one event record from a whitespace-separated log line.

    Fields are picked by their position counted from the end of the line:
    ``-1`` response time, ``-2`` bytes sent, ``-3`` response code, ``-5`` URL
    and ``-8`` the timestamp token (passed through ``format_datetime``).

    Parameters
    ----------
    log : str
        A single log line; it must split into at least eight tokens.
    con_dict : dict
        Extra key/value pairs merged into the record. On a key clash the
        value from ``con_dict`` wins.

    Returns
    -------
    dict
        A new dict with keys ``resp_time``, ``bytes_sent``, ``resp_code``,
        ``url``, ``datetime`` followed by the entries of ``con_dict``.
        All extracted values are strings.
    """
    tokens = log.split()

    def unquote(token):
        # NOTE: '["'']' is two adjacent literals ('["' and ']') which Python
        # concatenates into the pattern ["] -- only double quotes are removed,
        # single quotes are left in place.
        return re.sub('["'']', '', token)

    # Timestamp first, so a too-short line fails on the same token as before.
    timestamp = format_datetime(unquote(tokens[-8]))

    positions = (
        ("resp_time", -1),
        ("bytes_sent", -2),
        ("resp_code", -3),
        ("url", -5),
    )
    record = {key: unquote(tokens[pos]) for key, pos in positions}
    record["datetime"] = timestamp

    return {**record, **con_dict}

def clean_error(error, con_dict):
    """Build one event record from a whitespace-separated error line.

    Parameters
    ----------
    error : str
        A single error line. Tokens 2, 3 and 10 (from the start) and token 6
        from the end are read, so shorter lines raise ``IndexError``.
    con_dict : dict
        Extra key/value pairs merged into the record. On a key clash the
        value from ``con_dict`` wins.

    Returns
    -------
    dict
        A new dict with keys ``resp_time`` and ``bytes_sent`` (always NaN),
        ``resp_code`` (token 10 with ``(`` and ``:`` stripped), ``url``
        (token -6 with double quotes stripped), ``datetime`` (token 2 with
        ``/`` replaced by ``-``, a space, then token 3), followed by the
        entries of ``con_dict``.
    """
    words = error.split()

    # Read the tokens in the same order the record lists them.
    code = re.sub('[(:]', '', words[10])
    # '["'']' is the adjacent literals '["' + ']' == ["] : double quotes only.
    target = re.sub('["'']', '', words[-6])
    stamp = re.sub('[/]', '-', words[2]) + " " + words[3]

    record = {
        "resp_time": np.nan,
        "bytes_sent": np.nan,
        "resp_code": code,
        "url": target,
        "datetime": stamp,
    }

    return {**record, **con_dict}

def extract_con_info(filename):
    """Read the per-app container counts encoded in a log file name.

    The part of the name between the first ``4C`` and the next ``.`` is split
    on ``X`` into groups such as ``1-3``; the integer after the ``-`` of each
    of the first four groups is the count for that app. For example
    ``"A-logs-5-09-19-3bias4C1-3X2-3X3-3X4-3.txt"`` gives 3 for every app.

    Parameters
    ----------
    filename : str
        Name of a log file following the pattern above.

    Returns
    -------
    dict
        ``app_1_containers`` ... ``app_4_containers`` mapped to ints.

    Raises
    ------
    IndexError
        If ``4C`` is missing, fewer than four groups are present, or a group
        has no ``-``.
    ValueError
        If the text after a group's ``-`` is not an integer.
    """
    after_marker = filename.split('4C', 1)[1]
    layout = after_marker.split('.', 1)[0]
    groups = layout.split('X')

    keys = (
        "app_1_containers",
        "app_2_containers",
        "app_3_containers",
        "app_4_containers",
    )
    # Groups are consumed in order, one per key.
    return {key: int(groups[pos].split('-')[1]) for pos, key in enumerate(keys)}

def generate_log_df(log_files_path):
    """Parse every log file in a directory into a single DataFrame.

    Regular files in ``log_files_path`` whose name contains ``"logs"`` are
    processed in sorted name order. Each line is classified by its number of
    whitespace-separated tokens: 13 tokens go through ``clean_log``, 32 tokens
    go through ``clean_error``, and anything else is ignored. The container
    counts decoded from the file name by ``extract_con_info`` are attached to
    every record of that file.

    Parameters
    ----------
    log_files_path : str
        Directory that holds the log files. A trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line; empty if nothing matched.

    Notes
    -----
    Prints ``"<file name> done!"`` after each file as a progress indicator.
    """
    access_token_count = 13  # lines of this width are handled by clean_log
    error_token_count = 32   # lines of this width are handled by clean_error

    # Use the directory that was passed in, not a notebook-level global.
    log_files = sorted(
        name
        for name in listdir(log_files_path)
        if isfile(join(log_files_path, name)) and "logs" in name
    )

    events = []
    for name in log_files:
        # Extract container information from filename
        container_dict = extract_con_info(name)

        with open(join(log_files_path, name), "r") as handle:
            # Iterate the file directly so that every line, including the
            # first one, is examined.
            for line in handle:
                token_count = len(line.split())
                if token_count == access_token_count:
                    events.append(clean_log(line, container_dict))
                elif token_count == error_token_count:
                    events.append(clean_error(line, container_dict))
        print(name + " done!")

    return pd.DataFrame(events)
    

In [3]:
# Directory (relative to the notebook) that holds the raw log files; passed to
# generate_log_df below. Keep the trailing slash so the path can be prefixed
# directly to a file name.
mypath = "../Data/"

In [4]:
# Parse all log files found under mypath into one DataFrame.
# generate_log_df prints "<file name> done!" after each file it finishes.
log_test_df = generate_log_df(mypath)

A-logs-5-09-19-3bias4C1-3X2-3X3-3X4-3.txt done!
B-logs-6-09-19-2bias4C1-3X2-3X3-3X4-3.txt done!
C-logs-6-09-19-4bias4C1-3X2-3X3-3X4-3.txt done!
D-logs-6-09-19-2_3bias4C1-3X2-3X3-3X4-3.txt done!
E-logs-6-09-19-2_4bias4C1-3X2-3X3-3X4-3.txt done!
F-logs-6-09-19-3_4bias4C1-3X2-3X3-3X4-3.txt done!
G-logs-6-09-19-2_3_4bias4C1-3X2-3X3-3X4-3.txt done!
H-logs-6-09-19-2bias4C1-3X2-1X3-1X4-1.txt done!
I-logs-7-09-19-3bias4C1-3X2-1X3-1X4-1.txt done!
J-logs-7-09-19-4bias4C1-3X2-1X3-1X4-1.txt done!
K-logs-7-09-19-2_3bias4C1-3X2-1X3-1X4-1.txt done!
L-logs-7-09-19-2_4bias4C1-3X2-1X3-1X4-1.txt done!
M-logs-7-09-19-3_4bias4C1-3X2-1X3-1X4-1.txt done!
N-logs-7-09-19-2_3_4bias4C1-3X2-1X3-1X4-1.txt done!
O-logs-7-09-19-2bias4C1-1X2-3X3-1X4-1.txt done!
P-logs-7-09-19-3bias4C1-1X2-3X3-1X4-1.txt done!
Q-logs-8-09-19-4bias4C1-1X2-3X3-1X4-1.txt done!
R-logs-8-09-19-2_3bias4C1-1X2-3X3-1X4-1.txt done!
S-logs-8-09-19-2_4bias4C1-1X2-3X3-1X4-1.txt done!
T-logs-8-09-19-3_4bias4C1-1X2-3X3-1X4-1.txt done!
U-logs-8-09-19

In [5]:
# Persist the parsed events as logs.parquet; the path is relative to the
# current working directory.
log_test_df.to_parquet('logs.parquet')