In [1]:
import pandas as pd
import os
import re
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
# Data extraction pipeline

def format_datetime(datetime):
    # Bad hardcoded solution! Generalise later if needed
    
    day = datetime[1:3]
    month = "09"
    year = "2019"
    time = datetime.split(':', 1)[1]
    
    formatted = year + "-" + month + "-" + day + " " + time
    
    return formatted

def clean_log(log, con_dict):
    log_dict = dict()
    bits = log.split()
    
    formatted = format_datetime(re.sub('["'']', '',bits[-8]))

    log_dict = {
        "resp_time": re.sub('["'']', '',bits[-1]),
        "bytes_sent": re.sub('["'']', '',bits[-2]),
        "resp_code": re.sub('["'']', '',bits[-3]),
        "url": re.sub('["'']', '',bits[-5]),
        "datetime": formatted
    }
    
    event_dict = {**log_dict, **con_dict}
    
    return event_dict

def clean_error(error, con_dict):
    error_dict = dict()
    bits = error.split()
    
    error_dict = {
        "resp_time": np.nan,
        "bytes_sent": np.nan,
        "resp_code": re.sub('[(:]', '',bits[10]),
        "url": re.sub('["'']', '',bits[-6]),
        "datetime": re.sub('[/]', '-',bits[2]) + " " + bits[3]
    }
    
    event_dict = {**error_dict, **con_dict}
    
    return event_dict

def extract_con_info(filename):
    con_dict = dict()
    con_string = filename.split('4C', 1)[1].split('.', 1)[0]
    container_list = con_string.split('X')
    
    con_dict = {
        "app_1_containers": int(container_list[0].split('-')[1]),
        "app_2_containers": int(container_list[1].split('-')[1]),
        "app_3_containers": int(container_list[2].split('-')[1]),
        "app_4_containers": int(container_list[3].split('-')[1])
    }
    
    return con_dict

def generate_log_df(log_files_path):
    all_logs = list()

    files = [f for f in listdir(mypath) if isfile(join(mypath, f)) and "logs" in f]

    files.sort()

    for file in files:
        # Extract container information from filename
        container_dict = extract_con_info(file)

        with open((mypath+"{}").format(file), "r") as f:
            line = f.readline()
            while line:
                line = f.readline()
                if len(line.split()) == 13:
                    all_logs.append(clean_log(line, container_dict))
                elif len(line.split()) == 32:
                    all_logs.append(clean_error(line, container_dict))
            print(file + " done!")   

    logs_df = pd.DataFrame(all_logs)
    
    return logs_df
    

In [3]:
mypath = "../Data/"

In [None]:
log_test_df = generate_log_df(mypath)

In [9]:
log_test_df.head(10)

Unnamed: 0,resp_time,bytes_sent,resp_code,url,datetime,app_1_containers,app_2_containers,app_3_containers,app_4_containers
0,0.031,197,200,/2/4-3/65,2019-09-05 22:37:10,3,3,3,3
1,0.38,198,200,/3/3-4/37,2019-09-05 22:37:10,3,3,3,3
2,0.298,188,200,/4/3/188,2019-09-05 22:37:11,3,3,3,3
3,0.009,188,200,/3/3/180,2019-09-05 22:37:11,3,3,3,3
4,0.331,199,200,/3/3-4/115,2019-09-05 22:37:11,3,3,3,3
5,0.148,229,200,/3/3-3-3-3-3/101,2019-09-05 22:37:11,3,3,3,3
6,0.023,188,200,/3/3/180,2019-09-05 22:37:11,3,3,3,3
7,0.298,189,200,/4/3/188,2019-09-05 22:37:11,3,3,3,3
8,0.027,207,200,/3/4-2-2/38,2019-09-05 22:37:11,3,3,3,3
9,0.352,210,200,/3/4-4-3/60,2019-09-05 22:37:11,3,3,3,3


In [8]:
log_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650450 entries, 0 to 3650449
Data columns (total 9 columns):
resp_time           float64
bytes_sent          object
resp_code           object
url                 object
datetime            object
app_1_containers    int64
app_2_containers    int64
app_3_containers    int64
app_4_containers    int64
dtypes: float64(1), int64(4), object(4)
memory usage: 250.7+ MB


In [7]:
log_test_df.resp_time = log_test_df.resp_time.astype(float)

In [10]:
log_test_df.resp_time.describe()

count    3.427581e+06
mean     1.329759e+01
std      1.689617e+01
min      0.000000e+00
25%      1.045000e+00
50%      7.608000e+00
75%      1.676200e+01
max      9.247300e+01
Name: resp_time, dtype: float64

In [11]:
log_test_df.to_parquet('logs.parquet.gzip', compression='gzip')