# Log Processing
Transformation of logs generated by `log_parser.py` to useful csv's

Make sure to:
- Run `make pull_data` in the vm directory OR save the log files and update `data_dir`
- Launch jupyter notebook in data_processing (relative paths will break if not)

## Dependencies

In [1]:
import csv
import os

import pandas as pd
import numpy as np
import seaborn as sns

data_dir = '../data/'

## Saving to DataFrame
### ESB & Host

In [2]:
df_esb = pd.DataFrame()
df_host = pd.DataFrame()
df_trance = pd.DataFrame()

for x in [str(x) for x in os.listdir(data_dir) if 'csv' in str(x)]:
    print('Reading from file: ' + str(x))
    if 'host' in x:
        df = pd.read_csv(data_dir + str(x), header=None)
        df_host = df_host.append(df, ignore_index=True)
    elif 'esb' in x:
        df = pd.read_csv(data_dir + str(x), header=None)
        df_esb = df_esb.append(df, ignore_index=True)

Reading from file: 12_esb.csv
Reading from file: 16_host.csv
Reading from file: 17_host.csv
Reading from file: 07_trace.csv
Reading from file: 13_trace.csv
Reading from file: 15_trace.csv
Reading from file: 13_esb.csv
Reading from file: 08_esb.csv
Reading from file: 06_host.csv
Reading from file: 07_host.csv
Reading from file: 12_trace.csv
Reading from file: 11_esb.csv
Reading from file: 06_trace.csv
Reading from file: 10_esb.csv
Reading from file: 11_host.csv
Reading from file: 10_host.csv
Reading from file: 14_trace.csv
Reading from file: 09_esb.csv
Reading from file: 08_trace.csv
Reading from file: 05_trace.csv
Reading from file: 08_host.csv
Reading from file: 09_host.csv
Reading from file: 11_trace.csv
Reading from file: 05_esb.csv
Reading from file: 15_esb.csv
Reading from file: 15_host.csv
Reading from file: 14_host.csv
Reading from file: 14_esb.csv
Reading from file: 17_trace.csv
Reading from file: 12_host.csv
Reading from file: 06_esb.csv
Reading from file: 16_esb.csv
Reading f

In [3]:
df_esb.columns = ['service_name', 'start_time', 'avg_time', 'num', 'succee_num', 'succee_rate']

df_host.columns = ['item_id', 'name', 'bomc_id', 'timestamp', 'value', 'cmdb_id']
host_types = ['os', 'db', 'container', 'docker', 'redis']
host_dfs = {}

for host_type in host_types:
    print('Saving ' + host_type + ' to df')
    host_dfs[host_type] = df_host[[(host_type in str(x)) for x in df_host['cmdb_id']]].sort_values(by='timestamp')

Saving os to df
Saving db to df
Saving container to df
Saving docker to df
Saving redis to df


### Trace 

In [4]:
calltypes = ['JDBC', 'LOCAL', 'CSF', 'OSB', 'RemoteProcess', 'FlyRemote']
lists = {}
for calltype in calltypes:
    lists[calltype] = []
    
for x in [str(x) for x in os.listdir(data_dir) if 'trace.csv' in str(x)]:
    print('Reading from file: ' + str(x))
    with open(data_dir+str(x), newline='') as f:
        reader = csv.reader(f)
        i = 0
        for row in reader:
            i += 1
            lists[row[-1]].append(row)

Reading from file: 07_trace.csv
Reading from file: 13_trace.csv
Reading from file: 15_trace.csv
Reading from file: 12_trace.csv
Reading from file: 06_trace.csv
Reading from file: 14_trace.csv
Reading from file: 08_trace.csv
Reading from file: 05_trace.csv
Reading from file: 11_trace.csv
Reading from file: 17_trace.csv
Reading from file: 09_trace.csv
Reading from file: 10_trace.csv
Reading from file: 16_trace.csv


In [5]:
trace_dfs = {}
for calltype in calltypes:
    print('Saving ' + calltype + ' data to df')
    columns=['startTime','elapsedTime','success','traceId','id','pid','cmdb_id','serviceName', 'callType']
    if calltype == 'LOCAL':
        trace_dfs[calltype] = pd.DataFrame(lists[calltype],columns=(columns[:-1]+['dsName']+[columns[-1]]))
    elif calltype == 'JDBC':
        trace_dfs[calltype] = pd.DataFrame(lists[calltype],columns=(columns[:-2] + ['dsName'] + [columns[-1]]))
    else:
        trace_dfs[calltype] = pd.DataFrame(lists[calltype], columns=columns)
    

Saving JDBC data to df
Saving LOCAL data to df
Saving CSF data to df
Saving OSB data to df
Saving RemoteProcess data to df
Saving FlyRemote data to df


## Review

In [6]:
df_esb.head()

Unnamed: 0,service_name,start_time,avg_time,num,succee_num,succee_rate
0,osb_001,1606881600000,0.6214,361,361,1.0
1,osb_001,1606881660000,0.5861,343,343,1.0
2,osb_001,1606881720000,5.0125,126,122,0.9683
3,osb_001,1606881780000,9.3487,70,70,1.0
4,osb_001,1606881840000,1.4756,375,375,1.0


In [7]:
for name, df in host_dfs.items():
    print('-'*80)
    print('-'*80)
    print('Host Type: ' + name.upper())
    print(df.head())

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Host Type: OS
                 item_id                      name     bomc_id      timestamp  \
1130105  999999996430952      Processor_load_5_min  ZJ-001-005  1606859432000   
1130109  999999996437612          Memory_available  ZJ-001-025  1606859432000   
1130108  999999996430592          Memory_available  ZJ-001-025  1606859432000   
1130107         63309572  Outgoing_network_traffic  ZJ-001-072  1606859432000   
1130106  999999996483992                Send_total  ZJ-001-074  1606859432000   

             value cmdb_id  
1130105   0.018333  os_015  
1130109   8.874672  os_002  
1130108  44.089100  os_010  
1130107   0.000585  os_008  
1130106   0.313710  os_018  
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
H

In [8]:
for name, df in trace_dfs.items():
    print('-'*80)
    print('-'*80)
    print('Calltype: ' + name)
    print(df.head())

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Calltype: JDBC
       startTime elapsedTime success               traceId  \
0  1606863598904        89.0    True  5c3fb17262a9db023878   
1  1606863598995         0.0    True  5c3fb17262a9db023878   
2  1606863598428       618.0    True  5250417262a9d9f52832   
3  1606863599048         2.0    True  5250417262a9d9f52832   
4  1606863599058         1.0    True  5250417262a9d9f52832   

                     id                   pid     cmdb_id  dsName callType  
0  07f9b17262a9dd385687  5c72c17262a9dd375684  docker_003  db_009     JDBC  
1  8807617262a9dd935694  8e3cc17262a9dd915691  docker_003  db_009     JDBC  
2  342ee17262a9db5c4402  c49e717262a9db5c4399  docker_006  db_003     JDBC  
3  8a1e717262a9ddc84493  c49e717262a9db5c4399  docker_006  db_003     JDBC  
4  8aa5417262a9ddd29572  2219e17262a9ddd29569  docker_002  db_009

# Saving

In [14]:
!mkdir "../data/test_data"
print('Saving ESB DataFrame to CSV')
df_esb.to_csv('../data/test_data/esb.csv', index=False)

!mkdir "../data/test_data/host"
for name, df in host_dfs.items():
    print('Saving host_' + name + ' DataFrame to CSV')
    df.to_csv('../data/test_data/host/'+name+'.csv', index=False)

!mkdir "../data/test_data/trace"
for name, df in trace_dfs.items():
    print('Saving trace_' + name.lower() + ' DataFrame to CSV')
    df.to_csv('../data/test_data/trace/trace_'+name.lower()+'.csv', index=False)

Saving ESB DataFrame to CSV
Saving host_os DataFrame to CSV
Saving host_db DataFrame to CSV
Saving host_container DataFrame to CSV
Saving host_docker DataFrame to CSV
Saving host_redis DataFrame to CSV
Saving trace_jdbc DataFrame to CSV
Saving trace_local DataFrame to CSV
Saving trace_csf DataFrame to CSV
Saving trace_osb DataFrame to CSV
Saving trace_remoteprocess DataFrame to CSV
Saving trace_flyremote DataFrame to CSV
