In [1]:
import pandas as pd
import os, sys

# Paths are anchored at the kernel's working directory (a `__file__`-based
# variant was left disabled in this notebook).
current_dir = os.getcwd()

# Parent of the working directory; it is put first on sys.path so modules
# living one level up can be imported from this notebook.
root_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.insert(0, root_dir)

def get_file_by_suffix(suffix_name, dir_name=None, inclusive_keyword=None, exclusive_keyword=None):
    """Recursively collect files whose name ends with ``.<suffix_name>``.

    The suffix is matched as given and in its lower-, title- and upper-case
    forms (e.g. ``txt.gz``, ``Txt.Gz``, ``TXT.GZ``).

    Args:
        suffix_name: File suffix without the leading dot; non-string values
            are converted with ``str()``.
        dir_name: Directory to walk, resolved relative to ``current_dir``
            (an absolute path is used as-is). Falls back to ``current_dir``
            when empty or ``None``.
        inclusive_keyword: If given, keep only file names containing it.
        exclusive_keyword: If given, drop file names containing it. Both
            keywords may be combined; each filter is applied when supplied.

    Returns:
        dict mapping file name -> full path. Because the key is the bare file
        name, files with the same name in different sub-directories collapse
        to a single entry (the last one visited wins).
    """
    if dir_name:
        target_dir = os.path.abspath(os.path.join(current_dir, dir_name))
    else:
        target_dir = current_dir

    if not isinstance(suffix_name, str):
        suffix_name = str(suffix_name)

    # str.endswith accepts a tuple, so all case variants are tested in one call.
    extensions = tuple(
        '.' + variant
        for variant in (suffix_name, suffix_name.lower(), suffix_name.title(), suffix_name.upper())
    )

    paths_hash = dict()
    for path, _sub_dirs, files in os.walk(target_dir):
        for file in files:
            if not file.endswith(extensions):
                continue
            if inclusive_keyword and inclusive_keyword not in file:
                continue
            if exclusive_keyword and exclusive_keyword in file:
                continue
            paths_hash[file] = os.path.join(path, file)

    return paths_hash

# Column-name constants shared by the extraction and analysis cells.
log_type_col = "log_type"

cpu = "cpu"
date = "date"
timestamp = "timestamp"
appid = "appid"
ctxid = "ctxid"
level = "level"
payload = "payload"
payload_ext = "payload_ext"

# Column order for txt.gz logs and for txt.zst logs (the latter carry an
# extra `cpu` column after the timestamp).
cols = [date, timestamp, appid, ctxid, level, payload, payload_ext]
zst_cols = [date, timestamp, cpu, appid, ctxid, level, payload, payload_ext]

# Positional index -> column name, used with DataFrame.rename on frames read
# with header=None. Only the first len-1 positions are mapped, so
# `payload_ext` is never applied as a column name.
# NOTE(review): confirm that leaving out the last entry is intended.
rename_cols = {position: name for position, name in enumerate(cols[:-1])}
rename_zst_cols = {position: name for position, name in enumerate(zst_cols[:-1])}


def extract_zst_files(dir_name, log_type):
    """Convert zstd-compressed fixed-width logs (``*.txt.zst``) to CSV.

    Every ``txt.zst`` file found under ``dir_name`` whose name contains
    ``log_type`` is parsed with ``pd.read_fwf`` (column widths inferred, no
    header row, no skipped rows), its leading columns are renamed via
    ``rename_zst_cols`` and the result is written to ``dir_name`` as
    ``<original stem>.csv``.

    Args:
        dir_name: Directory that is searched recursively and that receives
            the CSV files.
        log_type: Keyword that must appear in the file name (e.g. "logcat").

    Empty input files are reported and skipped.
    """
    suffix = "txt.zst"
    all_gz_dict = get_file_by_suffix(suffix, dir_name=dir_name, inclusive_keyword=log_type)
    count = len(all_gz_dict)
    print(f"Found {count} txt.zst files.")
    for i, (key, source_path) in enumerate(all_gz_dict.items(), start=1):
        print(f"Extracting {key} with completion {i}/{count}.")
        try:
            log = pd.read_fwf(source_path,
                              compression={'method': 'zstd'},
                              colspecs="infer",
                              header=None,
                              encoding="ISO-8859-1")
        except pd.errors.EmptyDataError:
            print(f"Note: {key} was empty. Skipping.")
            continue
        # Strip the suffix by length rather than str.replace: the finder also
        # matches upper/title-case suffixes, for which a literal replace would
        # leave the name unchanged and write the CSV over the source file name.
        csv_name = key[:-len(suffix)] + "csv"
        file_path = os.path.join(dir_name, csv_name)
        log = log.rename(columns=rename_zst_cols)
        log.to_csv(file_path, encoding="ISO-8859-1", index=None)
    print("Extraction completed.")

def extract_gz_files(dir_name, log_type):
    """Convert gzip-compressed fixed-width logs (``*.txt.gz``) to CSV.

    Every ``txt.gz`` file found under ``dir_name`` whose name contains
    ``log_type`` is parsed with ``pd.read_fwf`` (first three lines skipped,
    column widths inferred, no header row), its leading columns are renamed
    via ``rename_cols`` and the result is written to ``dir_name`` as
    ``<original stem>.csv``.

    Args:
        dir_name: Directory that is searched recursively and that receives
            the CSV files.
        log_type: Keyword that must appear in the file name (e.g. "logcat").

    Empty input files are reported and skipped.
    """
    suffix = "txt.gz"
    all_gz_dict = get_file_by_suffix(suffix, dir_name=dir_name, inclusive_keyword=log_type)
    count = len(all_gz_dict)
    print(f"Found {count} txt.gz files.")
    for i, (key, source_path) in enumerate(all_gz_dict.items(), start=1):
        print(f"Extracting {key} with completion {i}/{count}.")
        try:
            log = pd.read_fwf(source_path,
                              compression="gzip",
                              skiprows=3,
                              colspecs="infer",
                              header=None,
                              encoding="ISO-8859-1")
        except pd.errors.EmptyDataError:
            print(f"Note: {key} was empty. Skipping.")
            continue
        # Strip the suffix by length rather than str.replace: the finder also
        # matches upper/title-case suffixes, for which a literal replace would
        # leave the name unchanged and write the CSV over the source file name.
        csv_name = key[:-len(suffix)] + "csv"
        file_path = os.path.join(dir_name, csv_name)
        log = log.rename(columns=rename_cols)
        log.to_csv(file_path, encoding="ISO-8859-1", index=None)
    print("Extraction completed.")
    
def merge_log_csv_files(dir_name, inclusive_keyword=None):
    """Merge the CSV logs under ``dir_name`` into one timestamp-sorted frame.

    Args:
        dir_name: Directory searched recursively for ``*.csv`` files.
        inclusive_keyword: If given, only file names containing it are merged
            and the output is named ``<inclusive_keyword>.csv``; otherwise
            every CSV is merged and the output is ``all_log.csv``.

    Returns:
        The merged DataFrame sorted by the ``timestamp`` column, or an empty
        DataFrame (and no output file) when no CSV file matches.

    Note:
        The output file is written relative to the working directory. If that
        directory lies inside ``dir_name``, a later call will pick the merged
        file up as one of its inputs.
    """
    all_csv_dict = get_file_by_suffix("csv", dir_name=dir_name, inclusive_keyword=inclusive_keyword)
    count = len(all_csv_dict)
    print(f"Found {count} csv files.")

    # Collect the frames and concatenate once; concatenating inside the loop
    # re-copies the accumulated data on every iteration.
    frames = []
    for i, (key, csv_path) in enumerate(all_csv_dict.items(), start=1):
        print(f"Merging {key} with completion {i}/{count}.")
        frames.append(pd.read_csv(csv_path, encoding="ISO-8859-1", low_memory=False))

    if not frames:
        # Nothing to sort or export; sorting an empty frame by "timestamp"
        # would raise KeyError.
        print("Note: no csv files to merge.")
        return pd.DataFrame()

    all_log = pd.concat(frames).sort_values(by=["timestamp"])
    print("Merging compeleted.")

    if inclusive_keyword is None:
        inclusive_keyword = "all_log"
    output_file_path = inclusive_keyword + ".csv"
    all_log.to_csv(output_file_path, encoding="ISO-8859-1", index=None)
    return all_log

In [2]:
# Log directory to process, resolved under root_dir. Despite its name,
# `file_path` is a directory; it is passed as dir_name to the extract/merge helpers.
file_path = os.path.join(root_dir, "logs/85847897_LSJWM609XPZ833471_1698636600230")
file_path

'/Users/mac/Downloads/logs/85847897_LSJWM609XPZ833471_1698636600230'

In [3]:
# One sample log used to sanity-check the parser. Built from `file_path` so the
# notebook does not depend on an absolute, machine-specific location.
zst_file = os.path.join(file_path, "EV045BN1QUUSUEX3SY6X_20231030_104858_235_37_aplogcat.txt.zst")

In [4]:
# Parse the sample file with the same read_fwf options extract_zst_files uses
# (zstd compression, inferred column widths, no header) and display the result.
pd.read_fwf(zst_file, compression={'method': 'zstd'}, header=None, colspecs="infer", encoding = "ISO-8859-1")

Unnamed: 0,0,1,2,3,4,5,6
0,10-30,10:48:57.440,0,2646,4628,I,AudioStreamer: io_thread_func(L: 545): read 11...
1,10-30,10:48:57.446,4,3274,4453,I,ZebraLog: BMMapviewObserverImpl::OnCheckIngDat...
2,10-30,10:48:57.448,1,3481,4072,I,LocSrv_LocationRtdProvider: rtdLocationCallbac...
3,10-30,10:48:57.448,1,3481,4072,I,LocSrv_LocationRtdProvider: rtdLocationCallbac...
4,10-30,10:48:57.448,1,3481,4072,I,"LocationFuse: status,sc,0,aA,1,Hz,1,hD,1,ON,1,..."
...,...,...,...,...,...,...,...
250652,10-30,10:50:16.479,0,3936,4491,I,vehicle_function: [vehiclefunction.cpp-carServ...
250653,10-30,10:50:16.479,0,3936,4491,I,vehicle_function: [vehiclefunction.cpp-carServ...
250654,10-30,10:50:16.479,0,3936,4491,I,sockpeerchan@commonipc: requestsync with reque...
250655,10-30,10:50:16.479,0,3936,4491,I,logicengine: doasync with action id: 3


In [5]:
# Convert every txt.zst file under file_path whose name contains "logcat" to CSV (written into file_path).
extract_zst_files(file_path, "logcat")

Found 36 txt.zst files.
Extracting EV045BN1QUUSUEX3SY6X_20231030_104858_235_37_aplogcat.txt.zst with completion 1/36.
Extracting EV045GARKGRIM9YP3Q64_20231030_110212_472_46_aplogcat.txt.zst with completion 2/36.
Extracting EV045H70X3M8AOBH7XOF_20231030_110612_537_48_aplogcat.txt.zst with completion 3/36.
Extracting EV045BNEWS8VB4PIZCL7_20231030_105016_636_38_aplogcat.txt.zst with completion 4/36.
Extracting EV0451MJPW0KLF5GYSRF_20231030_094305_132_0_aplogcat.txt.zst with completion 5/36.
Extracting EV0454AZZCY3QWEC1D9B_20231030_103012_547_25_aplogcat.txt.zst with completion 6/36.
Extracting EV045IPXRBLP4ESQ5AVV_20231030_111412_487_52_aplogcat.txt.zst with completion 7/36.
Extracting EV045GP9DS2S26GO99ON_20231030_110412_473_47_aplogcat.txt.zst with completion 8/36.
Extracting EV0459VMONAM94KX3JR2_20231030_104510_736_34_aplogcat.txt.zst with completion 9/36.
Extracting EV045CR6Q80FJHZ8Z2V2_20231030_105124_540_39_aplogcat.txt.zst with completion 10/36.
Extracting EV045J4KJB7VUO87NVEW_2023

In [None]:
# Convert every txt.gz file under file_path whose name contains "logcat" to CSV (written into file_path).
extract_gz_files(file_path, "logcat")

In [None]:
# Convert every txt.gz file under file_path whose name contains "kernel" to CSV (written into file_path).
extract_gz_files(file_path, "kernel")

In [None]:
# Convert every txt.gz file under file_path whose name contains "event" to CSV (written into file_path).
extract_gz_files(file_path, "event")

In [6]:
# Merge every CSV found under file_path into one DataFrame sorted by timestamp.
# With no keyword given, the merged frame is also written to all_log.csv in the
# working directory.
all_log = merge_log_csv_files(file_path)

Found 36 txt.gz files.
Merging EV0459VMONAM94KX3JR2_20231030_104510_736_34_aplogcat.csv with completion 1/36.
Merging EV045GARKGRIM9YP3Q64_20231030_110212_472_46_aplogcat.csv with completion 2/36.
Merging EV045970BTQQIJT3CSOU_20231030_104351_477_33_aplogcat.csv with completion 3/36.
Merging EV045CRDTC1N956D02AC_20231030_105241_536_40_aplogcat.csv with completion 4/36.
Merging EV045B25CC30IFN0PBLA_20231030_104737_545_36_aplogcat.csv with completion 5/36.
Merging EV045F0VOC2U7KW73V7W_20231030_105624_539_43_aplogcat.csv with completion 6/36.
Merging EV045E30ZJ0RI39KU2GP_20231030_105504_543_42_aplogcat.csv with completion 7/36.
Merging EV045IPXRBLP4ESQ5AVV_20231030_111412_487_52_aplogcat.csv with completion 8/36.
Merging EV0454AZZCY3QWEC1D9B_20231030_103012_547_25_aplogcat.csv with completion 9/36.
Merging EV045H70X3M8AOBH7XOF_20231030_110612_537_48_aplogcat.csv with completion 10/36.
Merging EV045HL4PI676IZ7B3KQ_20231030_110812_449_49_aplogcat.csv with completion 11/36.
Merging EV0456YWW5

In [8]:
# Preview the first rows of the merged log.
all_log.head()

Unnamed: 0,date,timestamp,cpu,appid,ctxid,level,payload,7,8,9,10,11,12,13,14,15,16,17,18,19
837,10-30,09:43:05.000,0,1341,2380,W,car_service: [ISignalHandler.cpp:207]:IPCL_CAN...,,,,,,,,,,,,,
845,10-30,09:43:05.000,1,1339,2461,I,system_service: [SystemHalProtocol.cpp:GetTime...,,,,,,,,,,,,,
844,10-30,09:43:05.000,1,1339,2461,E,system_service: [Context.cpp:sendToService:645...,,,,,,,,,,,,,
843,10-30,09:43:05.000,1,1339,2461,I,system_service:,,,,,,,,,,,,,
838,10-30,09:43:05.000,1,1339,2461,I,system_service: [BaseDateTime.cpp:sendTimeChan...,,,,,,,,,,,,,


In [9]:
# Export the merged log to all_log.csv in the working directory.
# NOTE(review): merge_log_csv_files(file_path) already writes all_log.csv with
# the same options when called without a keyword, so this looks redundant
# unless all_log was modified in between — confirm.
all_log.to_csv("all_log.csv", encoding = "ISO-8859-1", index=None)

In [35]:
# Rows of interest: appid is 3921 or 3274, or ctxid is 3946 or 3274.
# NOTE(review): the meaning of these ids is not visible in this notebook —
# confirm them and consider naming them as constants.
id_mask = all_log[appid].isin([3921, 3274]) | all_log[ctxid].isin([3946, 3274])

# Payloads to drop. na=False makes rows with a missing payload evaluate to
# False instead of NaN (a NaN in the mask breaks the `~` negation), and
# regex=False matches the tags as plain substrings.
noise_mask = (
    all_log[payload].str.contains("ZebraLog", regex=False, na=False)
    | all_log[payload].str.contains("VoiceManager", regex=False, na=False)
)

# Optional time window (currently disabled); AND it into the row selector to enable:
#     & (all_log[timestamp] > "10:47:00.000")
#     & (all_log[timestamp] < "10:53:00.000")
all_log.loc[id_mask & ~noise_mask, [timestamp, payload]].to_csv(
    "traffic.csv", encoding="ISO-8859-1", index=None
)

In [16]:
# Inspect the column dtypes of the merged log.
all_log.dtypes

date         object
timestamp    object
cpu           int64
appid         int64
ctxid         int64
level        object
payload      object
7            object
8            object
9            object
10           object
11           object
12           object
13           object
14           object
15           object
16           object
17           object
18           object
19           object
dtype: object

In [None]:
# Load emc_error.csv from the working directory.
# NOTE(review): no visible cell in this notebook produces this file — confirm where it comes from.
emc_error = pd.read_csv("emc_error.csv", encoding = "ISO-8859-1", low_memory=False)

In [None]:
# Preview the first rows of the loaded error log.
emc_error.head()

In [None]:
# Select the rows whose `date` is null and aggregate every column with sum().
# NOTE(review): if the goal is to count rows with a missing date,
# `emc_error[date].isnull().sum()` would do that directly — confirm the intent.
emc_error[emc_error[date].isnull()].sum()