In [None]:
import pandas as pd
import os, sys

# Script form would be os.path.dirname(__file__); the working directory is
# used instead (presumably because __file__ is unavailable in a notebook).
current_dir = os.getcwd()
# Put the parent of the working directory first on the import search path.
root_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.insert(0, root_dir)

def get_file_by_suffix(suffix_name, dir_name=None, inclusive_keyword=None, exclusive_keyword=None):
    """Recursively collect files whose name ends with ``.<suffix_name>``.

    Args:
        suffix_name: File suffix without the leading dot (e.g. ``"csv"`` or
            ``"txt.gz"``). Non-string values are converted with ``str``.
            The suffix is compared case-insensitively.
        dir_name: Directory to search, resolved against the module-level
            ``current_dir``. When falsy, ``current_dir`` itself is searched.
        inclusive_keyword: If given, only file names containing this
            substring (case-sensitive) are kept.
        exclusive_keyword: If given, file names containing this substring
            (case-sensitive) are dropped. May be combined with
            ``inclusive_keyword``; both filters are then applied.

    Returns:
        dict mapping each matching file name to its full path. The key is the
        bare file name, so files sharing a name in different sub-directories
        collide and the one visited last by ``os.walk`` wins.
    """
    if dir_name:
        target_dir = os.path.abspath(os.path.join(current_dir, dir_name))
    else:
        target_dir = current_dir

    # Normalise once so any casing of the extension (csv / CSV / Csv / ...) matches.
    wanted_ending = '.' + str(suffix_name).lower()

    paths_hash = dict()
    for path, _sub_dirs, files in os.walk(target_dir):
        for file in files:
            if not file.lower().endswith(wanted_ending):
                continue
            if inclusive_keyword and inclusive_keyword not in file:
                continue
            if exclusive_keyword and exclusive_keyword in file:
                continue
            paths_hash[file] = os.path.join(path, file)

    return paths_hash

# Column label for tagging rows with their log type; in this file it is only
# referenced from a commented-out line in extract_gz_files.
log_type_col = "log_type"

# Labels for the log columns, in positional order.
cols = ['date', 'timestamp', 'appid', 'ctxid', "level", 'payload', "payload_ext"]
date, timestamp, appid, ctxid, level, payload, payload_ext = cols

# Positional index -> label map used to rename header-less columns.
# Only the first len(cols) - 1 positions are mapped, so position 6
# (payload_ext) keeps its numeric label.
# NOTE(review): a later cell reads column "6", so leaving the last label
# unmapped may be intentional - confirm before changing.
rename_cols = dict(enumerate(cols[:-1]))

def extract_gz_files(dir_name, log_type):
    """Convert gzip-compressed fixed-width ``*.txt.gz`` logs into CSV files.

    Every ``txt.gz`` file under ``dir_name`` (searched recursively through
    ``get_file_by_suffix``) whose name contains ``log_type`` is read with
    ``pandas.read_fwf``: the first three lines are skipped, no header row is
    assumed and column boundaries are inferred. Columns are renamed through
    the module-level ``rename_cols`` map and the frame is written as
    ``<name>.csv`` directly inside ``dir_name`` (also for sources found in
    sub-directories).

    Args:
        dir_name: Directory containing the logs; also the output directory.
        log_type: Substring that must occur in a file name for it to be
            processed (e.g. ``"logcat"``).

    Returns:
        None. Progress is printed; empty inputs are reported and skipped.
    """
    gz_suffix = "txt.gz"
    all_gz_dict = get_file_by_suffix(gz_suffix, dir_name=dir_name, inclusive_keyword=log_type)
    count = len(all_gz_dict)
    print(f"Found {count} txt.gz files.")

    for i, (key, gz_path) in enumerate(all_gz_dict.items(), start=1):
        print(f"Extracting {key} with completion {i}/{count}.")
        try:
            log = pd.read_fwf(gz_path,
                              compression="gzip",
                              skiprows=3,
                              colspecs="infer",
                              header=None,
                              encoding="ISO-8859-1")
        except pd.errors.EmptyDataError:
            print(f"Note: {key} was empty. Skipping.")
            continue

        # Strip the suffix by length instead of str.replace: the lookup matches
        # the suffix in any casing, and a name still ending in ".GZ" would make
        # to_csv write over a source log that sits directly in dir_name.
        csv_name = key[:-len(gz_suffix)] + "csv"
        file_path = os.path.join(dir_name, csv_name)

        log = log.rename(columns=rename_cols)
        log.to_csv(file_path, encoding="ISO-8859-1", index=False)

    print("Extraction completed.")
    
def merge_log_csv_files(dir_name, inclusive_keyword=None):
    """Concatenate the CSV logs under ``dir_name`` and sort them by timestamp.

    CSV files are located recursively through ``get_file_by_suffix``. The
    merged frame is sorted on its ``"timestamp"`` column and written to
    ``<inclusive_keyword>.csv`` (or ``all_log.csv`` when no keyword is given)
    in the current working directory.

    Args:
        dir_name: Directory to search for ``.csv`` files.
        inclusive_keyword: Optional substring a file name must contain; also
            used as the stem of the output file name.

    Returns:
        The merged, timestamp-sorted DataFrame. When no CSV file is found an
        empty DataFrame is returned and no output file is written.

    Raises:
        KeyError: If the merged data has no ``"timestamp"`` column.
    """
    all_csv_dict = get_file_by_suffix("csv", dir_name=dir_name, inclusive_keyword=inclusive_keyword)
    count = len(all_csv_dict)
    print(f"Found {count} csv files.")

    if not all_csv_dict:
        # Sorting an empty, column-less frame by "timestamp" would raise.
        print("Nothing to merge.")
        return pd.DataFrame()

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    frames = []
    for i, (key, csv_path) in enumerate(all_csv_dict.items(), start=1):
        print(f"Merging {key} with completion {i}/{count}.")
        frames.append(pd.read_csv(csv_path, encoding="ISO-8859-1", low_memory=False))

    all_log = pd.concat(frames).sort_values(by=["timestamp"])
    print("Merging completed.")

    output_stem = "all_log" if inclusive_keyword is None else inclusive_keyword
    all_log.to_csv(output_stem + ".csv", encoding="ISO-8859-1", index=False)
    return all_log

In [None]:
# Log directory handed to the extract/merge helpers below as dir_name.
file_path = "emc/emc_sws"

In [None]:
# Convert every txt.gz file whose name contains "logcat" into a CSV.
extract_gz_files(file_path, "logcat")

In [None]:
# Convert every txt.gz file whose name contains "kernel" into a CSV.
extract_gz_files(file_path, "kernel")

In [None]:
# Convert every txt.gz file whose name contains "event" into a CSV.
extract_gz_files(file_path, "event")

In [None]:
# Merge every CSV found under file_path into one timestamp-sorted frame.
# With no keyword given, the helper also writes the result to "all_log.csv".
all_log = merge_log_csv_files(file_path)

In [None]:
# Replace missing cells with empty strings (in place) so the .str.contains
# filters in later cells produce plain booleans rather than NaN entries.
all_log.fillna("", inplace=True)

In [None]:
# Rows whose column "6" mentions the monitor data-frame handler. Position 6
# is absent from rename_cols, so it keeps its numeric label, which reads back
# from CSV as the string "6".
all_log[all_log["6"].str.contains("monitor-monitor_data_frame_handler")]

In [None]:
# Rows with a timestamp before 14:11:46.464. The comparison is between
# strings, hence lexicographic - assumes zero-padded HH:MM:SS.mmm values;
# TODO confirm against the data.
before_buffer_full = all_log[all_log[timestamp] < "14:11:46.464"].copy()

In [None]:
# Save the pre-cutoff slice to CSV without the index column.
before_buffer_full.to_csv("before_buffer_full.csv", encoding = "ISO-8859-1", index=None)

In [None]:
# Drop rows repeating the same timestamp, payload and level; the first
# occurrence of each combination is kept.
new_log = all_log.drop_duplicates(subset=[timestamp, payload, level]).copy()

In [None]:
# Case-insensitive keyword match on the payload, limited to the timestamp
# window 10:24 < t < 10:30 (string comparison), exported to log_focus.csv.
payload_lower = new_log[payload].str.lower()
keyword_hit = (
    payload_lower.str.contains("power")
    | payload_lower.str.contains("kernel")
    | payload_lower.str.contains("ipk")
)
in_window = (new_log[timestamp] < "10:30") & (new_log[timestamp] > "10:24")
new_log[keyword_hit & in_window].to_csv("log_focus.csv", encoding="ISO-8859-1", index=None)

In [None]:
# Case-sensitive keyword match on the payload for rows before 15:03:12
# (string comparison).
# NOTE(review): this writes to the same "log_focus.csv" as the previous
# filter cell and replaces its output - confirm that is intended.
all_payload = all_log[payload]
all_keyword_hit = (
    all_payload.str.contains("SRV")
    | all_payload.str.contains("kernel")
    | all_payload.str.contains("answer")
)
before_cutoff = all_log[timestamp] < "15:03:12"
all_log[all_keyword_hit & before_cutoff].to_csv("log_focus.csv", encoding="ISO-8859-1", index=None)

In [None]:
# De-duplicate on timestamp and payload only (first occurrence kept).
# NOTE(review): `formated` is not used by any cell visible here.
formated = all_log.drop_duplicates(subset=[timestamp, payload]).copy()

In [None]:
# Export the merged log without the index column. When the cells run top to
# bottom this replaces the all_log.csv written by merge_log_csv_files, now
# with missing cells filled by empty strings.
all_log.to_csv("all_log.csv", encoding = "ISO-8859-1", index=None)

In [None]:
# Load emc_error.csv from the working directory; no cell visible here
# produces this file.
emc_error = pd.read_csv("emc_error.csv", encoding = "ISO-8859-1", low_memory=False)

In [None]:
# Preview the first rows of the loaded frame.
emc_error.head()

In [None]:
# Select the rows whose date is missing and sum every column over them.
# NOTE(review): if a count of such rows was intended,
# emc_error[date].isnull().sum() gives that - confirm.
emc_error[emc_error[date].isnull()].sum()