## Pre-NN

**Prepare logs for the neural network.**

I will follow the steps below:

1. Import packages and keywords

2. Load log files

3. Transforming keywords to integers

4. Extract keyword information from logs

## Import packages and keywords

In [1]:
# import packages
import numpy as np
import pandas as pd
import re
import csv
from itertools import chain

import boto3
from pyspark import SparkConf, SparkContext

In [2]:
# load keywords from csv
# NOTE(review): the "Unnamed: 0" column in the output below suggests keywords.csv was
# saved together with its index - confirm. Only the `keyword` column and the frame's
# own index are used by the later cells shown in this notebook.
keywords = pd.read_csv('keywords.csv')
keywords.head()

Unnamed: 0,keyword
0,java.lang.InterruptedException
1,NoSuchObjectException
2,AddressNotFoundException
3,java.io.IOException
4,java.net.SocketException


## Load log files

In [3]:
# list the log objects stored in S3
client = boto3.client('s3')  # low-level functional API
resource = boto3.resource('s3')  # high-level object-oriented API
my_bucket = resource.Bucket('adp.spark.app.logs.us-west-1.prd')  # substitute your own S3 bucket name here
# materialise the object summaries under the prefix so they can be counted and sliced later
files = list(my_bucket.objects.filter(Prefix='logs/aggregates'))

# load log content using PySpark
conf = (SparkConf()
         .setMaster("local")
         .setAppName("My app") # used to be Simple App
         .set("spark.driver.maxResultSize", "2g"))

# getOrCreate returns the already-running context when this cell is executed again,
# whereas SparkContext(conf=conf) raises because only one context may be active.
# Note: when an existing context is reused, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# report how many objects were listed under the prefix
print(f"Number of log files : {len(files)}")

Number of log files : 108382


In [5]:
# module-level holders for the most recent batch:
# cleaned file names and, per file, the list of its log lines
filename = []
filecontent = []

def append_func(files):
    """Load one batch of S3 log objects into the global ``filename`` / ``filecontent`` lists.

    Both globals are rebound to fresh lists on every call, so afterwards they
    describe only the batch passed in. Objects whose key does not contain
    'big.data.services.team.log' are skipped.

    Parameters
    ----------
    files : iterable
        Objects exposing a ``key`` attribute (here: the S3 object summaries
        listed in the loading cell).
    """
    global filename, filecontent

    filename, filecontent = [], []
    for s3_object in files:
        if 'big.data.services.team.log' not in s3_object.key:
            continue
        # drop the directory prefix and the fixed suffix, keeping the identifying part of the key
        short_name = s3_object.key.replace('logs/aggregates/','').replace('_asrd.cp.big.data.services.team.log','')
        filename.append(short_name)
        # read the object through Spark and bring all of its lines back as a list
        log_lines = sc.textFile('s3n://adp.spark.app.logs.us-west-1.prd/' + s3_object.key).collect()
        filecontent.append(log_lines)

In [6]:
#append_func(files)
#print(f'filename: {len(filename)}')
#print(f'filecontent: {len(filecontent)}')

## Transforming keywords to integers

In [7]:
# give every keyword a 1-based integer label (index value + 1), stored as text
# so it can later be concatenated into the '$label$' markers
label_numbers = np.asarray(keywords.index) + 1
keywords['label'] = [str(number) for number in label_numbers]

In [8]:
# preview the keyword table together with its new `label` column
keywords.head()

Unnamed: 0,keyword,label
0,java.lang.InterruptedException,1
1,NoSuchObjectException,2
2,AddressNotFoundException,3
3,java.io.IOException,4
4,java.net.SocketException,5


## Extract keyword information from logs

In [9]:
def match_keywords(line):
    """Replace every known keyword in one log line with its ``$label$`` marker.

    Reads the module-level ``keywords`` frame (columns ``keyword`` and
    ``label``). Keywords are applied in table order, so when two keywords
    overlap in the text, the one listed first is the one that gets marked.

    Parameters
    ----------
    line : str
        A single line of a log file.

    Returns
    -------
    str
        The line with each keyword occurrence replaced by ``'$' + label + '$'``.
    """
    # Pair keyword and label by row position. The previous enumerate() counter was
    # used as an index *label* (keywords.label[i]), which picks the wrong row or
    # raises KeyError as soon as the frame's index is not exactly 0..n-1.
    for keyword, label in zip(keywords.keyword, keywords.label):
        line = line.replace(keyword, '$' + label + '$')
    return line

In [10]:
def extract_keywords(filecontent, keywords):
    """Find the keywords occurring in every line of every log file.

    Parameters
    ----------
    filecontent : iterable of iterable of str
        One inner iterable of log lines per file.
    keywords : DataFrame-like
        Must expose ``keyword`` and ``label`` columns of equal length. This
        argument (not a global) decides which keywords are searched.

    Returns
    -------
    list of list of list of str
        For each file, for each line, the matched labels encoded as
        ``'$label$'`` in the order they appear in the line.

    Notes
    -----
    Keywords are applied in table order and a piece of text that has been
    matched is never matched again, which is the same outcome as replacing
    keywords one after another. Matches are tracked directly instead of being
    written into the line and re-read with a ``\\$\\d+\\$`` regex, so log text
    that already looks like a marker (e.g. Scala names such as
    ``$anonfun$1$$anonfun``) is no longer reported as a keyword.
    """
    # Blank / non-text rows cannot be searched for; skip them up front.
    pairs = [
        (keyword, str(label))
        for keyword, label in zip(keywords.keyword, keywords.label)
        if isinstance(keyword, str) and keyword
    ]

    def line_extract(line):
        # `pieces` holds untouched text as str and matched labels as 1-tuples,
        # in left-to-right order of the original line.
        pieces = [line]
        for keyword, label in pairs:
            next_pieces = []
            for piece in pieces:
                if isinstance(piece, tuple):
                    next_pieces.append(piece)
                    continue
                chunks = piece.split(keyword)
                next_pieces.append(chunks[0])
                # every gap between two chunks is one occurrence of the keyword
                for chunk in chunks[1:]:
                    next_pieces.append((label,))
                    next_pieces.append(chunk)
            pieces = next_pieces
        return ['$' + piece[0] + '$' for piece in pieces if isinstance(piece, tuple)]

    def file_extract(filecontent_file):
        return [line_extract(line) for line in filecontent_file]

    return [file_extract(filecontent_file) for filecontent_file in filecontent]

In [11]:
def unlist(keyword_in_file_all):
    """Flatten the per-line keyword matches of each file and decode ``$label$`` to ``label``.

    Parameters
    ----------
    keyword_in_file_all : iterable
        One entry per file; each entry is an iterable of per-line lists of
        ``'$label$'`` strings (the shape produced by ``extract_keywords``).

    Returns
    -------
    list of list of str
        One flat list of labels per file. A file without any match is
        represented by ``['0']``.
    """
    keyword_in_file_all_unlist = []
    for per_line_matches in keyword_in_file_all:
        labels = [
            marker.replace('$', '')
            for line_matches in per_line_matches
            for marker in line_matches
        ]
        # files that contain none of the listed keywords get the placeholder label 0
        keyword_in_file_all_unlist.append(labels if labels else ['0'])
    return keyword_in_file_all_unlist

In [12]:
def extract_results(filecontent):
    """Classify every log file as failed, successful or undetermined.

    Assumptions carried by this function:
    1. a log file contains a single result, which belongs to that log only;
    2. one log file corresponds to one log.

    Parameters
    ----------
    filecontent : iterable of iterable of str
        One inner iterable of log lines per file.

    Returns
    -------
    list of str
        Per file: ``'0'`` if any line contains the failure hook, otherwise
        ``'1'`` if any line contains the success hook, otherwise ``'-1'``.
    """
    failure_hook = "SparkSubmit.exceptionExitHook[failure]"
    success_hook = "SparkSubmit.successfulExitHook[success]"

    def classify(log_lines):
        saw_failure = False
        saw_success = False
        for log_line in log_lines:
            if failure_hook in log_line:
                saw_failure = True
            elif success_hook in log_line:
                saw_success = True
        # a failure anywhere in the file outranks a success
        if saw_failure:
            return str(0)
        if saw_success:
            return str(1)
        return str(-1)

    return [classify(log_lines) for log_lines in filecontent]

In [13]:
# accumulators that parsing() appends to, one entry per processed batch
keyword_in_file_all, keyword_in_file_all_unlist = [], []
job_result, file_name = [], []

In [14]:
def parsing(start=31532, count=2468):
    """Load a window of log files and append their extracted information to the result lists.

    Each file in ``files[start:start + count]`` is loaded on its own through
    ``append_func`` and then run through ``extract_keywords``, ``unlist`` and
    ``extract_results``. The outcome of every file is appended as one batch to
    the module-level lists ``file_name``, ``keyword_in_file_all``,
    ``keyword_in_file_all_unlist`` and ``job_result``.

    Parameters
    ----------
    start : int, default 31532
        Position in ``files`` of the first object to process.
    count : int, default 2468
        Maximum number of objects to process.

    Notes
    -----
    The function only appends, so calling it twice adds the same batches twice.
    """
    # Slicing clamps at the end of `files`. The previous files[size[i]:size[i+1]]
    # indexing raised IndexError on the final position, and never processed the
    # last file, whenever fewer than `count` files remained after `start`.
    for log_file in files[start:start + count]:
        # one-file batch; append_func may still filter the file out by name,
        # in which case empty lists are appended below
        append_func([log_file])

        file_name.append(filename)

        keyword_in_file_all_iter = extract_keywords(filecontent, keywords)
        keyword_in_file_all.append(keyword_in_file_all_iter)

        keyword_in_file_all_unlist_iter = unlist(keyword_in_file_all_iter)
        keyword_in_file_all_unlist.append(keyword_in_file_all_unlist_iter)

        job_result_iter = extract_results(filecontent)
        job_result.append(job_result_iter)

In [15]:
# run parsing function
# It appends to file_name, keyword_in_file_all, keyword_in_file_all_unlist and
# job_result, so this cell is not idempotent: re-run cell In [13] (which resets
# those lists) before executing it again.
parsing()

In [16]:
# remove the per-batch nesting added by parsing(), leaving one entry per log file
# NOTE: each name is rebuilt from itself, so running this cell twice flattens one
# level too many; restart from cell In [13] instead of re-running it.
keyword_in_file_all = [entry for batch in keyword_in_file_all for entry in batch]
keyword_in_file_all_unlist = [entry for batch in keyword_in_file_all_unlist for entry in batch]
job_result = [entry for batch in job_result for entry in batch]
file_name = [entry for batch in file_name for entry in batch]

In [17]:
# all four result lists should report the same number of files
for list_name, values in (
    ('file_name', file_name),
    ('keyword_in_file_all', keyword_in_file_all),
    ('keyword_in_file_all_unlist', keyword_in_file_all_unlist),
    ('job_result', job_result),
):
    print(f'{list_name}: {len(values)}')

file_name: 2463
keyword_in_file_all: 2463
keyword_in_file_all_unlist: 2463
job_result: 2463


In [18]:
# export the job result of every log file to csv
res = job_result
csvfile = '/home/ec2-user/UCD_MSBA_team/6Shanxing_Branch/results_in_csv/8job_result2.csv'

# res is a flat list of result strings, so each value is wrapped in a list to
# form a one-column row.
# newline='' hands line-ending control to the csv writer (as the csv module
# documentation requires), so the '\n' terminator is written unchanged on every platform.
with open(csvfile, "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in res:
        writer.writerow([val])

In [19]:
# export the keyword labels found in every log file to csv
res = keyword_in_file_all_unlist
csvfile = '/home/ec2-user/UCD_MSBA_team/6Shanxing_Branch/results_in_csv/8keyword_in_job2.csv'

# res is a list of lists: one row per log file, one cell per label.
# newline='' hands line-ending control to the csv writer (as the csv module
# documentation requires), so the '\n' terminator is written unchanged on every platform.
with open(csvfile, "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(res)

In [20]:
# export the cleaned name of every log file to csv
res = file_name
csvfile = '/home/ec2-user/UCD_MSBA_team/6Shanxing_Branch/results_in_csv/8file_name2.csv'

# res is a flat list, so each name is wrapped in a list to form a one-column row.
# newline='' hands line-ending control to the csv writer (as the csv module
# documentation requires), so the '\n' terminator is written unchanged on every platform.
with open(csvfile, "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in res:
        writer.writerow([val])

In [21]:
# timing the algorithm
#%timeit parsing()

In [22]:
# timing the algorithm
#%timeit extract_keywords(filecontent, keywords)

In [23]:
# timing the algorithm
#%timeit unlist(keyword_in_file_all)

In [24]:
# timing the algorithm
#%timeit extract_results(filecontent)

### [Conclusion]

[description of your conclusion]

### [Reference]

- https://spark.apache.org/docs/0.9.1/python-programming-guide.html

[reference website 2]

[reference website 3]