# Synchronize and compare WMArchive with the console (actionshistory)

In [1]:
import pandas as pd

## 1. Load failing workflow from wmarchive

In [2]:
import filter_messages

In [3]:
# Full task path of the workflow under study; the cells below use it to select
# this workflow's records from both data sources.
test_task = '/pdmvserv_task_HIG-RunIIFall17wmLHEGS-02145__v1_T_180705_162228_8813/HIG-RunIIFall17wmLHEGS-02145_0/HIG-RunIIFall17DRPremix-02708_0'

In [15]:
# [start, end] days written as YYYYMMDD integers; the range has to cover the
# period in which the desired workflow ran.
# NOTE(review): whether the end day itself is loaded is decided inside
# filter_messages.load_data -- confirm there.
timerange = [20180704, 20181004]

In [16]:
# Load the WMArchive records of every day in the time range.
# NOTE(review): `sc` is never defined in this notebook -- presumably the
# SparkContext injected by the Spark kernel; confirm before a fresh-kernel run.
avro_rdd = filter_messages.load_data(sc, timerange)

[20180704, 20181004]
['hdfs:///cms/wmarchive/avro/fwjr/2018/07/04', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/05', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/06', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/07', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/08', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/09', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/10', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/11', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/12', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/13', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/14', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/15', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/16', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/17', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/18', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/19', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/20', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/21', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/22', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/23', 'hdfs:///cms/wmarchive/avro/fwjr/2018/07/24', 'hdfs:///cms

In [18]:
# Predicate for avro_rdd.filter: keep only the failed jobs of the test task
def getFailing(row):
    """Return True only for a failed-job record that belongs to ``test_task``.

    ``row[0]`` is used as the record. Its ``task`` entry is required, while
    ``meta_data`` and the ``jobstate`` inside it are optional; a record without
    them is treated as not failed.
    """
    record = row[0]
    belongs_to_task = record["task"] == test_task
    job_info = record.get('meta_data', {})
    return belongs_to_task and job_info.get('jobstate', '') == 'jobfailed'

In [19]:
# Run the filter on the cluster and bring the matching records back to the driver
failures = avro_rdd.filter(getFailing).collect()

In [20]:
# Number of failed-job records found for the test task.
# The call form with a single argument prints the same text on Python 2 and 3.
print(len(failures))

40


In [22]:
# Save the collected failure records to disk.
# NOTE(review): presumably a cache so the Spark query need not be repeated --
# nothing in this notebook reads the file back.
import pickle
path = 'data/test_task.pkl'
# The relative 'data/' directory has to exist already; open() does not create it.
with open(path, 'wb') as f:
    pickle.dump(failures, f)

## 2. Load the console workflow

In [23]:
# Read the actionshistory export; orient='index' makes each top-level JSON key a row label
data = pd.read_json('/eos/user/l/llayer/AIErrorLogAnalysis/spark/data/actionshistory_300719.json', orient='index')
# Turn those row labels into an ordinary column named 'task_name'
data_index_reset = data.reset_index().rename(columns={'index': 'task_name'})

In [25]:
# Pick the actionshistory row of the test task; .iloc[0] raises IndexError when
# the task is not present in the export.
console_wf = data_index_reset[data_index_reset['task_name'] == test_task].iloc[0]

In [32]:
def merge_two_dicts(x, y):
    """Return a new dict holding the entries of ``x`` and ``y``; neither input is modified.

    When a key is present in both inputs and both values are dicts, the two
    values are merged recursively instead of ``y``'s value replacing ``x``'s.
    A plain ``dict.update`` would drop every site listed under an error code in
    ``x`` as soon as the same error code also shows up in ``y``.

    For every other key collision the value from ``y`` wins, as with
    ``dict.update``.
    """
    merged = x.copy()   # shallow copy, so x itself is never modified
    for key, value in y.items():
        current = merged.get(key)
        if isinstance(current, dict) and isinstance(value, dict):
            # Build a fresh inner dict rather than updating x's value in place
            merged[key] = merge_two_dicts(current, value)
        else:
            merged[key] = value
    return merged

# Combine the good_sites and bad_sites error tables of the selected workflow
# into a single lookup.
console_wf_site_dict = merge_two_dicts(console_wf['errors']['good_sites'], console_wf['errors']['bad_sites'])

In [33]:
# Show the combined console table.
# The call form with a single argument prints the same text on Python 2 and 3.
print(console_wf_site_dict)

{u'11003': {u'T1_FR_CCIN2P3': 1, u'T1_UK_RAL': 1, u'T2_US_Florida': 1, u'T1_RU_JINR': 2}, u'71305': {u'T2_CH_CERN_HLT': 7}, u'99303': {u'T2_IT_Rome': 2}, u'-1': {u'T1_ES_PIC_Disk': 1, u'T2_IT_Rome': 1, u'T1_FR_CCIN2P3_Disk': 1, u'T1_DE_KIT_Disk': 1, u'T1_IT_CNAF_Disk': 1, u'T1_RU_JINR_Disk': 1, u'T1_UK_RAL_Disk': 1}, u'99400': {u'NoReportedSite': 1}, u'71104': {u'Unknown': 1}, u'92': {u'T2_DE_RWTH': 1, u'T1_ES_PIC': 1, u'T2_UK_London_IC': 2, u'T2_UK_London_Brunel': 2, u'T2_US_Purdue': 1}, u'85': {u'T2_BE_IIHE': 1, u'T2_US_Purdue': 1, u'T2_FR_IPHC': 2}, u'50115': {u'T2_US_UCSD': 1}}


## 3. Compare the entries

### Play with sites and error codes

In [83]:
# Error code and site to inspect in both data sources. The code is written as a
# string here; print_wmarchive_entries converts it with int() before comparing.
test_code = '85'
test_site = 'T2_US_Purdue'

In [84]:
def print_console_entries(sites, test_code = None, test_site = None):
    """Print the console (actionshistory) error counts, optionally filtered.

    Parameters
    ----------
    sites : dict
        Mapping ``error code -> {site name -> count}``.
    test_code : str or int, optional
        Report only this error code. Codes are compared as text, so ``85`` and
        ``'85'`` select the same entries.
    test_site : str, optional
        Report only this site.

    Prints one ``<code> <site> <count>`` line per selected entry, followed by
    the sum of the selected counts. Returns nothing.
    """
    # Normalise the requested code once so that int and str filters behave alike
    wanted_code = None if test_code is None else str(test_code)

    counter = 0
    for error, site_dict in sites.items():
        if wanted_code is not None and str(error) != wanted_code:
            continue

        for site, count in site_dict.items():
            if test_site is not None and site != test_site:
                continue

            counter += count
            # A single pre-formatted string prints identically on Python 2 and 3
            print('%s %s %s' % (error, site, count))

    print('Total counts console %s' % (counter,))

In [85]:
def print_wmarchive_entries(entry, test_code = None, test_site = None):
    """Print the errors stored in the steps of one WMArchive record.

    Parameters
    ----------
    entry : sequence
        ``entry[0]`` is the record. Its ``'steps'`` list holds dicts with a
        ``'site'`` and an ``'errors'`` list whose items carry ``'exitCode'``
        and ``'type'``.
    test_code : str or int, optional
        Report only errors whose exit code equals ``int(test_code)``. A value
        that cannot be converted to an integer selects nothing.
    test_site : str, optional
        Report only steps whose site equals this name.

    Returns
    -------
    bool
        True if at least one error was printed, otherwise False.
    """
    # Convert the filter once instead of once per inspected error
    wanted_code = None
    if test_code is not None:
        try:
            wanted_code = int(test_code)
        except (TypeError, ValueError):
            # Exit codes are compared as integers, so this filter matches nothing
            return False

    found_entry = False
    for n, step in enumerate(entry[0][u'steps']):
        errors = step['errors']
        site = step['site']
        if test_site is not None and site != test_site:
            continue

        for err in errors:
            error = err['exitCode']
            if wanted_code is not None and error != wanted_code:
                continue

            # Single pre-formatted strings print identically on Python 2 and 3
            print('Step %s' % (n,))
            print('%s %s %s' % (error, err['type'], site))
            found_entry = True

    return found_entry


In [86]:
# Console view of the selected error code / site
print_console_entries(console_wf_site_dict, test_code, test_site)

85 T2_US_Purdue 1
Total counts console 1


In [87]:
# Count the WMArchive records that contain at least one error matching the
# selected code / site; each such record's errors are printed as a side effect.
counter = 0
for entry in failures:

    found = print_wmarchive_entries(entry, test_code, test_site)
    if found:
        # Blank separator line after each record that printed something.
        # A bare `print` would silently do nothing on Python 3.
        print('')
        counter += 1

print('Total wmarchive entries: %s' % (counter,))

Step 2
85 CMSSWStepFailure T2_US_Purdue
Step 2
85 WMAgentStepExecutionError T2_US_Purdue

Total wmarchive entries: 1


### Print the actionshistory (console) entries missing in wmarchive

In [103]:
def search_wmarchive_entries(entries, test_code, test_site):
    """Tell whether any WMArchive record reports ``test_code`` on ``test_site``.

    Parameters
    ----------
    entries : iterable
        Records as collected from WMArchive. ``entry[0]['steps']`` is a list of
        dicts with a ``'site'`` and an ``'errors'`` list whose items carry an
        ``'exitCode'``.
    test_code : str or int
        Exit code to look for; compared as ``int(test_code)``.
    test_site : str
        Site name a step must have for its errors to be considered.

    Returns
    -------
    bool
        True as soon as a matching error is found, otherwise False. This
        includes empty ``entries`` and a ``test_code`` that is not an integer.
    """
    # Convert the code once instead of once per inspected error
    try:
        wanted_code = int(test_code)
    except (TypeError, ValueError):
        # Exit codes are compared as integers, so such a code matches nothing
        return False

    for entry in entries:
        for step in entry[0][u'steps']:
            errors = step['errors']
            if step['site'] != test_site:
                continue

            for err in errors:
                if err['exitCode'] == wanted_code:
                    return True

    # Explicit negative result instead of falling off the end with None
    return False
        
    

def print_missing_wmarchive(sites, entries = None):
    """Print the console entries that have no matching error in WMArchive.

    Parameters
    ----------
    sites : dict
        Console table ``error code -> {site name -> count}``.
    entries : iterable, optional
        WMArchive records to search. Defaults to the notebook-level
        ``failures`` list, which is what the function always used before this
        parameter existed.

    Prints one ``<code> <site> <count>`` line for every console entry for which
    ``search_wmarchive_entries`` finds no record. Returns nothing.
    """
    if entries is None:
        entries = failures

    for error, site_dict in sites.items():
        for site, count in site_dict.items():
            # Any falsy result (False or None) means "not found"
            if not search_wmarchive_entries(entries, error, site):
                # A single pre-formatted string prints identically on Python 2 and 3
                print('%s %s %s' % (error, site, count))

In [104]:
# List the console (error code, site, count) entries with no matching wmarchive record
print_missing_wmarchive(console_wf_site_dict)

71305 T2_CH_CERN_HLT 7
-1 T1_ES_PIC_Disk 1
-1 T2_IT_Rome 1
-1 T1_FR_CCIN2P3_Disk 1
-1 T1_DE_KIT_Disk 1
-1 T1_IT_CNAF_Disk 1
-1 T1_RU_JINR_Disk 1
-1 T1_UK_RAL_Disk 1
71104 Unknown 1
