# Create word embeddings from small error messages

In [1]:
import utils as ut

In [2]:
reload(ut)

<module 'utils' from 'utils.pyc'>

In [1]:
! hdfs dfs -ls hdfs:///cms/users/llayer/

19/08/05 13:09:27 WARN ipc.Client: Exception encountered while connecting to the server : org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
Found 22 items
-rw-r--r--   3 llayer zh   81467745 2019-05-08 17:46 hdfs:///cms/users/llayer/actionhist.csv
-rw-r--r--   3 llayer zh   34737856 2019-05-08 16:47 hdfs:///cms/users/llayer/actionshist.h5
-rw-r--r--   3 llayer zh   20926756 2019-03-06 14:07 hdfs:///cms/users/llayer/actionshistory.json
drwxr-xr-x   - llayer zh          0 2019-05-19 18:47 hdfs:///cms/users/llayer/debug0.csv
drwxr-xr-x   - llayer zh          0 2019-05-20 16:48 hdfs:///cms/users/llayer/debug1.csv
drwxr-xr-x   - llayer zh          0 2019-05-20 19:05 hdfs:///cms/users/llayer/debug2.csv
drwxr-xr-x   - llayer zh          0 2019-05-20 19:09 hdfs:///cms/users/llayer/debug3.csv
drwxr-xr-x   - llayer zh          0 2019-05-20 19:23 hdfs:///cms/users/llayer/d

In [24]:
! hdfs dfs -rm -r -skipTrash hdfs:///cms/users/llayer/output*

Deleted hdfs:///cms/users/llayer/output.csv


## 1. Load the data from HDFS with the WMArchive entries

In [344]:
#timerange = [20180706, 20180706]
#timerange = [20171011, 20190401]
#timerange = [20190207, 20190801]
#timerange = [20180601, 20190207]
#timerange = [20171011, 20180601]
#timerange = [20180704, 20181004]  # for test task '/pdmvserv_task_HIG-RunIIFall17wmLHEGS-02145__v1_T_180705_162228_8813/HIG-RunIIFall17wmLHEGS-02145_0/HIG-RunIIFall17DRPremix-02708_0'
timerange = [20170101, 20171009]
dirs = ut.getDirs( timerange )

In [345]:
print dirs

['hdfs:///cms/wmarchive/avro/fwjr/2017/01/01', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/02', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/03', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/04', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/05', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/06', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/07', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/08', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/09', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/10', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/11', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/12', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/13', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/14', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/15', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/16', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/17', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/18', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/19', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/20', 'hdfs:///cms/wmarchive/avro/fwjr/2017/01/21', 'hdfs:///cms/wmarchive/avro/fwjr/

In [346]:
#schema_file = 'hdfs:///cms/wmarchive/avro/schema.avsc'
schema_file = 'hdfs:///cms/wmarchive/avro/schemas/current.avsc.20161215'
rdd = sc.textFile(schema_file, 1).collect()

In [347]:
# define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
schema = ''.join(avsc.split()) # remove spaces in avsc map
conf = {"avro.schema.input.key": schema}

In [348]:
# define newAPIHadoopFile parameters, java classes
aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
akey="org.apache.avro.mapred.AvroKey"
awrite="org.apache.hadoop.io.NullWritable"
aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

In [349]:
data_path = dirs
# load data from HDFS
if  isinstance(data_path, list):
    avro_rdd = sc.union([sc.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
else:
    avro_rdd = sc.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

## 2. Filter out the failing tasks and create a DF

In [13]:
# filter the tasks - keep only failing 
def getFailing(row):
    rec = row[0]
    meta = rec.get('meta_data', {})
    if meta.get('jobstate', '') != 'jobfailed':
        return False
    return True

# create task, site, error key  
def avro_rdd_KV(row):
    rec = row[0]
    task = rec["task"]
    steps = rec.get('steps', [])
    exit_code = []
    site = []
    error_msg = []
    error_type = []
    
    
    for step in steps:
        errors = step['errors']
        #details = []
        #exitCodes = []
        for error in errors:
            #details.append(error['details'])
            #exitCodes.append(error['exitCode'])
            exit_code.append(error['exitCode'])
            error_type.append(error['type'])
            error_msg.append( error['details'].replace("\n", " ").replace('\r', ' ') )
            site.append( step.get('site','') )
    
    return [(task, site, error, error_type, msg) for error, msg, site, error_type in zip(exit_code, error_msg, site, error_type) ]

In [70]:
# Filter the data
failing_workflows = avro_rdd.filter(lambda x : getFailing(x)).flatMap(lambda x : avro_rdd_KV(x))

In [23]:
failing_workflows_df_test = failing_workflows.toDF(["task_name", "site", "error", "error_msg"])

In [25]:
failing_workflows_df_test.show()

+--------------------+----+-----+--------------------+
|           task_name|site|error|           error_msg|
+--------------------+----+-----+--------------------+
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv_task_BT...|null|99303|Could not find jo...|
|/pdmvserv

In [None]:
def map_to_KV(row):
    
    # Assume that the first error code per step is the correct one
    rec = row[0]
    task = rec["task"]
    steps = rec.get('steps', [])
    
    exit_code_first = []
    sites_first = []
    exit_codes = []
    error_msg = []
    error_type = []
    steps = []
    names = []
    peakvaluerss = []
    peakvaluevsize = []
    writeTotalMB = []
    readPercentageOps = []
    readAveragekB = []
    readTotalMB = []
    readNumOps = []
    readCachePercentageOps = []
    readMBSec = []
    writeTotalSecs = []
    readTotalSecs = []
    readMaxMSec = []
    TotalJobCPU = []
    NumberOfStreams = []
    TotalInitCPU = []
    TotalEventCPU = []
    AvgEventCPU = []
    EventThroughput = []
    TotalInitTime = []
    AvgEventTime = []
    NumberOfThreads = []
    MinEventCPU = []
    MaxEventTime = []
    TotalJobTime = []
    TotalLoopCPU = []
    MinEventTime = []
    MaxEventCPU = []
    
    
    
    for counter, step in enumerate(steps):
        errors = step['errors']
        
        if len(errors) > 0:
            # Save the first exit code per step
            exit_code_first.append(errors[0]['exitCode'])
            # Save the site
            sites_first.append( step.get('site','') )
            # Save the step number
            #steps.append(counter)
            # Save the name
            #names.append( step.get('name', '') )
            """
            # Save memory
            peakvaluerss.append( errors['performance']['memory']['PeakValueRss'] )
            peakvaluevsize.append( errors['performance']['memory']['PeakValueVsize'] )
            # Save storage
            writeTotalMB.append(errors['performance']['storage']['writeTotalMB'])
            readPercentageOps.append(errors['performance']['storage']['readPercentageOps'])
            readAveragekB.append(errors['performance']['storage']['readAveragekB'])
            readTotalMB.append(errors['performance']['storage']['readTotalMB'])
            readNumOps.append(errors['performance']['storage']['readNumOps'])
            readCachePercentageOps.append(errors['performance']['storage']['readCachePercentageOps'])
            readMBSec.append(errors['performance']['storage']['readMBSec'])
            writeTotalSecs.append(errors['performance']['storage']['writeTotalSecs'])
            readTotalSecs.append(errors['performance']['storage']['readTotalSecs'])
            readMaxMSec.append(errors['performance']['storage']['readMaxMSec'])
            # Save CPU
            TotalJobCPU.append(errors['performance']['cpu']['TotalJobCPU'])
            NumberOfStreams.append(errors['performance']['cpu']['NumberOfStreams'])
            TotalInitCPU.append(errors['performance']['cpu']['TotalInitCPU'])
            TotalEventCPU.append(errors['performance']['cpu']['TotalEventCPU'])
            AvgEventCPU.append(errors['performance']['cpu']['AvgEventCPU'])
            EventThroughput.append(errors['performance']['cpu']['EventThroughput'])
            TotalInitTime.append(errors['performance']['cpu']['TotalInitTime'])
            AvgEventTime.append(errors['performance']['cpu']['AvgEventTime'])
            NumberOfThreads.append(errors['performance']['cpu']['NumberOfThreads'])
            MinEventCPU.append(errors['performance']['cpu']['MinEventCPU'])
            MaxEventTime.append(errors['performance']['cpu']['MaxEventTime'])
            TotalJobTime.append(errors['performance']['cpu']['TotalJobTime'])
            TotalLoopCPU.append(errors['performance']['cpu']['TotalLoopCPU'])
            MinEventTime.append(errors['performance']['cpu']['MinEventTime'])
            MaxEventCPU.append(errors['performance']['cpu']['MaxEventCPU'])            
            """
            
            # Loop over the errors
            for error in errors:
                                   
                exit_codes.append(error['exitCode'])
                error_type.append( error['type'] )
                error_msg.append( error['details'] )
                
    #return [("{0}{1}{2}".format(task, exit_code, site), (task, exit_code, site)) for exit_code, site in zip(exit_code_first, site_first)]
    return  [(task, site, exit_code) for site, exit_code in zip(sites_first, exit_code_first) ]
    #return [((task, e, s), rec) for e, s in zip(exit_code_first, sites)]

## 2.1 Filter the error chain

In [372]:
def map_to_KV(row):
    
    # Assume that the first error code per step is the correct one
    rec = row[0]
    task = rec["task"]
    steps = rec.get('steps', [])
    
    exit_code_first = []
    sites = []
    exit_codes = []
    error_msg = []
    error_type = []
    steps_counter = []
    names = []
    peakvaluerss = []
    peakvaluevsize = []
    writeTotalMB = []
    readPercentageOps = []
    readAveragekB = []
    readTotalMB = []
    readNumOps = []
    readCachePercentageOps = []
    readMBSec = []
    writeTotalSecs = []
    readTotalSecs = []
    readMaxMSec = []
    TotalJobCPU = []
    NumberOfStreams = []
    TotalInitCPU = []
    TotalEventCPU = []
    AvgEventCPU = []
    EventThroughput = []
    TotalInitTime = []
    AvgEventTime = []
    NumberOfThreads = []
    MinEventCPU = []
    MaxEventTime = []
    TotalJobTime = []
    TotalLoopCPU = []
    MinEventTime = []
    MaxEventCPU = []
    
    
    
    for counter, step in enumerate(steps):
        errors = step['errors']
        
        if len(errors) > 0:
            # Save the first exit code per step
            exit_code_first.append(errors[0]['exitCode'])
            # Save the site
            sites.append( step.get('site','') )
            # Save the step number
            #steps.append(counter)
            # Save the name
            #names.append( step.get('name', '') )
            
            # Loop over the errors
            for error in errors:
                                   
                exit_codes.append(error['exitCode'])
                steps_counter.append(counter)
                names.append( step.get('name', '') )
                error_type.append( error['type'] )
                error_msg.append( error['details'].replace("\n", " ").replace('\r', ' ') )
                
                
                # Save memory
                for k,v in step['performance']['memory'].iteritems():
                    
                    value = -1. if v is None else v
                    if k == 'PeakValueVsize':
                        peakvaluevsize.append(value)
                    if k == 'PeakValueRss':
                        peakvaluerss.append(value)
                
                # Save storage
                for k,v in step['performance']['storage'].iteritems():
                    
                    value = -1. if v is None else v
                    if k == 'writeTotalMB':
                        writeTotalMB.append(value)
                    if k == 'readPercentageOps':
                        readPercentageOps.append(value)                        
                    if k == 'readAveragekB':
                        readAveragekB.append(value)
                    if k == 'readTotalMB':
                        readTotalMB.append(value)                     
                    if k == 'readNumOps':
                        readNumOps.append(value)
                    if k == 'readCachePercentageOps':
                        readCachePercentageOps.append(value)                        
                    if k == 'writeTotalSecs':
                        writeTotalSecs.append(value)
                    if k == 'readMBSec':
                        readMBSec.append(value)                  
                    if k == 'readTotalSecs':
                        readTotalSecs.append(value)
                    if k == 'readMaxMSec':
                        readMaxMSec.append(value)                  

                # Save cpu
                for k,v in step['performance']['cpu'].iteritems():
                    
                    value = -1. if v is None else v
                    if k == 'TotalJobCPU':
                        TotalJobCPU.append(value)
                    if k == 'NumberOfStreams':
                        NumberOfStreams.append(value)  
                    if k == 'TotalInitCPU':
                        TotalInitCPU.append(value)
                    if k == 'TotalEventCPU':
                        TotalEventCPU.append(value)                 
                    if k == 'AvgEventCPU':
                        AvgEventCPU.append(value)
                    if k == 'EventThroughput':
                        EventThroughput.append(value)  
                    if k == 'TotalInitTime':
                        TotalInitTime.append(value)
                    if k == 'AvgEventTime':
                        AvgEventTime.append(value)                  
                    if k == 'NumberOfThreads':
                        NumberOfThreads.append(value)
                    if k == 'MinEventCPU':
                        MinEventCPU.append(value)  
                    if k == 'MaxEventTime':
                        MaxEventTime.append(value)
                    if k == 'TotalJobTime':
                        TotalJobTime.append(value)
                    if k == 'TotalLoopCPU':
                        TotalLoopCPU.append(value)  
                    if k == 'MinEventTime':
                        MinEventTime.append(value)
                    if k == 'MaxEventCPU':
                        MaxEventCPU.append(value)          
                        
                        
    # Hack for the 2017 schema
    if len(NumberOfStreams) == 0:
        for i in range(len(AvgEventCPU)):
            NumberOfStreams.append(-1.)
            TotalInitCPU.append(-1.) 
            TotalInitTime.append(-1.) 
            NumberOfThreads.append(-1.)
    
    res = (exit_codes, error_msg, error_type, steps_counter, names, peakvaluerss, peakvaluevsize, writeTotalMB, readPercentageOps, readAveragekB, readTotalMB,\
            readNumOps, readCachePercentageOps, readMBSec, writeTotalSecs, readTotalSecs, readMaxMSec, 
            TotalJobCPU, NumberOfStreams, TotalInitCPU, TotalEventCPU, AvgEventCPU, EventThroughput, 
            TotalInitTime, AvgEventTime, NumberOfThreads, MinEventCPU, MaxEventTime, TotalJobTime,
            TotalLoopCPU, MinEventTime,MaxEventCPU)
        
    #return [("{0}{1}{2}".format(task, exit_code, site), (task, exit_code, site)) for exit_code, site in zip(exit_code_first, site_first)]
    #return  [(task, site, exit_code) for site, exit_code in zip(sites_first, exit_code_first) ]
    return [((task, e, s), res) for e, s in zip(exit_code_first, sites)]


In [373]:
test = avro_rdd.filter(lambda x : getFailing(x)).take(1)

In [367]:
print list(test[0][0]['steps'][0]['performance']['cpu'])

[u'TotalJobCPU', u'TotalEventCPU', u'AvgEventCPU', u'EventThroughput', u'AvgEventTime', u'MinEventCPU', u'MaxEventTime', u'TotalJobTime', u'TotalLoopCPU', u'MinEventTime', u'MaxEventCPU']


In [370]:
# Filter the data
failing_workflows = avro_rdd.filter(lambda x : getFailing(x)).flatMap(lambda x : map_to_KV(x))
#failing_workflows = avro_rdd.filter(lambda x : getFailing(x)).flatMap(lambda x : map_to_KV(x))

In [374]:
print failing_workflows.take(1)



In [None]:
failing_workflows.show()

In [180]:
def get_key(x):
    return "{0}{1}{2}".format(x[0], x[1], x[2])
m = failing_workflows.map(lambda x: (get_key(x),x))

In [375]:
# Reduce redundant keys
r = failing_workflows.reduceByKey(lambda x,y: (x))

In [376]:
r.take(1)

[((u'/pdmvserv_task_B2G-RunIISummer16DR80Premix-01169__v1_T_161227_164426_7583/B2G-RunIISummer16DR80Premix-01169_0/B2G-RunIISummer16DR80Premix-01169_1/B2G-RunIISummer16DR80Premix-01169_1MergeAODSIMoutput',
   99999,
   u'T2_FR_GRIF_LLR'),
  ([99999, 139, 50115, 99999],
   [u'Could not find report file for step stageOut1!',
    u"  Adding last 25 lines of CMSSW stdout: #16 0x00002af28c0f88e6 in XrdAdaptor::RequestManager::handle(std::shared_ptr<XrdAdaptor::ClientRequest>) () from /cvmfs/cms.cern.ch/slc6_amd64_gcc530/cms/cmssw/CMSSW_8_0_21/lib/slc6_amd64_gcc530/libUtilitiesXrdAdaptor.so #17 0x00002af28c0d0c8f in XrdFile::read(void*, unsigned long) () from /cvmfs/cms.cern.ch/slc6_amd64_gcc530/cms/cmssw/CMSSW_8_0_21/lib/slc6_amd64_gcc530/libUtilitiesXrdAdaptor.so #18 0x00002af27f42db46 in StorageAccountProxy::read(void*, unsigned long) () from /cvmfs/cms.cern.ch/slc6_amd64_gcc530/cms/cmssw/CMSSW_8_0_21/lib/slc6_amd64_gcc530/libUtilitiesStorageFactory.so #19 0x00002af27f44ddb5 in IOInput::x

In [339]:
def map_to_frame_format(row):
    keys = row[0]
    rec = row[1]
    
    variables = zip(*rec)
    
    return [keys + var for var in variables]

In [378]:
r2 = r.flatMap(lambda x: map_to_frame_format(x))

In [355]:
r2.take(1)

[]

In [379]:
df = r2.toDF(["task_name", "error", "site", "exit_codes", "error_msg", "error_type", "steps_counter", "names","peakvaluerss", "peakvaluevsize" , "writeTotalMB", "readPercentageOps", "readAveragekB", "readTotalMB",\
"readNumOps", "readCachePercentageOps", "readMBSec", "writeTotalSecs", "readTotalSecs", "readMaxMSec", 
"TotalJobCPU", "NumberOfStreams", "TotalInitCPU", "TotalEventCPU", "AvgEventCPU", "EventThroughput", 
"TotalInitTime", "AvgEventTime", "NumberOfThreads", "MinEventCPU", "MaxEventTime", "TotalJobTime",
"TotalLoopCPU", "MinEventTime", "MaxEventCPU" ])

In [380]:
df.show()

+--------------------+-----+---------------+----------+--------------------+--------------------+-------------+---------+------------+--------------+------------+------------------+-----------------+-----------+----------+----------------------+------------------+--------------+-------------+-----------+-----------+---------------+------------+-------------+-----------+---------------+-------------+------------+---------------+-----------+------------+------------+------------+------------+-----------+
|           task_name|error|           site|exit_codes|           error_msg|          error_type|steps_counter|    names|peakvaluerss|peakvaluevsize|writeTotalMB| readPercentageOps|    readAveragekB|readTotalMB|readNumOps|readCachePercentageOps|         readMBSec|writeTotalSecs|readTotalSecs|readMaxMSec|TotalJobCPU|NumberOfStreams|TotalInitCPU|TotalEventCPU|AvgEventCPU|EventThroughput|TotalInitTime|AvgEventTime|NumberOfThreads|MinEventCPU|MaxEventTime|TotalJobTime|TotalLoopCPU|MinEventTi

In [381]:
df_res = df.join(labeled_failing_tasks, ['task_name', 'error', 'site'])

In [293]:
df_res.show()

+--------------------+-----+------------+----------+--------------------+--------------------+-------------+---------+------------+-------+
|           task_name|error|        site|exit_codes|           error_msg|          error_type|steps_counter|    names|peakvaluerss|    _c0|
+--------------------+-----+------------+----------+--------------------+--------------------+-------------+---------+------------+-------+
|/pdmvserv_task_B2...|   92|T2_ES_CIEMAT|     99996|Failed to find a ...|ReportManipulatin...|            0|stageOut1|        -1.0| 136664|
|/pdmvserv_task_B2...|   92|T2_ES_CIEMAT|        92|  Adding last 25 ...|    CMSSWStepFailure|            2|  cmsRun1|        -1.0| 136664|
|/pdmvserv_task_B2...|   92|T2_ES_CIEMAT|      8028|An exception of c...|     Fatal Exception|            2|  cmsRun1|        -1.0| 136664|
|/pdmvserv_task_B2...|   92|T2_ES_CIEMAT|     99999|Adding extra erro...|ErrorLoggingAddition|            2|  cmsRun1|        -1.0| 136664|
|/pdmvserv_task_HI..

In [382]:
df_res.write.format('com.databricks.spark.csv').save('hdfs:///cms/users/llayer/single_writeout_010117_101117.csv',header = 'true')

In [53]:
! hdfs dfs -put actionshistory_300719.csv hdfs://analytix/user/llayer

19/08/04 13:25:49 WARN ipc.Client: Exception encountered while connecting to the server : org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error


In [54]:
from pyspark import SQLContext, StorageLevel
sql = SQLContext(sc)
labeled_failing_tasks = (sql.read
     .format("com.databricks.spark.csv")
     .option("header", "true")
     .load("actionshistory_300719.csv"))

In [56]:
from pyspark.sql.types import IntegerType
labeled_failing_tasks = labeled_failing_tasks.withColumn("error", labeled_failing_tasks["error"].cast(IntegerType()))

In [57]:
labeled_failing_tasks.show()

+---+--------------------+-----+---------------+
|_c0|           task_name|error|           site|
+---+--------------------+-----+---------------+
|  3|/amaltaro_Run2018...|   85|      T1_UK_RAL|
|  4|/amaltaro_Run2018...|50664|     T2_DE_RWTH|
|  5|/amaltaro_Run2018...|50664|     T2_DE_RWTH|
|  8|/amaltaro_Run2018...|99400| NoReportedSite|
|  9|/amaltaro_Run2018...|50664|     T2_DE_RWTH|
| 11|/amaltaro_Run2018...|   92|      T2_US_MIT|
| 15|/amaltaro_Run2018...|   85|   T2_PL_Swierk|
| 18|/amaltaro_Run2018...|50664| T2_CH_CERN_HLT|
| 22|/amaltaro_Run2018...|   85|      T2_US_MIT|
| 24|/amaltaro_Run2018...|   85|T1_US_FNAL_Disk|
| 25|/amaltaro_Run2018...|   85|T1_US_FNAL_Disk|
| 26|/amaltaro_Run2018...|   85|T1_US_FNAL_Disk|
| 27|/amaltaro_Run2018...|   85|T1_US_FNAL_Disk|
| 31|/amaltaro_Run2018...|   85|T1_US_FNAL_Disk|
| 34|/amaltaro_Run2018...|   85|T1_US_FNAL_Disk|
| 35|/amaltaro_task_BT...|50664|     T1_US_FNAL|
| 36|/amaltaro_task_BT...|50660|     T1_RU_JINR|
| 37|/areinsvo_task_

### Debugging...

In [12]:
def filter_for_problem(row):
    #task_name = row[0]
    rec = row[0]
    task_name = rec["task"]
    #task_name = row[0]
    #error = row[2]
    #test_task = '/pdmvserv_task_HIG-RunIIFall17wmLHEGS-03477__v1_T_190208_210318_4829/HIG-RunIIFall17wmLHEGS-03477_0'
    #test_error = 99109
    #test_task = '/pdmvserv_task_B2G-RunIISummer15wmLHEGS-01375__v1_T_180703_203928_6625/B2G-RunIISummer15wmLHEGS-01375_0'
    #test_task = '/pdmvserv_task_HIG-RunIIFall17wmLHEGS-00630__v1_T_180216_120647_5907/HIG-RunIIFall17wmLHEGS-00630_0/HIG-RunIIFall17DRPremix-00737_0'
    #test_task = '/vlimant_ACDC0_task_HIG-RunIIFall17wmLHEGS-01415__v1_T_180706_002124_986/HIG-RunIIFall17DRPremix-02001_1/HIG-RunIIFall17DRPremix-02001_1MergeAODSIMoutput/HIG-RunIIFall17MiniAODv2-01299_0'
    #test_error = 85
    #test_task = '/pdmvserv_task_HIG-RunIIFall17wmLHEGS-00689__v1_T_180307_201555_2433/HIG-RunIIFall17wmLHEGS-00689_0/HIG-RunIIFall17DRPremix-00886_0'
    test_task = '/pdmvserv_task_HIG-RunIIFall17wmLHEGS-02145__v1_T_180705_162228_8813/HIG-RunIIFall17wmLHEGS-02145_0/HIG-RunIIFall17DRPremix-02708_0'
    if (task_name == test_task): # and (error == test_error):
        return True
    else:
        return False

In [13]:
failures = avro_rdd.filter(lambda x : getFailing(x)).filter(lambda x : filter_for_problem(x))
#failures = avro_rdd.filter(lambda x : getFailing(x)).flatMap(lambda x : avro_rdd_KV(x)).filter(lambda x : filter_for_problem(x))

In [111]:
failures.saveAsTextFile("hdfs:///cms/users/llayer/debug5.csv")

In [71]:
examples = failures.collect()

In [72]:
print len(examples)

40


In [74]:
import pickle
with open('test_task.pkl', 'wb') as f:
    pickle.dump(examples, f)

In [21]:
print list(example[0][0])

[u'PFNArrayRef', u'task', u'PrepID', u'skippedFiles', u'Campaign', u'wmaid', u'dtype', u'wmats', u'fallbackFiles', u'LFNArray', u'meta_data', u'steps', u'PFNArray', u'LFNArrayRef', u'stype']


In [50]:
print list(example[0][0][u'steps'][2])

[u'status', u'errors', u'name', u'stop', u'site', u'start', u'performance', u'output', u'input']


In [61]:
print list(example[0][0][u'steps'][2]['errors'][0])

[u'type', u'details', u'exitCode']


In [70]:
for i in range(len(example[0][0][u'steps'][2]['errors'])):
    print example[0][0][u'steps'][2]['errors'][i]['exitCode'], example[0][0][u'steps'][2]['errors'][i]['type']

85 CMSSWStepFailure
8021 Fatal Exception
8021 Fatal Exception
8021 Fatal Exception
8021 Fatal Exception
8021 Fatal Exception
8021 Fatal Exception
99999 ErrorLoggingAddition
85 WMAgentStepExecutionError


In [43]:
probs = failures.collect()

In [44]:
print len(probs)

12


In [45]:
for prob in probs:
    print prob[1], prob[2]

T1_UK_RAL 85
T1_UK_RAL 85
T1_UK_RAL 85
T1_UK_RAL 85
T1_UK_RAL 85
T1_UK_RAL 85
T1_FR_CCIN2P3 85
T1_FR_CCIN2P3 85
T1_UK_RAL 85
T1_UK_RAL 85
T1_UK_RAL 85
T1_UK_RAL 85


### Reduce

In [71]:
def map_to_key(row):
    return (row[0], row[1], row[2]), row[3]

In [45]:
print failing_workflows.map(lambda x : map_to_key(x)).take(1)

[((u'/pdmvserv_task_SMP-RunIISummer16DR80Premix-00011__v1_T_161227_083845_4347/SMP-RunIISummer16DR80Premix-00011_0', u'T2_UK_London_IC', 99999), u'Could not find report file for step stageOut1!')]


In [72]:
def reduceToLongest(row):
    key = row[0]
    message_list = list(row[1])
    longest_msg = ''
    for msg in message_list:
        if len(msg) > len(longest_msg):
            longest_msg = msg
    return key[0], key[1], key[2], longest_msg

failing_workflows_reduce = failing_workflows.map(lambda x : map_to_key(x)).groupByKey().map(lambda x : reduceToLongest(x))

In [73]:
failing_workflows_df = failing_workflows_reduce.toDF(["task_name", "site", "error", "error_msg"])

In [74]:
failing_workflows_df.show()

+--------------------+---------------+-----+--------------------+
|           task_name|           site|error|           error_msg|
+--------------------+---------------+-----+--------------------+
|/pdmvserv_task_BP...|     T2_DE_DESY|   80|  Adding last 25 ...|
|/fabozzi_Run2016H...|   T2_US_Purdue|99109|Error in StageOut...|
|/pdmvserv_task_B2...|           null|99303|Could not find jo...|
|/pdmvserv_task_HI...|T2_FR_GRIF_IRFU|  134|  Adding last 25 ...|
|/fabozzi_Run2016B...|     T1_RU_JINR|99999|Adding extra erro...|
|/pdmvserv_task_SU...|      T1_UK_RAL|  139|  Adding last 25 ...|
|/pdmvserv_task_EX...|T2_US_Wisconsin| 8001|Exit 8001: CMSExe...|
|/prebello_Run2016...|   T2_US_Purdue| 8022|An exception of c...|
|/prozober_ACDC0_t...|     T2_CH_CERN|99999|Adding extra erro...|
|/pdmvserv_task_HI...|     T2_DE_RWTH| 8028|An exception of c...|
|/pdmvserv_task_EG...|     T2_US_UCSD|   85|  Adding last 25 ...|
|/pdmvserv_task_B2...|      T1_UK_RAL| 8001|Exit 8001: CMSExe...|
|/pdmvserv

In [63]:
failing_workflows_df.rdd.getNumPartitions()

4159

In [66]:
failing_workflows_df_rep = failing_workflows_df.repartition(500)
print failing_workflows_df_rep.rdd.getNumPartitions()

500


## 3. Load actionhist and convert to df

In [75]:
from pyspark import SQLContext, StorageLevel
sql = SQLContext(sc)
labeled_failing_tasks = (sql.read
     .format("com.databricks.spark.csv")
     .option("header", "true")
     .load("hdfs:///cms/users/llayer/actionhist.csv"))

#rdd_failing_tasks = rdd_failing_tasks.rdd.map(tuple)
#print rdd_failing_tasks.take(1)

In [76]:
from pyspark.sql.types import IntegerType
labeled_failing_tasks = labeled_failing_tasks.withColumn("error", labeled_failing_tasks["error"].cast(IntegerType()))
print labeled_failing_tasks.head()

Row(task_name=u'/vlimant_ACDC0_task_HIG-RunIIFall17wmLHEGS-01415__v1_T_180706_002124_986/HIG-RunIIFall17DRPremix-02001_1/HIG-RunIIFall17DRPremix-02001_1MergeAODSIMoutput/HIG-RunIIFall17MiniAODv2-01299_0', side_state=u'good_site', error=85, site=u'T1_UK_RAL', action=u'acdc', memory=None)


## 4. Join both frames and save to HDFS

In [77]:
df = failing_workflows_df.join(labeled_failing_tasks, ['task_name','site', 'error'])

In [78]:
df = df.dropDuplicates()

In [79]:
df.write.format('com.databricks.spark.csv').save('hdfs:///cms/users/llayer/df_reduced_codes2.csv',header = 'true')