Question #1
Use data to see if there are any clusters of job failures on specific nodes. Determine whether any of the production partition nodes show an unusual number of failed jobs relative to the others. Ignore the debug partition. 

Action: 
1) clean up df columns to only reflect relevant columns 
2) Determine any pattern of job failures for specific nodes
3) Analyze Exit codes 0:0, discard the user errors for this item? 
4) Review number of jobs that have 1 node to first see if there is a pattern
5) NTH: Determine whether pattern of failure rate as job ids increase
6) NTH: Determine whether pattern of failure rate for jobs with >1 node (e.g., communication failure between certain nodes (any correlations between nodes, clusters, hardware, etc)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
%matplotlib inline
from io import StringIO
import re

In [3]:
for_pd = StringIO()
with open('../data/accre-jobs-2020.csv') as accre:
    for line in accre:
        new_line = re.sub(r',', '|', line.rstrip(), count=12)
        print (new_line, file=for_pd)

for_pd.seek(0)

accre = pd.read_csv(for_pd, sep='|')
print (accre)

               JOBID        ACCOUNT       USER    REQMEM    USEDMEM  \
0           15925210        treviso   arabella  122880Mn  65973.49M   
1           15861126        treviso   arabella  122880Mn  67181.12M   
2           15861125        treviso   arabella  122880Mn  69111.86M   
3           16251645        treviso   arabella  122880Mn  65317.33M   
4           16251646        treviso   arabella  122880Mn  65876.11M   
...              ...            ...        ...       ...        ...   
3816285   24173815_8     portabella     vennie   32768Mn  30042.68M   
3816286   24173815_9     portabella     vennie   32768Mn  31067.75M   
3816287  24173815_10     portabella     vennie   32768Mn  21207.96M   
3816288     24173817  boysenberries  hortensia  100000Mn          0   
3816289     24173866    horseradish     ariane   20480Mn          0   

             REQTIME     USEDTIME  NODES  CPUS   PARTITION EXITCODE  \
0        13-18:00:00  13-18:00:28      1    24  production      0:0   
1    

In [4]:
accre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3816290 entries, 0 to 3816289
Data columns (total 13 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   JOBID      object
 1   ACCOUNT    object
 2   USER       object
 3   REQMEM     object
 4   USEDMEM    object
 5   REQTIME    object
 6   USEDTIME   object
 7   NODES      int64 
 8   CPUS       int64 
 9   PARTITION  object
 10  EXITCODE   object
 11  STATE      object
 12  NODELIST   object
dtypes: int64(2), object(11)
memory usage: 378.5+ MB


In [5]:
accre.EXITCODE.describe()

count     3816290
unique         30
top           0:0
freq      3813403
Name: EXITCODE, dtype: object

In [6]:
accre.EXITCODE.unique()

array(['0:0', '0:6', '0:15', '1:0', '126:0', '0:9', '2:0', '0:125',
       '13:0', '127:0', '8:0', '6:0', '0:39', '0:105', '7:0', '116:0',
       '0:4', '0:7', '0:12', '101:0', '24:0', '38:0', '99:0', '11:0',
       '0:36', '0:11', '64:0', '0:2', '16:0', '59:0'], dtype=object)

In [7]:
accre.STATE.unique()

array(['COMPLETED', 'CANCELLED', 'FAILED', 'OUT_OF_MEMORY',
       'CANCELLED by 782611', 'CANCELLED by 791651', 'CANCELLED by 9206',
       'CANCELLED by 785271', 'CANCELLED by 666860',
       'CANCELLED by 686562', 'CANCELLED by 397600',
       'CANCELLED by 199066', 'CANCELLED by 503585',
       'CANCELLED by 505355', 'TIMEOUT', 'CANCELLED by 486541',
       'CANCELLED by 763605', 'CANCELLED by 124006', 'CANCELLED by 90423',
       'CANCELLED by 200557', 'CANCELLED by 649319',
       'CANCELLED by 483348', 'CANCELLED by 855431',
       'CANCELLED by 199766', 'CANCELLED by 337422', 'CANCELLED by 0',
       'CANCELLED by 546080', 'CANCELLED by 693461',
       'CANCELLED by 782535', 'CANCELLED by 515423',
       'CANCELLED by 454080', 'CANCELLED by 9201', 'CANCELLED by 649321',
       'CANCELLED by 9202', 'CANCELLED by 895426', 'CANCELLED by 651701',
       'CANCELLED by 781109', 'RUNNING'], dtype=object)

In [8]:
accre.STATE.describe()

count       3816290
unique           38
top       COMPLETED
freq        3804644
Name: STATE, dtype: object

In [9]:
accre.groupby('STATE').size()

STATE
CANCELLED                 6107
CANCELLED by 0               5
CANCELLED by 124006          2
CANCELLED by 199066          2
CANCELLED by 199766          4
CANCELLED by 200557          6
CANCELLED by 337422          1
CANCELLED by 397600          7
CANCELLED by 454080          4
CANCELLED by 483348          3
CANCELLED by 486541          4
CANCELLED by 503585          1
CANCELLED by 505355         63
CANCELLED by 515423          2
CANCELLED by 546080          2
CANCELLED by 649319          3
CANCELLED by 649321          1
CANCELLED by 651701          1
CANCELLED by 666860          2
CANCELLED by 686562        681
CANCELLED by 693461          2
CANCELLED by 763605          1
CANCELLED by 781109          1
CANCELLED by 782535          1
CANCELLED by 782611         13
CANCELLED by 785271          1
CANCELLED by 791651          2
CANCELLED by 855431         17
CANCELLED by 895426          2
CANCELLED by 90423           2
CANCELLED by 9201            3
CANCELLED by 9202            3
CA

In [10]:
# review failed nodes, first identify failed jobs
failed_jobs = accre[accre['STATE'] == 'FAILED']
failed_jobs

Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST
8629,17094238,cms,cmspilot,186648Mn,1.78M,2-00:00:00,00:01:11,1,0,nogpfs,1:0,FAILED,cn1554
8632,17094335,cms,cmspilot,186648Mn,8.38M,2-00:00:00,00:01:26,1,0,nogpfs,1:0,FAILED,cn1544
22234,17072678,cms,cmspilot,21875Mn,11310.72M,2-00:00:00,00:02:35,1,8,nogpfs,126:0,FAILED,ng909
56716,17150413,blueberries,leva,4096Mn,1968.02M,10:00:00,02:15:54,1,1,pascal,1:0,FAILED,gpu0021
68661,17178412,cubanelle,robyn,1024Mc,73.32M,00:15:00,00:14:17,1,1,pascal,2:0,FAILED,gpu0021
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813639,24168919,pinole,daryl,376832Mn,11.52M,00:30:00,00:01:28,1,1,turing,1:0,FAILED,gpu0048
3814389,24170236,alkmene,abigail,8192Mn,5.81M,02:00:00,00:08:07,2,8,production,1:0,FAILED,"cn[1123,1128]"
3814493,24170522,alkmene,abigail,8192Mn,5.81M,02:00:00,00:08:42,2,8,production,1:0,FAILED,"cn[1207,1455]"
3814510,24170592,pinole,daryl,380280Mn,6.10M,00:30:00,00:02:25,1,1,turing,126:0,FAILED,gpu0045


In [11]:
failed_jobs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1406 entries, 8629 to 3816160
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   JOBID      1406 non-null   object
 1   ACCOUNT    1406 non-null   object
 2   USER       1406 non-null   object
 3   REQMEM     1406 non-null   object
 4   USEDMEM    1406 non-null   object
 5   REQTIME    1406 non-null   object
 6   USEDTIME   1406 non-null   object
 7   NODES      1406 non-null   int64 
 8   CPUS       1406 non-null   int64 
 9   PARTITION  1406 non-null   object
 10  EXITCODE   1406 non-null   object
 11  STATE      1406 non-null   object
 12  NODELIST   1406 non-null   object
dtypes: int64(2), object(11)
memory usage: 153.8+ KB


In [12]:
# Try to get a list of failed_jobs (e.g., without specific partition(s))
failed_jobs_list = failed_jobs['NODELIST'].tolist()
failed_jobs_list

['cn1554',
 'cn1544',
 'ng909',
 'gpu0021',
 'gpu0021',
 'gpu0020',
 'gpu0019',
 'gpu0024',
 'gpu0025',
 'gpu0024',
 'gpu0024',
 'cn[336-338,340,347-348,374-380,386-396,401,403-405,407-411,487-500,911-913,1081]',
 'cn394',
 'gpu0026',
 'gpu0017',
 'gpu0028',
 'gpu0001',
 'gpu0015',
 'gpu0015',
 'gpu0015',
 'gpu0030',
 'gpu0030',
 'gpu0030',
 'gpu0030',
 'cn[326,499]',
 'gpu0030',
 'gpu0030',
 'gpu0002',
 'cn371',
 'gpu0030',
 'gpu0030',
 'gpu0030',
 'gpu0030',
 'cn1581',
 'cn1563',
 'cn1582',
 'cn1539',
 'cn[331-338,340,347-349,361-367,386-393,395-396,398-401,403-405,407-414,441-448,464-473,1273-1279,1340-1345]',
 'gpu0026',
 'gpu0023',
 'gpu0023',
 'cn[1263,1275,1303,1309,1335-1336,1351,1372,1376,1424]',
 'cn[1263,1275,1287,1303,1309,1335-1336,1351,1372,1376]',
 'cn[1263,1275,1287,1303,1309,1335-1336,1351,1372-1373]',
 'cn[1263,1275,1287,1303,1309,1335-1336,1348,1372-1373]',
 'gpu0023',
 'cn[1274-1277,1303-1321,1345-1354,1420-1427]',
 'gpu0023',
 'gpu0035',
 'gpu0035',
 'cn1101',
 'cn

In [13]:
failed_jobs.groupby('PARTITION')['PARTITION'].count().sort_values()

PARTITION
turing         20
debug          27
nogpfs         55
maxwell       102
production    500
pascal        702
Name: PARTITION, dtype: int64

In [14]:
accre.PARTITION.describe()

count        3816290
unique            15
top       production
freq         3311788
Name: PARTITION, dtype: object

In [15]:
accre.PARTITION.unique()

array(['production', 'cgw-dougherty1', 'cgw-capra1', 'cgw-cqs1', 'pascal',
       'turing', 'cgw-tbi01', 'nogpfs', 'maxwell', 'sam', 'debug',
       'devel', 'cgw-cqs3', 'cgw-rocksteady', 'cgw-vm-qa-flatearth1'],
      dtype=object)

I set up a df called "failed" which took the entire accre df and filtered to State = Failed, and Partition = Production.  This is due to the slides 37 (ignore debug, pascal, turing, maxwell and nogpfs only relevant to cms), and by filtering found that only production partition has failed jobs.    

In [16]:
failed = accre[(accre["STATE"] == 'FAILED') & (accre['PARTITION'] == 'production')]
failed.head()  



Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST
104031,17234446,crowns,tena,5120Mc,8737.88M,1-12:00:00,1-12:00:12,51,1,production,1:0,FAILED,"cn[336-338,340,347-348,374-380,386-396,401,403..."
124397,17261993,burro,golda,1024Mc,0,00:30:00,00:00:01,1,1,production,13:0,FAILED,cn394
167790,17329248,pearl,auther,1024Mc,0,00:30:00,00:00:00,2,8,production,2:0,FAILED,"cn[326,499]"
199503,17354412,crowns,tena,5120Mc,4277.54M,1-06:00:00,01:49:02,75,1,production,1:0,FAILED,"cn[331-338,340,347-349,361-367,386-393,395-396..."
226420,17390284,bulk,lela,20480Mn,4028.41M,20:00:00,00:01:01,10,1,production,1:0,FAILED,"cn[1263,1275,1303,1309,1335-1336,1351,1372,137..."


In [17]:
failed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 104031 to 3814493
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   JOBID      500 non-null    object
 1   ACCOUNT    500 non-null    object
 2   USER       500 non-null    object
 3   REQMEM     500 non-null    object
 4   USEDMEM    500 non-null    object
 5   REQTIME    500 non-null    object
 6   USEDTIME   500 non-null    object
 7   NODES      500 non-null    int64 
 8   CPUS       500 non-null    int64 
 9   PARTITION  500 non-null    object
 10  EXITCODE   500 non-null    object
 11  STATE      500 non-null    object
 12  NODELIST   500 non-null    object
dtypes: int64(2), object(11)
memory usage: 54.7+ KB


In [18]:
# failed = 'failed' for STATE and 'production' for 'PARTITION'
failed.NODELIST.unique()

array(['cn[336-338,340,347-348,374-380,386-396,401,403-405,407-411,487-500,911-913,1081]',
       'cn394', 'cn[326,499]',
       'cn[331-338,340,347-349,361-367,386-393,395-396,398-401,403-405,407-414,441-448,464-473,1273-1279,1340-1345]',
       'cn[1263,1275,1303,1309,1335-1336,1351,1372,1376,1424]',
       'cn[1263,1275,1287,1303,1309,1335-1336,1351,1372,1376]',
       'cn[1263,1275,1287,1303,1309,1335-1336,1351,1372-1373]',
       'cn[1263,1275,1287,1303,1309,1335-1336,1348,1372-1373]',
       'cn[1274-1277,1303-1321,1345-1354,1420-1427]', 'cn[1442,1444]',
       'cn[1301,1382]', 'cn469', 'cn1446', 'cn1431', 'cn1212',
       'cn[1356,1379]', 'cn1273', 'cn[1332-1334]',
       'cn[1458,1464,1531-1533]', 'cn[1431-1432,1440-1442]',
       'cn[1437,1530-1531]', 'cn1471', 'cn1301', 'cn[1421-1422]',
       'cn[1270,1310]', 'cn[1291,1394]', 'cn1090',
       'cn[305,308-309,311-314,318-320,322,324,326-329,332,334-338,340,347,349-356,360,365,369,372-374,380,385-390,392-393,395,405,407,409-41

In [19]:
failed.NODELIST.nunique()

339

In [20]:
failed.NODELIST.value_counts()

cn1273                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  9
cn304                                                                                                                                                                                                                                                                                                                                                                                                                                                         

!! Why are there exactly 500 unique failed nodelists, but 339 failed nodelist value_counts?   

In [21]:
# for gut-check, find most common values for failed nodelist.  
def find_most_common_values(df,column):
    return df[column].value_counts(ascending=False).iloc[0:30]

In [22]:
# not useful because it groups nodes 
find_most_common_values(failed, 'NODELIST')

cn1273           9
cn304            6
cn1458           6
cn1364           6
cn1450           5
cn1479           5
cn1270           5
cn[1530-1531]    5
cn1267           4
cn1449           4
cn[1468-1469]    4
cn[1284-1286]    4
cn1275           4
cn1436           4
cn1278           4
cn1448           4
cn1430           3
cn[1297-1298]    3
cn[1420-1421]    3
cn[1422-1423]    3
cn[1343-1344]    3
cn1261           3
cn[1421-1422]    3
cn1445           3
cn1387           3
cn[356-357]      3
cn1469           3
cn416            3
cn[411-412]      3
cn1439           3
Name: NODELIST, dtype: int64

This is the code to separate out the nodelist. 

In [23]:
#Separate out node list
#failed = accre df filtered to state = failed, partition = production 
nodelist = failed['NODELIST'].tolist()
nodelist_nums = [i.strip('cn').strip('[').strip(']') for i in nodelist if ',' not in i]
nodelist_commas = [i.strip('cn').strip('[').strip(']').split(',') for i in nodelist if ',' in i]

In [24]:
nodelist_commas = [item for sublist in nodelist_commas for item in sublist]

In [25]:
new_list = nodelist_commas + nodelist_nums

In [26]:
node_counts = {}
for n in new_list:
    if '-' in n:
        nodes = n.split('-')
        low, high = int(nodes[0]), int(nodes[1])
        for r in range (low, high + 1):
            if r not in node_counts:
                node_counts[r] = 1
            else: 
                node_counts[r] += 1
    else:
        r = int(n)
        if r not in node_counts:
            node_counts[r] = 1
        else: 
            node_counts[r] += 1
            

In [27]:
node_counts


{336: 16,
 337: 14,
 338: 17,
 340: 22,
 347: 24,
 348: 14,
 374: 25,
 375: 14,
 376: 16,
 377: 15,
 378: 12,
 379: 14,
 380: 13,
 386: 22,
 387: 19,
 388: 19,
 389: 17,
 390: 20,
 391: 19,
 392: 21,
 393: 18,
 394: 8,
 395: 18,
 396: 13,
 401: 19,
 403: 20,
 404: 19,
 405: 19,
 407: 17,
 408: 16,
 409: 4,
 410: 23,
 411: 23,
 487: 18,
 488: 13,
 489: 9,
 490: 13,
 491: 20,
 492: 23,
 493: 18,
 494: 21,
 495: 20,
 496: 25,
 497: 21,
 498: 8,
 499: 17,
 500: 13,
 911: 6,
 912: 8,
 913: 5,
 1081: 4,
 326: 17,
 331: 19,
 332: 22,
 333: 12,
 334: 21,
 335: 18,
 349: 13,
 361: 14,
 362: 13,
 363: 9,
 364: 9,
 365: 16,
 366: 6,
 367: 12,
 398: 18,
 399: 19,
 400: 21,
 412: 14,
 413: 16,
 414: 10,
 441: 17,
 442: 15,
 443: 17,
 444: 20,
 445: 13,
 446: 16,
 447: 9,
 448: 10,
 464: 7,
 465: 14,
 466: 11,
 467: 20,
 468: 20,
 469: 10,
 470: 15,
 471: 20,
 472: 19,
 473: 13,
 1273: 15,
 1274: 10,
 1275: 18,
 1276: 16,
 1277: 13,
 1278: 15,
 1279: 9,
 1340: 11,
 1341: 13,
 1342: 7,
 1343: 13,
 13

In [28]:
node_counts_sep = pd.DataFrame.from_dict(node_counts, orient = 'index')
node_counts_sep.head()

Unnamed: 0,0
336,16
337,14
338,17
340,22
347,24


In [30]:
# unable to change column name from '0' to 'count', so converted to csv above and will bring back in

In [31]:
# convert node_counts_sep to csv to create visualization called 'failed_notes' 
node_counts_sep = node_counts_sep.reset_index()
node_counts_sep.to_csv('../data/failed_notes.csv', index = False)
# changed column names in data from 'nodes', '0' to 'node_name', 'count'

In [32]:
failed_visual = pd.read_csv('../data/failed_notes.csv')
failed_visual.head(15)

Unnamed: 0,index,0
0,336,16
1,337,14
2,338,17
3,340,22
4,347,24
5,348,14
6,374,25
7,375,14
8,376,16
9,377,15


In [33]:
failed_visual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   index   507 non-null    int64
 1   0       507 non-null    int64
dtypes: int64(2)
memory usage: 8.0 KB


507 unique nodes.  I converted the file to an Excel and sorted there.  Will attempt to sort the node number by their values here.

In [34]:
# Review exit codes within failed df ('failed' being failed jobs filtered to production partition)
failed_exit = failed.groupby('EXITCODE')['EXITCODE'].count().sort_values(ascending=False)
failed_exit.to_frame()

Unnamed: 0_level_0,EXITCODE
EXITCODE,Unnamed: 1_level_1
1:0,327
2:0,60
127:0,39
7:0,31
6:0,11
13:0,10
24:0,7
11:0,6
116:0,4
99:0,2


!! Note that 100% of failed jobs (jobs filtered to 'failed' state and 'production' partition) have exit codes indicating user error.  See if I can find the Accounts associated with exitcodes '1:0' (65%), '2:0' (77%), '127:0' (85%), '7:0' (91%)

In [35]:
failed_exit_accts = failed[failed.EXITCODE.isin(['1:0', '2:0', '127:0', '7:0'])].groupby('ACCOUNT')['ACCOUNT'].count().sort_values(ascending = False)
failed_exit_accts


ACCOUNT
cep            117
plantain        87
tips            65
bertanne        27
crowns          20
carrot          19
crab            13
seeded          11
wood            10
mignonette      10
angelys          9
poblano          9
peanuts          9
cms              8
pasilla          7
alkmene          6
lemon            4
concord          4
bulk             4
horseradish      4
forelle          3
anise            2
grisette         1
mixed            1
cms_lowprio      1
orange           1
clara            1
pearl            1
retailer         1
sharlin          1
sunburst         1
Name: ACCOUNT, dtype: int64

Now look at failed jobs for turing, pascal, maxwell partitions, assign to 'failed_other' 

In [36]:
failed_other = failed_jobs[failed_jobs['PARTITION'].isin(['turing', 'maxwell', 'pascal'])]
failed_other

Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST
56716,17150413,blueberries,leva,4096Mn,1968.02M,10:00:00,02:15:54,1,1,pascal,1:0,FAILED,gpu0021
68661,17178412,cubanelle,robyn,1024Mc,73.32M,00:15:00,00:14:17,1,1,pascal,2:0,FAILED,gpu0021
79633,17198446,turban,codie,24576Mn,666.61M,5-00:00:00,00:01:52,1,2,pascal,1:0,FAILED,gpu0020
87763,17202272,blueberries,leva,4096Mn,1946.37M,10:00:00,01:13:20,1,1,pascal,1:0,FAILED,gpu0019
96507,17217459,turban,codie,24576Mn,681.91M,5-00:00:00,00:02:11,1,2,pascal,1:0,FAILED,gpu0024
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3782748,24126473,turban,rollin,24576Mn,620.41M,5-00:00:00,04:16:32,1,2,pascal,1:0,FAILED,gpu0014
3785177,24133659,turban,rollin,24576Mn,604.89M,5-00:00:00,06:18:28,1,2,pascal,1:0,FAILED,gpu0016
3785525,24134092,turban,hardie,122880Mn,5251.23M,20:00:00,20:00:38,1,1,pascal,1:0,FAILED,gpu0019
3813639,24168919,pinole,daryl,376832Mn,11.52M,00:30:00,00:01:28,1,1,turing,1:0,FAILED,gpu0048


In [37]:
failed_other.NODELIST.unique()

array(['gpu0021', 'gpu0020', 'gpu0019', 'gpu0024', 'gpu0025', 'gpu0026',
       'gpu0017', 'gpu0028', 'gpu0001', 'gpu0015', 'gpu0030', 'gpu0002',
       'gpu0023', 'gpu0035', 'gpu0013', 'gpu0014', 'gpu[0028-0030]',
       'gpu[0028-0029]', 'gpu[0032-0034]', 'gpu[0025-0027]',
       'gpu[0027-0029]', 'gpu[0031-0033]', 'gpu0034', 'gpu0031',
       'gpu0027', 'gpu0029', 'gpu[0043-0044]', 'gpu[0030-0032]',
       'gpu[0022,0033-0034]', 'gpu[0005,0011-0012]', 'gpu0032', 'gpu0007',
       'gpu0005', 'gpu0016', 'gpu0018', 'gpu[0003-0005]', 'gpu0003',
       'gpu0008', 'gpu0033', 'gpu[0018-0019]', 'gpu0010',
       'gpu[0024-0026]', 'gpu[0026-0028]', 'gpu0011', 'gpu[0010-0012]',
       'gpu[0008,0010,0012]', 'gpu[0005,0008,0010-0012]',
       'gpu[0006,0008,0010-0012]', 'gpu0012',
       'gpu[0003,0005,0007-0008,0012]', 'gpu[0003,0005,0012]', 'gpu0004',
       'gpu0047', 'gpu0050', 'gpu[0002-0004,0006-0007]',
       'gpu[0002-0004,0011-0012]', 'gpu0006', 'gpu0053', 'gpu0022',
       'gpu[0001-

In [38]:
failed_other.NODELIST.sort_values(ascending = True).tolist()

['gpu0001',
 'gpu0001',
 'gpu0001',
 'gpu0001',
 'gpu0001',
 'gpu0002',
 'gpu0002',
 'gpu0002',
 'gpu0002',
 'gpu0002',
 'gpu0002',
 'gpu0003',
 'gpu0003',
 'gpu0003',
 'gpu0003',
 'gpu0004',
 'gpu0004',
 'gpu0004',
 'gpu0004',
 'gpu0004',
 'gpu0004',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0005',
 'gpu0006',
 'gpu0006',
 'gpu0006',
 'gpu0006',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0007',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0008',
 'gpu0010',
 'gpu0010',
 'gpu0011',
 'gpu0011',
 'gpu0011',
 'gpu0011',
 'gpu0011',
 'gpu0011',
 'gpu0011',
 'gpu0011',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0012',
 'gpu0013',
 'gpu0013',
 'gpu0013',
 'gpu0013',
 'gpu0013',
 'gpu0013',
 'gp

In [39]:
failed_other.NODELIST.nunique()

71

In [40]:
failed_other.NODELIST.describe()

count         824
unique         71
top       gpu0019
freq           96
Name: NODELIST, dtype: object

In [41]:
# group the failed jobs for pascal, turing, maxwell by ExitCode
failed_other_exit = failed_other.groupby('EXITCODE')['EXITCODE'].count().sort_values(ascending=False)
failed_other_exit.to_frame()

Unnamed: 0_level_0,EXITCODE
EXITCODE,Unnamed: 1_level_1
1:0,675
2:0,71
127:0,53
126:0,10
38:0,8
6:0,4
64:0,2
16:0,1


!! All Failed Jobs for partitions Production, Maxwell, Turing, Pascal indicate Exit Codes with digits only on left side, indicating User error.  71% of all accre Failed jobs fall under Exit Code 1:0.  2:0 and 127:0 are #2, #3

In [42]:
# Exit Code 1:0 is most prevalent amongst all Failed jobs, all productions (except debug & nogpfs)

(327 + 675) / 1406

0.7126600284495022

Getting back to all accre failed jobs ('failed' and 'failed_other'), identify the total number that have a '[]' or '-' to indicate # of failed jobs that are clusters.  From there, go through exercise to separate out all nodes as previous, but include all other partitions.  Compare total failed node list against production failed node list.  

In [44]:
nodelist_other = failed_other['NODELIST'].tolist()
nodelist_nums_other = [i.strip('gpu').strip('[').strip(']') for i in nodelist_other if ',' not in i]
nodelist_commas_other = [i.strip('gpu').strip('[').strip(']').split(',') for i in nodelist_other if ',' in i]

In [45]:
nodelist_commas_other = [item for sublist in nodelist_commas_other for item in sublist]

In [46]:
new_list_other = nodelist_commas_other + nodelist_nums_other

In [47]:
node_counts_other = {}
for n in new_list_other:
    if '-' in n:
        nodes = n.split('-')
        low, high = int(nodes[0]), int(nodes[1])
        for r in range (low, high + 1):
            if r not in node_counts:
                node_counts_other[r] = 1
            else: 
                node_counts_other[r] += 1
    else:
        r = int(n)
        if r not in node_counts_other:
            node_counts_other[r] = 1
        else: 
            node_counts_other[r] += 1

In [48]:
node_counts_other

{22: 3,
 33: 12,
 34: 14,
 5: 3,
 11: 3,
 12: 9,
 8: 15,
 10: 2,
 6: 3,
 3: 3,
 7: 11,
 2: 7,
 4: 3,
 1: 6,
 35: 11,
 37: 1,
 38: 1,
 39: 1,
 40: 1,
 41: 1,
 42: 1,
 43: 1,
 44: 1,
 45: 2,
 47: 2,
 48: 2,
 49: 1,
 50: 3,
 51: 1,
 52: 1,
 13: 37,
 14: 39,
 15: 32,
 16: 25,
 17: 24,
 23: 61,
 24: 12,
 25: 22,
 26: 14,
 27: 16,
 28: 12,
 29: 24,
 30: 21,
 31: 13,
 32: 3,
 53: 1,
 54: 1,
 18: 25,
 19: 37,
 20: 37,
 21: 29}

In [50]:
node_counts_other
print(list(node_counts_other.values()))

[3, 12, 14, 3, 3, 9, 15, 2, 3, 3, 11, 7, 3, 6, 11, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 3, 1, 1, 37, 39, 32, 25, 24, 61, 12, 22, 14, 16, 12, 24, 21, 13, 3, 1, 1, 25, 37, 37, 29]


In [51]:
node_counts_other = pd.DataFrame.from_dict(node_counts_other, orient = 'index')
node_counts_other.to_csv('../data/nodes_counts_other.csv', index = False)

In [52]:
node_counts_other = pd.read_csv('../data/nodes_counts_other.csv')
node_counts_other.head()

Unnamed: 0,0
0,3
1,12
2,14
3,3
4,3
