In [1]:
# import statements
import pandas as pd
import numpy as np
from io import StringIO
import re
import random
from datetime import datetime
from datetime import timedelta
import time
%matplotlib inline

In [2]:
# Widen pandas' display limits so large frames print without truncation
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Rewrite the CSV so that the first 12 commas on every line become '|' (the
# column separators); any later commas on the line are left untouched.
# The rewritten text is buffered in memory and then loaded as a dataframe.
for_pd = StringIO()
with open('../data/accre-jobs-2020.csv') as raw_csv:
    for_pd.writelines(
        re.sub(r',', '|', raw_line.rstrip(), count=12) + '\n'
        for raw_line in raw_csv
    )

# Rewind the buffer so pandas reads it from the beginning
for_pd.seek(0)

# To work on a subset, append a slice such as [1000000:1005000] to the read_csv call
accre = pd.read_csv(for_pd, sep='|')
print(accre.head())

      JOBID  ACCOUNT      USER    REQMEM    USEDMEM      REQTIME     USEDTIME  NODES  CPUS   PARTITION EXITCODE      STATE NODELIST
0  15925210  treviso  arabella  122880Mn  65973.49M  13-18:00:00  13-18:00:28      1    24  production      0:0  COMPLETED   cn1531
1  15861126  treviso  arabella  122880Mn  67181.12M  13-18:00:00  12-14:50:56      1    24  production      0:0  COMPLETED   cn1441
2  15861125  treviso  arabella  122880Mn  69111.86M  13-18:00:00  13-18:00:20      1    24  production      0:0  COMPLETED   cn1464
3  16251645  treviso  arabella  122880Mn  65317.33M  13-18:00:00  12-03:50:32      1    24  production      0:0  COMPLETED   cn1473
4  16251646  treviso  arabella  122880Mn  65876.11M  13-18:00:00  13-18:00:03      1    24  production      0:0  COMPLETED   cn1440


In [4]:
# ALTERNATIVE METHOD:
# To fully randomize the data sample, we could do the following:
# import pandas as pd
# import random

# The data to load
# f = "my_data.csv"

# Count the lines
# num_lines = sum(1 for l in open(f))

# Sample size - in this case ~10%
# size = int(num_lines / 10)

# The row indices to skip - make sure 0 is not included to keep the header!
# skip_idx = random.sample(range(1, num_lines), num_lines - size)

# Read the data
# data = pd.read_csv(f, skiprows=skip_idx, ... )

In [5]:
# Summarize the dataframe: index range, column names with their dtypes,
# and the memory footprint
accre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3816290 entries, 0 to 3816289
Data columns (total 13 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   JOBID      object
 1   ACCOUNT    object
 2   USER       object
 3   REQMEM     object
 4   USEDMEM    object
 5   REQTIME    object
 6   USEDTIME   object
 7   NODES      int64 
 8   CPUS       int64 
 9   PARTITION  object
 10  EXITCODE   object
 11  STATE      object
 12  NODELIST   object
dtypes: int64(2), object(11)
memory usage: 378.5+ MB


**Some notes on the data:**  
``JOBID`` is a unique numerical identifier. Members of a job array have an underscore in them but are still processed as individual jobs. Unless analyzing array behavior, each ``JOBID`` may be treated as an independent observation.  
``ACCOUNT`` is an anonymized id for each research group. It is *not* unique to a research group, since research groups may have 1+ accounts. However, it _is_ the highest-level user grouping available within the dataset and it is used to prioritize jobs.  
``USER`` contains anonymized IDs for individual researchers. These researchers are each part of an ``ACCOUNT``.  
``REQMEM`` is megabytes of requested memory. It may be counted per core (Mc) or per node (Mn). To calculate total ``REQMEM``, multiply ``REQMEM`` by either the ``CPUS`` (for Mc) or ``NODES`` (for Mn) field.  
``USEDMEM`` is megabytes of used memory per node. To calculate total ``USEDMEM``, multiply ``USEDMEM`` by ``NODES``.  
``REQTIME`` is in either d-hh:mm:ss or hh:mm:ss format and needs to be converted to total seconds.  
``USEDTIME`` is in either d-hh:mm:ss or hh:mm:ss format and needs to be converted to total seconds.  
``NODES`` is the number of servers (with variable number of CPUs) per job. Multi-node jobs are uncommon and 'for multi-node jobs, memory usage is the maximum over all nodes.'  
**Does this mean the total over all nodes or the highest usage for a single node?**  
``CPUS`` is the number of CPU cores allocated to the job, across all nodes.  
``PARTITION`` is a label for different clusters of nodes.
* 'production' is the biggest partition.
* ignore 'debug' because it is only for short test jobs.
* 'maxwell', 'pascal', and 'turing' partitions are GPU resources and should be analyzed separately.
* 'nogpfs' is a set of older nodes that are only utilized by CMS.  

``EXITCODE`` indicates whether a job has succeeded or failed. Success = 0:0; a non-zero to the left is a user software error and a non-zero on the right is an error with the node or job.  
``STATE`` is a human-readable indication of completion. Successfully completed jobs will say 'COMPLETED', user-stopped jobs should say 'CANCELLED', and a problem running the job (e.g. hitting maximum resources) will read 'FAILED'.  
``NODELIST`` is a comma-separated list of nodes the ``JOBID`` utilized.

In [6]:
# Frequency table for each categorical column of interest
for column_name in ('PARTITION', 'STATE'):
    print(accre[column_name].value_counts())

production              3311788
nogpfs                   327652
sam                       79151
pascal                    48004
turing                    39406
debug                      6738
maxwell                    3348
cgw-capra1                   83
cgw-dougherty1               72
cgw-cqs1                     34
cgw-cqs3                      4
cgw-vm-qa-flatearth1          4
cgw-tbi01                     2
devel                         2
cgw-rocksteady                2
Name: PARTITION, dtype: int64
COMPLETED              3804644
CANCELLED                 6107
RUNNING                   3108
FAILED                    1406
CANCELLED by 686562        681
OUT_OF_MEMORY               94
TIMEOUT                     80
CANCELLED by 505355         63
CANCELLED by 855431         17
CANCELLED by 782611         13
CANCELLED by 9206            9
CANCELLED by 397600          7
CANCELLED by 200557          6
CANCELLED by 0               5
CANCELLED by 486541          4
CANCELLED by 454080      

### Data Cleaning

First, clean up and standardize the ``REQMEM`` and ``USEDMEM`` columns so that both contain values per CPU core.

In [7]:
# Pull a single REQMEM value to try the regular expressions against
test_split = accre.loc[1000000, 'REQMEM']
print(test_split)
# Run each candidate pattern with re.findall(), in order:
#   1. the digits (the amount of requested memory)
#   2. the last two characters (the unit of measure)
#   3. the run of non-digits (an alternative way to get the unit)
for candidate_pattern in (r'\d+', r'..$', r'\D+'):
    print(re.findall(candidate_pattern, test_split))

5000Mn
['5000']
['Mn']
['Mn']


In [8]:
# Split REQMEM (e.g. "122880Mn") into its numeric amount and its unit suffix.
# The patterns are raw strings so that \d and \D reach the regex engine
# without Python flagging them as invalid string escapes, and expand=False
# makes str.extract return a Series (one capture group) rather than a
# one-column DataFrame.

# REQMEM_NO: the leading run of digits, converted to a mathable data type
accre['REQMEM_NO'] = pd.to_numeric(accre['REQMEM'].str.extract(r'(\d+)', expand=False))
# REQMEM_TYPE: the first run of non-digit characters (the unit, e.g. 'Mn' or 'Mc')
accre['REQMEM_TYPE'] = accre['REQMEM'].str.extract(r'(\D+)', expand=False)

In [9]:
# Slice the dataframe by REQMEM_TYPE to get some sample values to test
# print(accre[accre['REQMEM_TYPE'] == 'Mc'].head(50))
# print(accre[accre['REQMEM_TYPE'] == 'Mn'].head(50))

In [10]:
# Create a test variable for each data type for the if-elif statement
# test_mn = accre.loc[1000000, ['REQMEM_NO', 'REQMEM', 'NODES', 'CPUS', 'REQMEM_TYPE']]
# test_mn['CPUS'] = 5
# print(test_mn)
# test_mc = accre.loc[1000006, ['REQMEM_NO', 'REQMEM', 'NODES', 'CPUS', 'REQMEM_TYPE']]
# test_mc['CPUS'] = 5
# print(test_mc)

**THIS LOGIC IS INCORRECT BECAUSE IT ASSUMES THAT THE ``CPUS`` FIELD IS PER NODE. SEE BELOW FOR CORRECTED LOGIC.**  
The logic we need:
* IF there are 5 CPUs per 1 Node
* AND REQMEM in mn = 15M
* THEN REQMEM per mc = 1 / 5 * 15 = 3M per c = node# / CPU# * mn#
* ELSE REQMEM_NO  

Possible code (draft, incomplete for loop) =  

    for row in accre:
    accre['REQMEM_CVT'] = 0
    
    if REQMEM_TYPE == 'Mc':
        accre['REQMEM_CVT'] = accre['REQMEM_NO']
        
    elif REQMEM_TYPE == 'Mn':
        accre['REQMEM_CVT'] = (accre['REQMEM_NO'] * (accre['NODES']/accre['CPUS']))
        
What that looks like in code =

    # If statement, tested on MC:
    if test_mc['REQMEM_TYPE'] == 'Mc':
        print(test_mc['REQMEM_NO'])
    elif test_mc['REQMEM_TYPE'] == 'Mn':
        print(test_mc['REQMEM_NO'] * (test_mc['NODES']/test_mc['CPUS']))
    
    # If statement, tested on Mn:
    if test_mn['REQMEM_TYPE'] == 'Mc':
        print(test_mn['REQMEM_NO'])
    elif test_mn['REQMEM_TYPE'] == 'Mn':
        print(test_mn['REQMEM_NO'] * (test_mn['NODES']/test_mn['CPUS']))

**CORRECTED LOGIC.**  
``REQMEM`` is megabytes of requested memory. It may be counted per core (Mc) or per node (Mn). To calculate total ``REQMEM``, multiply ``REQMEM`` by either the ``CPUS`` (for Mc) or ``NODES`` (for Mn) field. To convert ``REQMEM`` from per node to per core, take the requested memory per node, multiply by the number of nodes to get the job total, then divide by the number of CPUs.

In [11]:
# Build the calculated field 'REQMEM_PER_CORE' from the original REQMEM_TYPE:
# rows requested per node ('Mn') are scaled to the job total (x NODES) and then
# divided across CPUS; every other row keeps REQMEM_NO as-is.
requested_per_node = accre['REQMEM_TYPE'] == 'Mn'
node_request_per_core = (accre['REQMEM_NO'] * accre['NODES']) / accre['CPUS']
accre['REQMEM_PER_CORE'] = np.where(requested_per_node, node_request_per_core, accre['REQMEM_NO'])
# Inspect the first rows to confirm the new column looks right
accre.head(200)

Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST,REQMEM_NO,REQMEM_TYPE,REQMEM_PER_CORE
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,cn1531,122880,Mn,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,cn1441,122880,Mn,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,13-18:00:00,13-18:00:20,1,24,production,0:0,COMPLETED,cn1464,122880,Mn,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,13-18:00:00,12-03:50:32,1,24,production,0:0,COMPLETED,cn1473,122880,Mn,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,13-18:00:00,13-18:00:03,1,24,production,0:0,COMPLETED,cn1440,122880,Mn,5120.0
5,16297022,treviso,arabella,122880Mn,65183.02M,13-18:00:00,12-02:31:43,1,24,production,0:0,COMPLETED,cn1443,122880,Mn,5120.0
6,16297024,treviso,arabella,122880Mn,64941.85M,13-18:00:00,13-06:54:59,1,24,production,0:0,COMPLETED,cn1531,122880,Mn,5120.0
7,16316283,treviso,arabella,122880Mn,68499.51M,13-18:00:00,13-18:00:05,1,24,production,0:0,COMPLETED,cn1535,122880,Mn,5120.0
8,16316284,treviso,arabella,122880Mn,67761.41M,13-18:00:00,13-18:00:15,1,24,production,0:0,COMPLETED,cn1464,122880,Mn,5120.0
9,16316324,treviso,arabella,122880Mn,68087.66M,13-18:00:00,13-18:00:06,1,24,production,0:0,COMPLETED,cn1473,122880,Mn,5120.0


``USEDMEM`` is megabytes of used memory per node.  
**ASSUMING THAT ``USEDMEM`` IS THE MAXIMUM PER NODE IN MULTI-NODE JOBS:** To calculate total ``USEDMEM``, multiply ``USEDMEM`` by ``NODES`` and to get ``USEDMEM`` per core, divide the total by ``CPUS``. Assign this to ``USEDMEM_PER_CORE_01``.  
**ASSUMING THAT ``USEDMEM`` IS THE TOTAL FOR ALL NODES IN MULTI-NODE JOBS:** ``USEDMEM`` is already a total. To get ``USEDMEM`` per core, divide the ``USEDMEM`` by ``CPUS``. Assign this to ``USEDMEM_PER_CORE_02``.

In [12]:
# Split USEDMEM (e.g. "65973.49M") into its numeric amount and its unit suffix.
# The number pattern captures the optional fractional part as well: a bare
# (\d+) would stop at the decimal point and silently truncate 65973.49 to 65973.
# Patterns are raw strings so the regex escapes are not treated as (invalid)
# Python string escapes; expand=False makes str.extract return a Series.

# USEDMEM_NO: the full decimal amount, converted to a mathable data type
accre['USEDMEM_NO'] = pd.to_numeric(
    accre['USEDMEM'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
)
# USEDMEM_TYPE: the trailing non-digit character (the unit), in case there are several
accre['USEDMEM_TYPE'] = accre['USEDMEM'].str.extract(r'(\D$)', expand=False)
# USEDMEM_PER_CORE_01: per-core usage if USEDMEM is USEDMEM PER NODE
# (scale to the job total with NODES, then divide across CPUS)
accre['USEDMEM_PER_CORE_01'] = (accre['USEDMEM_NO'] * accre['NODES']) / accre['CPUS']
# USEDMEM_PER_CORE_02: per-core usage if USEDMEM is already the total USEDMEM
accre['USEDMEM_PER_CORE_02'] = accre['USEDMEM_NO'] / accre['CPUS']

In [13]:
# Check that the dataframe looks right with all of the new columns
# accre.head(500)

Next, convert the fields ``REQTIME`` and ``USEDTIME`` to seconds.

In [14]:
# Create a function to split a [d-]hh:mm:ss string and calculate seconds from it
def to_sec(x):
    """Convert a duration string to a total number of seconds.

    Accepts ``hh:mm:ss`` and, additionally, the ``d-hh:mm:ss`` form where a
    whole number of days precedes the clock portion.

    Parameters
    ----------
    x : str
        Duration such as ``'01:02:03'`` or ``'13-18:00:00'``.

    Returns
    -------
    int
        Total seconds represented by ``x``.

    Raises
    ------
    ValueError
        If the clock portion does not consist of exactly three
        colon-separated integers.
    """
    day_part, dash, clock_part = x.partition('-')
    # Only treat the text before the first '-' as a day count when it is a
    # plain number; otherwise parse the whole string as hh:mm:ss.
    if not (dash and day_part.isdigit()):
        day_part, clock_part = '0', x
    h, m, s = map(int, clock_part.split(':'))
    return ((int(day_part) * 24 + h) * 60 + m) * 60 + s

In [15]:
# Convert REQTIME and USEDTIME ([d-]hh:mm:ss strings) to total seconds.
# Both columns go through exactly the same steps, so loop over them instead of
# repeating the code. Patterns are raw strings so the \d regex escape is not
# treated as an (invalid) Python string escape; expand=False makes str.extract
# return a Series.
for time_col in ('REQTIME', 'USEDTIME'):
    day_sec_col = time_col + '_DAY_SEC'
    clock_col = time_col + '_T'
    # Separate out the optional leading day count ("13" in "13-18:00:00"),
    # convert to a number, fill NaN (no day prefix) with 0, convert to seconds
    day_count = pd.to_numeric(accre[time_col].str.extract(r'(\d+)-', expand=False))
    accre[day_sec_col] = day_count.fillna(0) * 24 * 60 * 60
    # Extract the trailing hh:mm:ss portion into its own column
    accre[clock_col] = accre[time_col].str.extract(r'(..:..:..)$', expand=False)
    # <time_col>_SEC holds the total seconds: clock seconds (via to_sec) plus day seconds
    accre[time_col + '_SEC'] = accre[clock_col].apply(to_sec) + accre[day_sec_col]
# Check to make sure the data types look okay
# accre.info()

In [16]:
# Thinking about alternate approaches
# accre['REQTIME_SEC'] = np.where(accre['REQTIME'].re.findall(r'(\d+)-' == True, 'yes', 'no')

In [17]:
# accre.head(500)

In [18]:
# List every column currently in the dataframe (original plus derived)
accre.columns

Index(['JOBID', 'ACCOUNT', 'USER', 'REQMEM', 'USEDMEM', 'REQTIME', 'USEDTIME', 'NODES', 'CPUS', 'PARTITION', 'EXITCODE', 'STATE', 'NODELIST', 'REQMEM_NO', 'REQMEM_TYPE', 'REQMEM_PER_CORE', 'USEDMEM_NO', 'USEDMEM_TYPE', 'USEDMEM_PER_CORE_01', 'USEDMEM_PER_CORE_02', 'REQTIME_DAY_SEC', 'REQTIME_T', 'REQTIME_SEC', 'USEDTIME_DAY_SEC', 'USEDTIME_T', 'USEDTIME_SEC'], dtype='object')

In [19]:
# Columns to export, in output order: the originals (minus NODELIST) with each
# derived per-core / seconds column placed next to the column it came from
export_columns = [
    'JOBID', 'ACCOUNT', 'USER',
    'REQMEM', 'REQMEM_PER_CORE',
    'USEDMEM', 'USEDMEM_PER_CORE_01', 'USEDMEM_PER_CORE_02',
    'REQTIME', 'REQTIME_SEC',
    'USEDTIME', 'USEDTIME_SEC',
    'NODES', 'CPUS', 'PARTITION', 'EXITCODE', 'STATE',
]
accre.to_csv('../data/accre_cleaned_no_nodelist.csv', columns=export_columns)

In [20]:
# To strip apart nodelists [WIP]
# nodelist = failed['NODELIST'].unique().tolist()
# nodelist_nums = [i.strip('cn').strip('[').strip(']') for i in nodelist]
# for n in nodelist_nums:
#    if ',' in n: 
#        nodes = n.split(',')
#        if '-' in n:
#            nodes = n.split('-')
#            low, high = int(nodes[0]), int(nodes[1])       
#        else:
#            nodes = int(n)