In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline

# Conversion of the columns 

This import has an error so we will wrap it in a Try-Except block first.

In [2]:
data = pd.read_csv('../data/accre-jobs-2020-processed.csv')
display(data.head())

Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,cn1531
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,cn1441
2,15861125,treviso,arabella,122880Mn,69111.86M,13-18:00:00,13-18:00:20,1,24,production,0:0,COMPLETED,cn1464
3,16251645,treviso,arabella,122880Mn,65317.33M,13-18:00:00,12-03:50:32,1,24,production,0:0,COMPLETED,cn1473
4,16251646,treviso,arabella,122880Mn,65876.11M,13-18:00:00,13-18:00:03,1,24,production,0:0,COMPLETED,cn1440


In [3]:
data.iloc[[3460]]

Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST
3460,17050901_91,winged,lavonda,4096Mn,669.61M,12:00:00,00:06:05,4,1,production,0:0,COMPLETED,"cn[449,463,911,913]"


## Standardize the columns

In [4]:
accre_jobs_2020 = data.rename(columns = str.lower)

In [5]:
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,cn1531
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,cn1441
2,15861125,treviso,arabella,122880Mn,69111.86M,13-18:00:00,13-18:00:20,1,24,production,0:0,COMPLETED,cn1464
3,16251645,treviso,arabella,122880Mn,65317.33M,13-18:00:00,12-03:50:32,1,24,production,0:0,COMPLETED,cn1473
4,16251646,treviso,arabella,122880Mn,65876.11M,13-18:00:00,13-18:00:03,1,24,production,0:0,COMPLETED,cn1440


## Converting Job times to total seconds

Job time is in a format of either d-hh:mm:ss or hh:mm:ss, it needs to be converted to total seconds

In [6]:
def convert_time_to_seconds(time_str):
    """Convert the given time string d-hh:mm:ss or hh:mm:ss to total seconds."""
    
    # Account for possible errors
    try:
        
        # Initialize all time components to 0
        days = hours = minutes = seconds = 0

        # Split the time string at the dash mark
        time_dash_split = time_str.split("-") # => Either ["d", "hh:mm:ss"] or ["hh:mm:ss"]

        # Assign each time element to variables
        if len(time_dash_split) > 1: 
            # Remove the first element from the list and convert to integer and add to days
            days += int(time_dash_split.pop(0))

        # If here, len(time_dash_split) == 1
        # Split the time at the colon marks
        time_colon_split = time_dash_split[0].split(":") # => ["hh", "mm", "ss"]
        seconds += int(time_colon_split.pop())
        minutes += int(time_colon_split.pop())
        hours += int(time_colon_split.pop())

        # Now combine everything together
        return (((((days * 24) + hours) * 60) + minutes) * 60) + seconds
        
    except AttributeError as error:
        # Skip everything if the column is already in seconds (assuming one single digit)
        return time_str

Now, we can convert the time columns to seconds

In [7]:

new_reqtime = accre_jobs_2020["reqtime"].map(convert_time_to_seconds)
new_usedtime = accre_jobs_2020["usedtime"].map(convert_time_to_seconds)

accre_jobs_2020 = accre_jobs_2020.assign(reqtime = new_reqtime)
accre_jobs_2020 = accre_jobs_2020.assign(usedtime = new_usedtime)

accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440


## Split the exitcode in two columns

In [8]:
accre_jobs_2020[["exitcode_user", "exitcode_error"]] = accre_jobs_2020["exitcode"].str.split(":", expand=True)

In [9]:
accre_jobs_2020['exitcode_error'] = accre_jobs_2020['exitcode_error'].astype(int)

In [10]:
accre_jobs_2020['exitcode_user'] = accre_jobs_2020['exitcode_user'].astype(int)

## Converting usedmem to megabyte per core

In [11]:
accre_jobs_2020.usedmem.str[-1].value_counts()

M    3213713
0     602577
Name: usedmem, dtype: int64

In [12]:
#checking at the data with 0 as used memory # can put it in exploratory?
#accre_usedmem_zeros = accre_jobs_2020[accre_jobs_2020['usedmem'] == '0']
#accre_usedmem_zeros.head(3)

In [13]:
accre_jobs_2020['used_mb_per_node'] = accre_jobs_2020['usedmem'].str.strip('M')

In [14]:
accre_jobs_2020['used_mb_per_node'] = accre_jobs_2020['used_mb_per_node'].astype(float)

In [15]:
accre_jobs_2020['used_mb_per_core'] = (accre_jobs_2020['used_mb_per_node'])/(accre_jobs_2020['cpus']*accre_jobs_2020['nodes'])
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917


## Converting request memory to megabyte per core

In [16]:
condition = accre_jobs_2020["reqmem"].str.contains("Mc")

accre_jobs_2020.loc[condition, "reqmem_mc"] = accre_jobs_2020["reqmem"].str[:-2]

In [17]:
condition = accre_jobs_2020["reqmem"].str.contains("Mn")

accre_jobs_2020.loc[condition, "reqmem_mn"] = accre_jobs_2020["reqmem"].str[:-2]

In [18]:
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880


In [19]:
accre_jobs_2020['reqmem_mn'] = accre_jobs_2020['reqmem_mn'].astype(float)
accre_jobs_2020['reqmem_mc'] = accre_jobs_2020['reqmem_mc'].astype(float)

In [20]:
accre_jobs_2020['reqmem_mn_per_core'] = (accre_jobs_2020['reqmem_mn'])/(accre_jobs_2020['cpus']*accre_jobs_2020['nodes'])
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_mn_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0


In [21]:
accre_jobs_2020.loc[accre_jobs_2020['reqmem_mn_per_core'].isnull(),'reqmem_mn_per_core'] = accre_jobs_2020['reqmem_mc']

In [22]:
accre_jobs_2020[accre_jobs_2020['reqmem_mc'].isnull()].head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_mn_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0


In [23]:
accre_jobs_2020.rename(columns={'reqmem_mn_per_core':'reqmem_per_core'}, inplace=True)

In [24]:
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0


## Formating nodelist

Here are the patterns found in nodelist and the conversion we want to happen:

Single Node Cluster

cn456 => cluster_type: "cn", cluster_nodes: "456"
ng1102 => cluster_type: "ng", cluster_nodes: "1102"
gpu0037 => cluster_type: "gpu", cluster_nodes: "0037"
Multi-Nodes Cluster List

cn[345,656,754,565] => cluster_type: "cn", cluster_nodes: "345,656,754,565"
Multi-Nodes Cluster Range

gpu[1367-1370] => cluster_type: "gpu", cluster_nodes: "1367,1368,1369,1370"
Multi-Nodes Cluster List and Range Combo

cn[382-383,394,416,911-913] => cluster_type: "cn", cluster_nodes:"382,383,394,416,911,912,913"
Single Node Patternless

vm-cms-sam-pri => cluster_type: "generic", cluster_nodes: "vm-cms-sam-pri"
eval-dell-01 => cluster_type: "generic", cluster_nodes: "eval-dell-01"
dougherty1 => cluster_type: "generic", cluster_nodes: "dougherty1"

In [25]:
# How to use this function: df["nodelist"].apply(format_nodelist)

def format_nodelist(nodelist):
    """This function takes a list of nodes in any of the pattern format described above and convert it into a tuple in the format (cluster_type, cluster_nodes)
        Both the cluster_type and the cluster_nodes will be strings."""
    
    # We want to return the cluster_type and the cluster_nodes as strings
    cluster_type = "" # e.g. "cn"
    cluster_nodes = "" # e.g. "1308, 134, 545, 546, 547, 548"
    
    # FIRST PART: Handle the cases for:
    #   - Multi-Nodes Cluster List
    #   - Multi-Nodes Cluster Range
    #   - Multi-Nodes Cluster List and Range Combo
    
    # First, remove the "]" and split at the "[" character from the nodelist string
    # If the nodelist string does not contain "[]", this will be ignore and just get the original string
    
    type_and_list = nodelist.strip("]").split("[") # e.g. "cn[1308, 134, 545-548]" => ["cn", "1308, 134, 545-548"]
    
    # If type_and_list has more than one element, we have an actual cluster to process further
    if len(type_and_list) > 1:
        
        # The cluster type is the first element of the list
        cluster_type = type_and_list[0] # "cn"
        
        # The individual nodes can be split on the comma
        nodes = type_and_list[1].split(",") # ["1308", "134", "545-548"]
        
        # The cluster list might need further processing if it has a range
        for nstr in nodes:
            
            # Check if the string contains a dash. If it does, this is a range
            if ("-" in nstr):
                
                # Split on the dash
                range_list = nstr.split("-") # ["545", "548"]
                                
                # Convert the range numbers to int
                start_num = int(range_list[0])
                end_num = int(range_list[1])
                
                # Generate the actual range of numbers
                # The first number is the start of the range, second number is the end of the list INCLUSIVE
                range_list_int = [num for num in range(start_num, end_num+1)] # +1 to account for inclusive
                
                # Now, range_list_int == [545, 546, 547, 548]
                # We need to convert it back to a list of string
                range_list_str = list(map(lambda num: "{}".format(num), range_list_int))
                
                # Now, range_list_int == ["545", "546", "547", "548"]
                # Add each one of them to the cluster node
                for n in range_list_str:
                    
                    # Append to the cluster_nodes string
                    cluster_nodes += "," + n
            
            else: # The string does not have a dash so it is just a single node. e.g. "1308" and "134"
                
                # Append directly to the cluster_nodes. Remove white spaces
                cluster_nodes += "," + nstr.strip()
    
    else: # If here, the type_and_list has only one element string
        
        # SECOND PART: Handle the cases for:
        #   - Single Node Cluster
        
        # Use regular expression to find the node_type (cluster_type) and the node_number (cluster_number)
        # Make sure `re` is imported
        
        single_node_cluster_format = r"^(\w{2,3}?)(\d+)" # for cn124, or gpu5738
        
        # Check if the node match the Single Node Cluster pattern
        if re.match(single_node_cluster_format, type_and_list[0]):
                    
            # Using re.findall() and Splitting text and number in string
            # This split gives us the node_type and the node_number
            cluster_type, cluster_nodes = re.findall(single_node_cluster_format, type_and_list[0])[0]
        
        else:
            # If we don't get a match, it means it is a Single Node Patternless
            # THIRD PART: Handle the cases for:
            #   - Single Node Patternless
            
            cluster_type = "generic"
            cluster_nodes = type_and_list[0]
    
    # Trim cluster_nodes of any extra commas
    cluster_nodes = cluster_nodes.strip(",")
    
    # Finally, return the cluster_type and the cluster_nodes a a tuple
    return (cluster_type, cluster_nodes)    

In [26]:
# Testing: Multi-nodes
original_1 = "cn[1308, 134, 545-548]"

# Testing: Single Node Cluster
original_2 = "cn1308"
original_3 = "gpu567"
original_4 = "ng1102"

# Testing: Single Node Patternless
original_5 = "vm-cms-sam-pri"
original_6 = "eval-dell-01"
original_7 = "dougherty1"

print(format_nodelist(original_1))
print(format_nodelist(original_2))
print(format_nodelist(original_3))
print(format_nodelist(original_4))
print(format_nodelist(original_5))
print(format_nodelist(original_6))
print(format_nodelist(original_7))

('cn', '1308,134,545,546,547,548')
('cn', '1308')
('gpu', '567')
('ng', '1102')
('generic', 'vm-cms-sam-pri')
('generic', 'eval-dell-01')
('generic', 'dougherty1')


In [27]:
# Add a column cluster_type_and_nodes
# This will contain the tuple from format_nodelist
accre_jobs_2020 = accre_jobs_2020.assign(cluster_type_and_nodes = accre_jobs_2020['nodelist'].apply(format_nodelist))
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,...,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core,cluster_type_and_nodes
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,...,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0,"(cn, 1531)"
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,...,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0,"(cn, 1441)"
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,...,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0,"(cn, 1464)"
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,...,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0,"(cn, 1473)"
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,...,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0,"(cn, 1440)"


In [28]:
# Finally, Split the cluster_type_and_nodes column into 2 separate columns
accre_jobs_2020[['cluster_type', 'cluster_nodes']] = pd.DataFrame(accre_jobs_2020['cluster_type_and_nodes'].tolist(), index=accre_jobs_2020.index) 
accre_jobs_2020

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,...,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core,cluster_type_and_nodes,cluster_type,cluster_nodes
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,...,0,0,65973.49,2748.895417,,122880.0,5120.000000,"(cn, 1531)",cn,1531
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,...,0,0,67181.12,2799.213333,,122880.0,5120.000000,"(cn, 1441)",cn,1441
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,...,0,0,69111.86,2879.660833,,122880.0,5120.000000,"(cn, 1464)",cn,1464
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,...,0,0,65317.33,2721.555417,,122880.0,5120.000000,"(cn, 1473)",cn,1473
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,...,0,0,65876.11,2744.837917,,122880.0,5120.000000,"(cn, 1440)",cn,1440
5,16297022,treviso,arabella,122880Mn,65183.02M,1188000,1045903,1,24,production,...,0,0,65183.02,2715.959167,,122880.0,5120.000000,"(cn, 1443)",cn,1443
6,16297024,treviso,arabella,122880Mn,64941.85M,1188000,1148099,1,24,production,...,0,0,64941.85,2705.910417,,122880.0,5120.000000,"(cn, 1531)",cn,1531
7,16316283,treviso,arabella,122880Mn,68499.51M,1188000,1188005,1,24,production,...,0,0,68499.51,2854.146250,,122880.0,5120.000000,"(cn, 1535)",cn,1535
8,16316284,treviso,arabella,122880Mn,67761.41M,1188000,1188015,1,24,production,...,0,0,67761.41,2823.392083,,122880.0,5120.000000,"(cn, 1464)",cn,1464
9,16316324,treviso,arabella,122880Mn,68087.66M,1188000,1188006,1,24,production,...,0,0,68087.66,2836.985833,,122880.0,5120.000000,"(cn, 1473)",cn,1473


In [29]:
# Checking rows with nodes > 150
accre_jobs_2020[accre_jobs_2020["nodes"] > 150]

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,...,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core,cluster_type_and_nodes,cluster_type,cluster_nodes
2781194,22266623,crowns,wilhelmina,5120Mc,54.75M,345600,78,156,1,production,...,0,0,54.75,0.350962,5120.0,,5120.0,"(cn, 313,324,331,332,334,335,336,354,361,365,3...",cn,"313,324,331,332,334,335,336,354,361,365,372,37..."
2781539,22272025,crowns,wilhelmina,5120Mc,9910.64M,345600,114568,163,1,production,...,1,0,9910.64,60.801472,5120.0,,5120.0,"(cn, 305,313,324,331,332,334,335,336,337,354,3...",cn,"305,313,324,331,332,334,335,336,337,354,361,36..."
3311027,23335998,grisette,acy,12288Mn,2.70M,3000,533,200,2,production,...,0,15,2.7,0.00675,,12288.0,30.72,"(cn, 304,315,316,317,318,319,320,322,323,324,3...",cn,"304,315,316,317,318,319,320,322,323,324,326,32..."
3329704,23352111,grisette,acy,12288Mn,0.46M,3000,3,200,2,production,...,0,0,0.46,0.00115,,12288.0,30.72,"(cn, 315,316,317,318,319,320,322,323,324,326,3...",cn,"315,316,317,318,319,320,322,323,324,326,327,32..."
3343504,23375092,grisette,acy,12288Mn,2.83M,3000,31,400,2,production,...,0,0,2.83,0.003538,,12288.0,15.36,"(cn, 301,302,303,305,306,307,308,309,311,312,3...",cn,"301,302,303,305,306,307,308,309,311,312,313,31..."
3343506,23375094,grisette,acy,12288Mn,0.62M,3000,24,500,2,production,...,0,0,0.62,0.00062,,12288.0,12.288,"(cn, 301,302,303,305,306,307,308,309,311,312,3...",cn,"301,302,303,305,306,307,308,309,311,312,313,31..."


In [30]:
# List of nodes for jobid == 22272025
accre_jobs_2020[accre_jobs_2020["jobid"] == "22272025"]["cluster_nodes"].values

array(['305,313,324,331,332,334,335,336,337,354,361,365,372,373,374,382,386,404,408,437,438,445,447,472,497,911,912,1085,1086,1088,1090,1092,1094,1095,1096,1121,1122,1123,1126,1127,1128,1129,1132,1201,1202,1203,1207,1208,1212,1213,1214,1215,1216,1221,1223,1224,1228,1241,1258,1260,1261,1262,1263,1269,1270,1278,1279,1281,1285,1286,1287,1291,1292,1299,1305,1306,1307,1308,1310,1313,1319,1321,1323,1325,1331,1332,1335,1336,1338,1339,1340,1341,1342,1343,1344,1345,1346,1347,1349,1350,1351,1352,1353,1354,1358,1359,1364,1367,1368,1369,1370,1376,1377,1380,1384,1385,1386,1387,1388,1389,1391,1392,1394,1395,1396,1397,1398,1420,1421,1423,1424,1425,1426,1427,1432,1433,1434,1435,1436,1437,1438,1442,1443,1451,1452,1472,1492,1494,1495,1497,1498,1499,1504,1506,1508,1516,1521,1524,1530,1531,1534,1535,1536'],
      dtype=object)

In [31]:
# Checking rows with cluster_Type == 'generic'
accre_jobs_2020[accre_jobs_2020["cluster_type"] == "generic"]["cluster_nodes"].value_counts()

vm-cms-sam-pri      45761
vm-cms-sam-sec      33390
eval-dell-01          381
capra1                 83
dougherty1             72
vm-qa-flatearth1        4
rocksteady              2
Name: cluster_nodes, dtype: int64

In [210]:
accre_jobs_2020_with_cpus_0 = accre_jobs_2020[accre_jobs_2020["cpus"] == 0]

In [211]:
accre_jobs_2020 = accre_jobs_2020[accre_jobs_2020["cpus"] != 0]

# Cleaning the useless information

In [212]:
accre_jobs_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3813160 entries, 0 to 3816287
Data columns (total 15 columns):
jobid               object
account             object
user                object
reqtime             int64
usedtime            int64
nodes               int64
cpus                int64
partition           object
state               object
exitcode_user       int64
exitcode_error      int64
used_mb_per_core    float64
reqmem_per_core     float64
cluster_type        object
cluster_nodes       object
dtypes: float64(2), int64(6), object(7)
memory usage: 465.5+ MB


In [213]:
#remove non-informative columns. eg. df.drop(['column'], axis=1)
accre_jobs_2020 = accre_jobs_2020.drop(["reqmem","usedmem","exitcode","nodelist","used_mb_per_node","reqmem_mc","reqmem_mn","cluster_type_and_nodes"], axis=1)
accre_jobs_2020.head()

KeyError: "['reqmem' 'usedmem' 'exitcode' 'nodelist' 'used_mb_per_node' 'reqmem_mc'\n 'reqmem_mn' 'cluster_type_and_nodes'] not found in axis"

In [214]:
accre_jobs_2020.shape

(3813160, 15)

In [56]:
# remove running jobs
accre_jobs_2020 = accre_jobs_2020_clean1[accre_jobs_2020_clean1['state'] != 'RUNNING']

In [57]:
accre_jobs_2020.shape

(3813182, 15)

In [215]:
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqtime,usedtime,nodes,cpus,partition,state,exitcode_user,exitcode_error,used_mb_per_core,reqmem_per_core,cluster_type,cluster_nodes
0,15925210,treviso,arabella,1188000,1188028,1,24,production,COMPLETED,0,0,2748.895417,5120.0,cn,1531
1,15861126,treviso,arabella,1188000,1090256,1,24,production,COMPLETED,0,0,2799.213333,5120.0,cn,1441
2,15861125,treviso,arabella,1188000,1188020,1,24,production,COMPLETED,0,0,2879.660833,5120.0,cn,1464
3,16251645,treviso,arabella,1188000,1050632,1,24,production,COMPLETED,0,0,2721.555417,5120.0,cn,1473
4,16251646,treviso,arabella,1188000,1188003,1,24,production,COMPLETED,0,0,2744.837917,5120.0,cn,1440


In [218]:
accre_jobs_2020.to_csv('../data/accre_jobs_2020_cleaned.csv', index = False)

# Exploring the dataframe

## column partition

In [163]:
accre_jobs_2020['partition'].value_counts()

production              3308929
nogpfs                   327440
sam                       79151
pascal                    48000
turing                    39377
debug                      6738
maxwell                    3347
cgw-capra1                   83
cgw-dougherty1               70
cgw-cqs1                     34
cgw-vm-qa-flatearth1          4
cgw-cqs3                      4
cgw-tbi01                     2
devel                         2
cgw-rocksteady                1
Name: partition, dtype: int64

## Exit code error

In [None]:
#accre_jobs_2020(accre_jobs_2020['exitcode_error'] != 0)