# accre_jobs_2020

In [1]:
import pandas as pd
import numpy as np
import re

## Importing the Data

Check the `accre-jobs-2020--datasource-preprocessing.ipynb` file for the data pre-processing steps

In [2]:
accre_jobs_2020 = pd.read_csv('../data/accre-jobs-2020-processed.csv')
accre_jobs_2020.head()

Unnamed: 0,JOBID,ACCOUNT,USER,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE,STATE,NODELIST
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,cn1531
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,cn1441
2,15861125,treviso,arabella,122880Mn,69111.86M,13-18:00:00,13-18:00:20,1,24,production,0:0,COMPLETED,cn1464
3,16251645,treviso,arabella,122880Mn,65317.33M,13-18:00:00,12-03:50:32,1,24,production,0:0,COMPLETED,cn1473
4,16251646,treviso,arabella,122880Mn,65876.11M,13-18:00:00,13-18:00:03,1,24,production,0:0,COMPLETED,cn1440


## Data Preparation and Cleaning

### Change headers to all lowercase

In [3]:
accre_jobs_2020.columns = accre_jobs_2020.columns.map(str.lower)
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,cn1531
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,cn1441
2,15861125,treviso,arabella,122880Mn,69111.86M,13-18:00:00,13-18:00:20,1,24,production,0:0,COMPLETED,cn1464
3,16251645,treviso,arabella,122880Mn,65317.33M,13-18:00:00,12-03:50:32,1,24,production,0:0,COMPLETED,cn1473
4,16251646,treviso,arabella,122880Mn,65876.11M,13-18:00:00,13-18:00:03,1,24,production,0:0,COMPLETED,cn1440


First, let's look at the quick summary

In [4]:
accre_jobs_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3816290 entries, 0 to 3816289
Data columns (total 13 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   jobid      object
 1   account    object
 2   user       object
 3   reqmem     object
 4   usedmem    object
 5   reqtime    object
 6   usedtime   object
 7   nodes      int64 
 8   cpus       int64 
 9   partition  object
 10  exitcode   object
 11  state      object
 12  nodelist   object
dtypes: int64(2), object(11)
memory usage: 378.5+ MB


Looks like we have some columns here that need to be formatted into the correct data types. But we will take care of the data types only after manipulating the columns.

### Removing all `STATE == RUNNING` from main analysis

Because these computers are still in their running state, their `usedmem` and `usedtime` are not accurate and could throw off our analysis. We will keep them aside and analyze them separately if needed.

In [5]:
# Setting aside those with state == running
computers_state_running = accre_jobs_2020[accre_jobs_2020['state'] == 'RUNNING']
computers_state_running.shape

(3108, 13)

In [6]:
# Removing those with state == running from the original dataset
accre_jobs_2020 = accre_jobs_2020[accre_jobs_2020['state'] != 'RUNNING']
accre_jobs_2020.shape

(3813182, 13)

### Convert all job times to total seconds

Job time is in a format of either `d-hh:mm:ss` or `hh:mm:ss`, it needs to be converted to total seconds

In [7]:
def convert_time_to_seconds(time_str):
    """Convert the given time string d-hh:mm:ss or hh:mm:ss to total seconds."""
    
    # Account for possible errors
    try:
        
        # Initialize all time components to 0
        days = hours = minutes = seconds = 0

        # Split the time string at the dash mark
        time_dash_split = time_str.split("-") # => Either ["d", "hh:mm:ss"] or ["hh:mm:ss"]

        # Assign each time element to variables
        if len(time_dash_split) > 1: 
            # Remove the first element from the list and convert to integer and add to days
            days += int(time_dash_split.pop(0))

        # If here, len(time_dash_split) == 1
        # Split the time at the colon marks
        time_colon_split = time_dash_split[0].split(":") # => ["hh", "mm", "ss"]
        seconds += int(time_colon_split.pop())
        minutes += int(time_colon_split.pop())
        hours += int(time_colon_split.pop())

        # Now combine everything together
        return (((((days * 24) + hours) * 60) + minutes) * 60) + seconds
        
    except AttributeError as error:
        # Skip everything if the column is already in seconds (assuming one single digit)
        return time_str

Now, we can convert the time columns to seconds

In [8]:
# Apply time conversion
new_reqtime = accre_jobs_2020["reqtime"].map(convert_time_to_seconds)
new_usedtime = accre_jobs_2020["usedtime"].map(convert_time_to_seconds)

# Assign the values to the columns
accre_jobs_2020 = accre_jobs_2020.assign(reqtime = new_reqtime)
accre_jobs_2020 = accre_jobs_2020.assign(usedtime = new_usedtime)

# Check
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440


### Split the `exitcode` into `exitcode_user` and `exitcode_error`

In [9]:
# Split the exit code into exitcode_user , exitcode_error
accre_jobs_2020[["exitcode_user", "exitcode_error"]] = accre_jobs_2020["exitcode"].str.split(":", expand=True)
accre_jobs_2020

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0


### Convert `usedmem` to `used_mb_per_core`

All memory should be in the format *Mb per core*

In [10]:
# Get used_mb_per_node
accre_jobs_2020['used_mb_per_node'] = accre_jobs_2020['usedmem'].str.strip('M').astype(float)
accre_jobs_2020

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75


There are some weird computers with cpus == 0. We will put those aside and maybe look at them later?

In [11]:
accre_jobs_2020_with_zero_cpus = accre_jobs_2020[accre_jobs_2020["cpus"] == 0]
# Remove them from the actual dataset
accre_jobs_2020 = accre_jobs_2020[accre_jobs_2020["cpus"] != 0]

Now, we can do better division

In [12]:
# Calculate used_mb_per_core
accre_jobs_2020['used_mb_per_core'] = (accre_jobs_2020['used_mb_per_node']) / (accre_jobs_2020['cpus'] * accre_jobs_2020['nodes'])
accre_jobs_2020


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accre_jobs_2020['used_mb_per_core'] = (accre_jobs_2020['used_mb_per_node']) / (accre_jobs_2020['cpus'] * accre_jobs_2020['nodes'])


Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08,11634.540000
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81,10439.905000
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68,15021.340000
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75,15533.875000


### Convert `reqmem` to `reqmem_per_core`

All memory should be in the format *Mb per core*

In [13]:
# Set reqmem_mc to those that contains the "Mc" characters
condition = accre_jobs_2020["reqmem"].str.contains("Mc")
accre_jobs_2020.loc[condition, "reqmem_mc"] = accre_jobs_2020["reqmem"].str[:-2]
accre_jobs_2020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08,11634.540000,
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81,10439.905000,
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68,15021.340000,
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75,15533.875000,


In [14]:
# Set reqmem_mn to those that contains the "Mn" characters
condition = accre_jobs_2020["reqmem"].str.contains("Mn")
accre_jobs_2020.loc[condition, "reqmem_mn"] = accre_jobs_2020["reqmem"].str[:-2]
accre_jobs_2020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08,11634.540000,,32768
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81,10439.905000,,32768
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68,15021.340000,,32768
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75,15533.875000,,32768


In [15]:
# Converting to correct data types for calculation
accre_jobs_2020['reqmem_mn'] = accre_jobs_2020['reqmem_mn'].astype(float)
accre_jobs_2020['reqmem_mc'] = accre_jobs_2020['reqmem_mc'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accre_jobs_2020['reqmem_mn'] = accre_jobs_2020['reqmem_mn'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accre_jobs_2020['reqmem_mc'] = accre_jobs_2020['reqmem_mc'].astype(float)


In [16]:
# Convert reqmem_mn to reqmem_mn_per_core
accre_jobs_2020['reqmem_mn_per_core'] = (accre_jobs_2020['reqmem_mn']) / (accre_jobs_2020['cpus'] * accre_jobs_2020['nodes'])
accre_jobs_2020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accre_jobs_2020['reqmem_mn_per_core'] = (accre_jobs_2020['reqmem_mn']) / (accre_jobs_2020['cpus'] * accre_jobs_2020['nodes'])


Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_mn_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08,11634.540000,,32768.0,16384.0
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81,10439.905000,,32768.0,16384.0
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68,15021.340000,,32768.0,16384.0
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75,15533.875000,,32768.0,16384.0


In [17]:
# Combine reqmem_mc into reqmem_mn_per_core
accre_jobs_2020.loc[accre_jobs_2020['reqmem_mn_per_core'].isnull(),'reqmem_mn_per_core'] = accre_jobs_2020['reqmem_mc']
accre_jobs_2020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_mn_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08,11634.540000,,32768.0,16384.0
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81,10439.905000,,32768.0,16384.0
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68,15021.340000,,32768.0,16384.0
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75,15533.875000,,32768.0,16384.0


In [18]:
# Rename reqmem_mn_per_core colum to reqmem_per_core
accre_jobs_2020.rename(columns={'reqmem_mn_per_core':'reqmem_per_core'}, inplace=True)
accre_jobs_2020

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,0:0,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,0:0,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,0:0,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,0:0,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,0:0,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,0:0,COMPLETED,cn432,0,0,23269.08,11634.540000,,32768.0,16384.0
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,0:0,COMPLETED,cn440,0,0,20879.81,10439.905000,,32768.0,16384.0
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,30042.68,15021.340000,,32768.0,16384.0
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,0:0,COMPLETED,cn312,0,0,31067.75,15533.875000,,32768.0,16384.0


### Formatting `nodelist`

Here are the patterns found in `nodelist` and the conversion we want to happen:

**Single Node Cluster**

- `cn456` => `cluster_type`: `"cn"`, `cluster_nodes`: `"456"`
- `ng1102` => `cluster_type`: `"ng"`, `cluster_nodes`: `"1102"`
- `gpu0037` => `cluster_type`: `"gpu"`, `cluster_nodes`: `"0037"`

**Multi-Nodes Cluster List**

- `cn[345,656,754,565]` => `cluster_type`: `"cn"`, `cluster_nodes`: `"345,656,754,565"`

**Multi-Nodes Cluster Range**

- `gpu[1367-1370]` => `cluster_type`: `"gpu"`, `cluster_nodes`: `"1367,1368,1369,1370"`

**Multi-Nodes Cluster List and Range Combo**

- `cn[382-383,394,416,911-913]` => `cluster_type`: `"cn"`, `cluster_nodes`:`"382,383,394,416,911,912,913"`

**Single Node Patternless**

- `vm-cms-sam-pri` => `cluster_type`: `"generic"`, `cluster_nodes`: `"vm-cms-sam-pri"`
- `eval-dell-01` => `cluster_type`: `"generic"`, `cluster_nodes`: `"eval-dell-01"`
- `dougherty1` => `cluster_type`: `"generic"`, `cluster_nodes`: `"dougherty1"`

In [19]:
# How to use this function: df["nodelist"].apply(format_nodelist)

def format_nodelist(nodelist):
    """This function takes a list of nodes in any of the pattern format described above and convert it into a tuple in the format (cluster_type, cluster_nodes)
        Both the cluster_type and the cluster_nodes will be strings."""
    
    # We want to return the cluster_type and the cluster_nodes as strings
    cluster_type = "" # e.g. "cn"
    cluster_nodes = "" # e.g. "1308, 134, 545, 546, 547, 548"
    
    # FIRST PART: Handle the cases for:
    #   - Multi-Nodes Cluster List
    #   - Multi-Nodes Cluster Range
    #   - Multi-Nodes Cluster List and Range Combo
    
    # First, remove the "]" and split at the "[" character from the nodelist string
    # If the nodelist string does not contain "[]", this will be ignore and just get the original string
    
    type_and_list = nodelist.strip("]").split("[") # e.g. "cn[1308, 134, 545-548]" => ["cn", "1308, 134, 545-548"]
    
    # If type_and_list has more than one element, we have an actual cluster to process further
    if len(type_and_list) > 1:
        
        # The cluster type is the first element of the list
        cluster_type = type_and_list[0] # "cn"
        
        # The individual nodes can be split on the comma
        nodes = type_and_list[1].split(",") # ["1308", "134", "545-548"]
        
        # The cluster list might need further processing if it has a range
        for nstr in nodes:
            
            # Check if the string contains a dash. If it does, this is a range
            if ("-" in nstr):
                
                # Split on the dash
                range_list = nstr.split("-") # ["545", "548"]
                
                # Convert the range numbers to int
                start_num = int(range_list[0])
                end_num = int(range_list[1])
                
                # Generate the actual range of numbers
                # The first number is the start of the range, second number is the end of the list INCLUSIVE
                range_list_int = [num for num in range(start_num, end_num+1)] # +1 to account for inclusive
                
                # Now, range_list_int == [545, 546, 547, 548]
                # We need to convert it back to a list of string
                range_list_str = list(map(lambda num: "{}".format(num), range_list_int))
                
                # Now, range_list_int == ["545", "546", "547", "548"]
                # Add each one of them to the cluster node
                for n in range_list_str:
                    
                    # Append to the cluster_nodes string
                    cluster_nodes += "," + n
            
            else: # The string does not have a dash so it is just a single node. e.g. "1308" and "134"
                
                # Append directly to the cluster_nodes. Remove white spaces
                cluster_nodes += "," + nstr.strip()
    
    else: # If here, the type_and_list has only one element string
        
        # SECOND PART: Handle the cases for:
        #   - Single Node Cluster
        
        # Use regular expression to find the node_type (cluster_type) and the node_number (cluster_number)
        # Make sure `re` is imported
        
        single_node_cluster_format = r"^(\w{2,3}?)(\d+)" # for cn124, or gpu5738
        
        # Check if the node match the Single Node Cluster pattern
        if re.match(single_node_cluster_format, type_and_list[0]):
            
            # Using re.findall() and Splitting text and number in string
            # This split gives us the node_type and the node_number
            cluster_type, cluster_nodes = re.findall(single_node_cluster_format, type_and_list[0])[0]
        
        else:
            # If we don't get a match, it means it is a Single Node Patternless
            # THIRD PART: Handle the cases for:
            #   - Single Node Patternless
            
            cluster_type = "generic"
            cluster_nodes = type_and_list[0]
    
    # Trim cluster_nodes of any extra commas
    cluster_nodes = cluster_nodes.strip(",")
    
    # Finally, return the cluster_type and the cluster_nodes a a tuple
    return (cluster_type, cluster_nodes)

In [20]:
# Testing: Multi-nodes
original_1 = "cn[1308, 134, 545-548]"

# Testing: Single Node Cluster
original_2 = "cn1308"
original_3 = "gpu567"
original_4 = "ng1102"

# Testing: Single Node Patternless
original_5 = "vm-cms-sam-pri"
original_6 = "eval-dell-01"
original_7 = "dougherty1"

print(format_nodelist(original_1))
print(format_nodelist(original_2))
print(format_nodelist(original_3))
print(format_nodelist(original_4))
print(format_nodelist(original_5))
print(format_nodelist(original_6))
print(format_nodelist(original_7))

('cn', '1308,134,545,546,547,548')
('cn', '1308')
('gpu', '567')
('ng', '1102')
('generic', 'vm-cms-sam-pri')
('generic', 'eval-dell-01')
('generic', 'dougherty1')


Now, let's apply this to the data frame

In [21]:
# Add a column cluster_type_and_nodes
# This will contain the tuple from format_nodelist
accre_jobs_2020 = accre_jobs_2020.assign(cluster_type_and_nodes = accre_jobs_2020['nodelist'].apply(format_nodelist))
accre_jobs_2020.head()

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,...,state,nodelist,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core,cluster_type_and_nodes
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,...,COMPLETED,cn1531,0,0,65973.49,2748.895417,,122880.0,5120.0,"(cn, 1531)"
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,...,COMPLETED,cn1441,0,0,67181.12,2799.213333,,122880.0,5120.0,"(cn, 1441)"
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,...,COMPLETED,cn1464,0,0,69111.86,2879.660833,,122880.0,5120.0,"(cn, 1464)"
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,...,COMPLETED,cn1473,0,0,65317.33,2721.555417,,122880.0,5120.0,"(cn, 1473)"
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,...,COMPLETED,cn1440,0,0,65876.11,2744.837917,,122880.0,5120.0,"(cn, 1440)"


In [22]:
# Finally, Split the cluster_type_and_nodes column into 2 separate columns
accre_jobs_2020[['cluster_type', 'cluster_nodes']] = pd.DataFrame(accre_jobs_2020['cluster_type_and_nodes'].tolist(), index=accre_jobs_2020.index) 
accre_jobs_2020

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,...,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core,cluster_type_and_nodes,cluster_type,cluster_nodes
0,15925210,treviso,arabella,122880Mn,65973.49M,1188000,1188028,1,24,production,...,0,0,65973.49,2748.895417,,122880.0,5120.0,"(cn, 1531)",cn,1531
1,15861126,treviso,arabella,122880Mn,67181.12M,1188000,1090256,1,24,production,...,0,0,67181.12,2799.213333,,122880.0,5120.0,"(cn, 1441)",cn,1441
2,15861125,treviso,arabella,122880Mn,69111.86M,1188000,1188020,1,24,production,...,0,0,69111.86,2879.660833,,122880.0,5120.0,"(cn, 1464)",cn,1464
3,16251645,treviso,arabella,122880Mn,65317.33M,1188000,1050632,1,24,production,...,0,0,65317.33,2721.555417,,122880.0,5120.0,"(cn, 1473)",cn,1473
4,16251646,treviso,arabella,122880Mn,65876.11M,1188000,1188003,1,24,production,...,0,0,65876.11,2744.837917,,122880.0,5120.0,"(cn, 1440)",cn,1440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,32768Mn,23269.08M,86400,96,1,2,production,...,0,0,23269.08,11634.540000,,32768.0,16384.0,"(cn, 432)",cn,432
3816284,24173815_7,portabella,vennie,32768Mn,20879.81M,86400,97,1,2,production,...,0,0,20879.81,10439.905000,,32768.0,16384.0,"(cn, 440)",cn,440
3816285,24173815_8,portabella,vennie,32768Mn,30042.68M,86400,147,1,2,production,...,0,0,30042.68,15021.340000,,32768.0,16384.0,"(cn, 312)",cn,312
3816286,24173815_9,portabella,vennie,32768Mn,31067.75M,86400,147,1,2,production,...,0,0,31067.75,15533.875000,,32768.0,16384.0,"(cn, 312)",cn,312


In [23]:
# Checking rows with nodes > 150
accre_jobs_2020[accre_jobs_2020["nodes"] > 150]

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,...,exitcode_user,exitcode_error,used_mb_per_node,used_mb_per_core,reqmem_mc,reqmem_mn,reqmem_per_core,cluster_type_and_nodes,cluster_type,cluster_nodes
2781194,22266623,crowns,wilhelmina,5120Mc,54.75M,345600,78,156,1,production,...,0,0,54.75,0.350962,5120.0,,5120.0,"(cn, 313,324,331,332,334,335,336,354,361,365,3...",cn,"313,324,331,332,334,335,336,354,361,365,372,37..."
2781539,22272025,crowns,wilhelmina,5120Mc,9910.64M,345600,114568,163,1,production,...,1,0,9910.64,60.801472,5120.0,,5120.0,"(cn, 305,313,324,331,332,334,335,336,337,354,3...",cn,"305,313,324,331,332,334,335,336,337,354,361,36..."
3311027,23335998,grisette,acy,12288Mn,2.70M,3000,533,200,2,production,...,0,15,2.7,0.00675,,12288.0,30.72,"(cn, 304,315,316,317,318,319,320,322,323,324,3...",cn,"304,315,316,317,318,319,320,322,323,324,326,32..."
3329704,23352111,grisette,acy,12288Mn,0.46M,3000,3,200,2,production,...,0,0,0.46,0.00115,,12288.0,30.72,"(cn, 315,316,317,318,319,320,322,323,324,326,3...",cn,"315,316,317,318,319,320,322,323,324,326,327,32..."
3343504,23375092,grisette,acy,12288Mn,2.83M,3000,31,400,2,production,...,0,0,2.83,0.003538,,12288.0,15.36,"(cn, 301,302,303,305,306,307,308,309,311,312,3...",cn,"301,302,303,305,306,307,308,309,311,312,313,31..."
3343506,23375094,grisette,acy,12288Mn,0.62M,3000,24,500,2,production,...,0,0,0.62,0.00062,,12288.0,12.288,"(cn, 301,302,303,305,306,307,308,309,311,312,3...",cn,"301,302,303,305,306,307,308,309,311,312,313,31..."


In [24]:
# List of nodes for jobid == 22272025
accre_jobs_2020[accre_jobs_2020["jobid"] == "22272025"]["cluster_nodes"].values

array(['305,313,324,331,332,334,335,336,337,354,361,365,372,373,374,382,386,404,408,437,438,445,447,472,497,911,912,1085,1086,1088,1090,1092,1094,1095,1096,1121,1122,1123,1126,1127,1128,1129,1132,1201,1202,1203,1207,1208,1212,1213,1214,1215,1216,1221,1223,1224,1228,1241,1258,1260,1261,1262,1263,1269,1270,1278,1279,1281,1285,1286,1287,1291,1292,1299,1305,1306,1307,1308,1310,1313,1319,1321,1323,1325,1331,1332,1335,1336,1338,1339,1340,1341,1342,1343,1344,1345,1346,1347,1349,1350,1351,1352,1353,1354,1358,1359,1364,1367,1368,1369,1370,1376,1377,1380,1384,1385,1386,1387,1388,1389,1391,1392,1394,1395,1396,1397,1398,1420,1421,1423,1424,1425,1426,1427,1432,1433,1434,1435,1436,1437,1438,1442,1443,1451,1452,1472,1492,1494,1495,1497,1498,1499,1504,1506,1508,1516,1521,1524,1530,1531,1534,1535,1536'],
      dtype=object)

In [25]:
# Checking rows with cluster_Type == 'generic'
accre_jobs_2020[accre_jobs_2020["cluster_type"] == "generic"]["cluster_nodes"].value_counts()

vm-cms-sam-pri      45761
vm-cms-sam-sec      33390
eval-dell-01          381
capra1                 83
dougherty1             70
vm-qa-flatearth1        4
rocksteady              1
Name: cluster_nodes, dtype: int64

### Combine `state` `CANCELLED BY` into `CANCELLED`

In [26]:
# Before
accre_jobs_2020["state"].value_counts()

COMPLETED              3804625
CANCELLED                 6106
FAILED                    1404
CANCELLED by 686562        681
OUT_OF_MEMORY               94
TIMEOUT                     80
CANCELLED by 505355         63
CANCELLED by 855431         17
CANCELLED by 782611         13
CANCELLED by 9206            9
CANCELLED by 397600          7
CANCELLED by 200557          6
CANCELLED by 0               5
CANCELLED by 199766          4
CANCELLED by 486541          4
CANCELLED by 454080          4
CANCELLED by 9202            3
CANCELLED by 649319          3
CANCELLED by 9201            3
CANCELLED by 483348          3
CANCELLED by 124006          2
CANCELLED by 546080          2
CANCELLED by 666860          2
CANCELLED by 90423           2
CANCELLED by 791651          2
CANCELLED by 693461          2
CANCELLED by 895426          2
CANCELLED by 199066          2
CANCELLED by 515423          2
CANCELLED by 337422          1
CANCELLED by 785271          1
CANCELLED by 649321          1
CANCELLE

In [27]:
accre_jobs_2020.loc[accre_jobs_2020["state"].str.contains("CANCELLED by"), "state"] = 'CANCELLED'

In [28]:
# After
accre_jobs_2020["state"].value_counts()

COMPLETED        3804625
CANCELLED           6957
FAILED              1404
OUT_OF_MEMORY         94
TIMEOUT               80
Name: state, dtype: int64

### How many are production nodes? Non-production nodes?

In [29]:
accre_jobs_2020["partition"].value_counts()

production              3308915
nogpfs                   327432
sam                       79151
pascal                    48000
turing                    39377
debug                      6738
maxwell                    3347
cgw-capra1                   83
cgw-dougherty1               70
cgw-cqs1                     34
cgw-cqs3                      4
cgw-vm-qa-flatearth1          4
cgw-tbi01                     2
devel                         2
cgw-rocksteady                1
Name: partition, dtype: int64

### Delete some unneeded columns

We will not need the foloowing columns anymore:
- `reqmem`
- `usedmem`
- `exitcode`
- `nodelist`
- `used_mb_per_node`
- `reqmem_mc`
- `reqmem_mn`
- `cluster_type_and_nodes`

In [30]:
accre_jobs_2020 = accre_jobs_2020.drop(columns=["reqmem", "usedmem", "exitcode", "nodelist", "used_mb_per_node", "reqmem_mc", "reqmem_mn", "cluster_type_and_nodes"])
accre_jobs_2020

Unnamed: 0,jobid,account,user,reqtime,usedtime,nodes,cpus,partition,state,exitcode_user,exitcode_error,used_mb_per_core,reqmem_per_core,cluster_type,cluster_nodes
0,15925210,treviso,arabella,1188000,1188028,1,24,production,COMPLETED,0,0,2748.895417,5120.0,cn,1531
1,15861126,treviso,arabella,1188000,1090256,1,24,production,COMPLETED,0,0,2799.213333,5120.0,cn,1441
2,15861125,treviso,arabella,1188000,1188020,1,24,production,COMPLETED,0,0,2879.660833,5120.0,cn,1464
3,16251645,treviso,arabella,1188000,1050632,1,24,production,COMPLETED,0,0,2721.555417,5120.0,cn,1473
4,16251646,treviso,arabella,1188000,1188003,1,24,production,COMPLETED,0,0,2744.837917,5120.0,cn,1440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816283,24173815_6,portabella,vennie,86400,96,1,2,production,COMPLETED,0,0,11634.540000,16384.0,cn,432
3816284,24173815_7,portabella,vennie,86400,97,1,2,production,COMPLETED,0,0,10439.905000,16384.0,cn,440
3816285,24173815_8,portabella,vennie,86400,147,1,2,production,COMPLETED,0,0,15021.340000,16384.0,cn,312
3816286,24173815_9,portabella,vennie,86400,147,1,2,production,COMPLETED,0,0,15533.875000,16384.0,cn,312


### Fomat to correct data types

Let's now review the data types of the final columns

In [31]:
accre_jobs_2020['exitcode_error'].value_counts()

0      3811830
9          837
15         354
125         95
7           15
11          11
39           5
6            4
12           3
105          2
2            2
36           1
4            1
Name: exitcode_error, dtype: int64

In [32]:
accre_jobs_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3813160 entries, 0 to 3816287
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   jobid             object 
 1   account           object 
 2   user              object 
 3   reqtime           int64  
 4   usedtime          int64  
 5   nodes             int64  
 6   cpus              int64  
 7   partition         object 
 8   state             object 
 9   exitcode_user     object 
 10  exitcode_error    object 
 11  used_mb_per_core  float64
 12  reqmem_per_core   float64
 13  cluster_type      object 
 14  cluster_nodes     object 
dtypes: float64(2), int64(4), object(9)
memory usage: 625.5+ MB


Expected from accre_jobs_2020.info()?

- jobid => string (have to keep as string because some are in the array format)
- account => string
- user => string
- reqtime => int64
- usedtime => int64
- nodes => int64
- cpus => int64
- partition => string
- state => string
- exitcode_user => string
- exitcode_error => string
- used_mb_per_core => float64
- reqmem_per_core => float64
- cluster_type => string
- cluster_nodes => string

### Set aside the "production" partition only

We really only want to look at the "production" partition so rows with other partitions should be removed.

In [33]:
accre_jobs_2020_prod = accre_jobs_2020.loc[accre_jobs_2020["partition"] == "production"]
accre_jobs_2020_prod.head()

Unnamed: 0,jobid,account,user,reqtime,usedtime,nodes,cpus,partition,state,exitcode_user,exitcode_error,used_mb_per_core,reqmem_per_core,cluster_type,cluster_nodes
0,15925210,treviso,arabella,1188000,1188028,1,24,production,COMPLETED,0,0,2748.895417,5120.0,cn,1531
1,15861126,treviso,arabella,1188000,1090256,1,24,production,COMPLETED,0,0,2799.213333,5120.0,cn,1441
2,15861125,treviso,arabella,1188000,1188020,1,24,production,COMPLETED,0,0,2879.660833,5120.0,cn,1464
3,16251645,treviso,arabella,1188000,1050632,1,24,production,COMPLETED,0,0,2721.555417,5120.0,cn,1473
4,16251646,treviso,arabella,1188000,1188003,1,24,production,COMPLETED,0,0,2744.837917,5120.0,cn,1440


In [34]:
accre_jobs_2020_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3308915 entries, 0 to 3816287
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   jobid             object 
 1   account           object 
 2   user              object 
 3   reqtime           int64  
 4   usedtime          int64  
 5   nodes             int64  
 6   cpus              int64  
 7   partition         object 
 8   state             object 
 9   exitcode_user     object 
 10  exitcode_error    object 
 11  used_mb_per_core  float64
 12  reqmem_per_core   float64
 13  cluster_type      object 
 14  cluster_nodes     object 
dtypes: float64(2), int64(4), object(9)
memory usage: 403.9+ MB


## Some noticeable remarks on the dataset

### Why would some jobs have `usedmem` of 0 if they have `usedtime` > 0?

In [35]:
accre_jobs_2020_mem_zeros = accre_jobs_2020[(accre_jobs_2020['used_mb_per_core'] == 0) & (accre_jobs_2020['usedtime'] > 0)]
accre_jobs_2020_mem_zeros

Unnamed: 0,jobid,account,user,reqtime,usedtime,nodes,cpus,partition,state,exitcode_user,exitcode_error,used_mb_per_core,reqmem_per_core,cluster_type,cluster_nodes
4805,17072928,endive,bennett,432000,25,1,1,turing,COMPLETED,0,0,0.0,8192.0,gpu,0039
4806,17072929,endive,bennett,432000,34,1,1,turing,COMPLETED,0,0,0.0,8192.0,gpu,0039
4807,17072930,endive,bennett,432000,23,1,1,turing,COMPLETED,0,0,0.0,8192.0,gpu,0039
4809,17072932,endive,bennett,432000,65,1,1,turing,COMPLETED,0,0,0.0,8192.0,gpu,0039
4810,17072933,endive,bennett,432000,27,1,1,turing,COMPLETED,0,0,0.0,8192.0,gpu,0039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816245,24173761_4,portabella,vennie,86400,36,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1135
3816258,24173800_3,portabella,vennie,86400,37,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1133
3816259,24173800_4,portabella,vennie,86400,37,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1092
3816260,24173800_5,portabella,vennie,86400,37,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1135


In [36]:
accre_jobs_2020_mem_zeros[accre_jobs_2020_mem_zeros['jobid'].str.contains("_")]

Unnamed: 0,jobid,account,user,reqtime,usedtime,nodes,cpus,partition,state,exitcode_user,exitcode_error,used_mb_per_core,reqmem_per_core,cluster_type,cluster_nodes
13262,17105000_2,summer,buddy,2400,7,1,1,production,COMPLETED,0,0,0.0,20480.0,cn,1437
13545,17105967_0,sprite,asbury,86400,4,1,1,pascal,COMPLETED,0,0,0.0,16384.0,gpu,0017
13546,17105967_1,sprite,asbury,86400,34,1,1,pascal,COMPLETED,0,0,0.0,16384.0,gpu,0018
13547,17105967_2,sprite,asbury,86400,34,1,1,pascal,COMPLETED,0,0,0.0,16384.0,gpu,0018
13549,17105967_4,sprite,asbury,86400,30,1,1,pascal,COMPLETED,0,0,0.0,16384.0,gpu,0018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3816245,24173761_4,portabella,vennie,86400,36,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1135
3816258,24173800_3,portabella,vennie,86400,37,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1133
3816259,24173800_4,portabella,vennie,86400,37,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1092
3816260,24173800_5,portabella,vennie,86400,37,1,1,production,COMPLETED,0,0,0.0,16384.0,cn,1135


## Saving to final CSV

In [37]:
accre_jobs_2020.to_csv('../data/accre_jobs_2020_cleaned.csv', index = False)
accre_jobs_2020_with_zero_cpus.to_csv('../data/accre_jobs_2020_with_zero_cpus.csv', index = False)
accre_jobs_2020_prod.to_csv('../data/accre_jobs_2020_prod.csv', index = False)
accre_jobs_2020_mem_zeros.to_csv('../data/accre_jobs_2020_mem_zeros.csv', index = False)