In [1]:
import pandas as pd
from datetime import datetime as dt
import numpy as np
import re

In [2]:
# Read at most the first million rows of the job sample, then preview the top
# five rows as the cell's displayed result.
jobs = pd.read_csv(
    "../data/fullsample.csv",
    nrows=1_000_000,
)
jobs.head()
# Companion files that are currently not loaded:
# ce5_unresponsive = pd.read_csv('../data/ce5_unresponsive.csv')
# ce6_unresponsive = pd.read_csv('../data/ce6_unresponsive.csv')

Unnamed: 0,JOBID,STATE,BEGIN,END,REQMEM,USEDMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE
0,30616928,RUNNING,2021-07-31T22:15:00,Unknown,2048Mn,0,10:04:00,67-22:14:22,1,1,production,0:0
1,30853133,COMPLETED,2021-08-06T11:36:09,2021-09-05T11:36:32,262144Mn,20604.62M,30-00:00:00,30-00:00:23,1,1,cgw-platypus,0:0
2,30858137,COMPLETED,2021-08-06T19:04:39,2021-09-05T19:04:53,204800Mn,57553.77M,30-00:00:00,30-00:00:14,1,32,cgw-tbi01,0:0
3,30935078,COMPLETED,2021-08-09T16:52:51,2021-09-07T20:52:55,65536Mn,20577.96M,29-04:00:00,29-04:00:04,1,8,cgw-platypus,0:0
4,31364111_2,COMPLETED,2021-08-17T07:45:07,2021-09-10T16:45:24,16384Mn,9733.43M,24-09:00:00,24-09:00:17,1,1,production,0:0


In [3]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   JOBID      1000000 non-null  object
 1   STATE      1000000 non-null  object
 2   BEGIN      1000000 non-null  object
 3   END        1000000 non-null  object
 4   REQMEM     1000000 non-null  object
 5   USEDMEM    1000000 non-null  object
 6   REQTIME    1000000 non-null  object
 7   USEDTIME   1000000 non-null  object
 8   NODES      1000000 non-null  int64 
 9   CPUS       1000000 non-null  int64 
 10  PARTITION  1000000 non-null  object
 11  EXITCODE   1000000 non-null  object
dtypes: int64(2), object(10)
memory usage: 91.6+ MB


In [4]:
# Keep only rows whose END is a real timestamp; rows carrying the literal
# placeholder 'Unknown' (e.g. jobs still in the RUNNING state) are dropped.
# The explicit .copy() makes `jobs` an independent frame, so the column
# assignments in later cells modify it directly instead of operating on a
# slice of the loaded data and raising SettingWithCopyWarning.
jobs = jobs.loc[jobs['END'] != 'Unknown'].copy()

In [None]:
# Converts the BEGIN and END columns from timestamp strings to datetimes.
def _parse_timestamp(stamp):
    """Parse 'YYYY-MM-DDTHH:MM:SS' (optionally with '.ffffff') into a datetime.

    A zero fraction is appended when the string has none, so that a single
    strptime format covers both forms.

    Raises:
        ValueError: if the string does not match the expected layout.
    """
    stamp = stamp.replace('T', ' ')
    if '.' not in stamp:
        stamp += '.000000'
    return dt.strptime(stamp, '%Y-%m-%d %H:%M:%S.%f')


# Both columns are converted. END must no longer contain the 'Unknown'
# placeholder at this point (those rows are filtered out in an earlier cell),
# otherwise strptime raises ValueError.
for col in ('BEGIN', 'END'):
    jobs[col] = [_parse_timestamp(stamp) for stamp in jobs[col]]

In [6]:
# Converts USEDMEM from strings such as '20604.62M' to a numeric column.
# Only a trailing unit letter is removed: slicing off the last character
# unconditionally would corrupt values that carry no suffix (the raw data
# contains a bare '0'), turning '0' into '' and e.g. '10' into '1'.
# NOTE(review): a suffix other than 'M' is stripped without any unit
# conversion, exactly as before -- confirm the column is always in megabytes.
usedmem_digits = jobs['USEDMEM'].str.replace(r'[A-Za-z]$', '', regex=True)
jobs['USEDMEM'] = pd.to_numeric(usedmem_digits)

In [7]:
# Splits REQMEM (e.g. '2048Mn') into its two-character suffix and its number.
reqmem_raw = jobs['REQMEM']
# The suffix goes into a new 'Mc_Mn' column at position 5 ...
jobs.insert(5, 'Mc_Mn', reqmem_raw.str.slice(start=-2))
# ... and everything before it becomes the numeric REQMEM value.
jobs['REQMEM'] = pd.to_numeric(reqmem_raw.str.slice(stop=-2))

In [8]:
# DIFFMEM = requested memory minus used memory, stored at column position 7.
diffmem = jobs['REQMEM'].sub(jobs['USEDMEM'])
jobs.insert(loc=7, column='DIFFMEM', value=diffmem)

In [9]:
# Adds COMPLETE at column position 2: 1 when STATE is 'COMPLETED', else 0.
# The integer encoding is intended as the target for logistic regression.
jobs.insert(
    loc=2,
    column='COMPLETE',
    value=jobs['STATE'].eq('COMPLETED').astype(int),
)

In [10]:
# First five jobs whose COMPLETE flag is not 1.
incomplete = jobs[jobs['COMPLETE'].ne(1)].head()

# Mean used memory for each (NODES, EXITCODE, STATE) combination.
USEDMEM_pivot = pd.pivot_table(
    jobs,
    values='USEDMEM',
    index=['NODES', 'EXITCODE', 'STATE'],
    aggfunc='mean',
)

# Mean requested memory for each (Mc_Mn, EXITCODE, STATE) combination.
REQMEM_pivot = pd.pivot_table(
    jobs,
    values='REQMEM',
    index=['Mc_Mn', 'EXITCODE', 'STATE'],
    aggfunc='mean',
)

The `EXITCODE` column gives the job's exit code, with "0:0" indicating a successful job. Each exit code consists of two numbers separated by a colon: a non-zero first number indicates a problem on the server side, and a non-zero second number indicates a problem on the user side.

In [11]:
# Summary statistics of DIFFMEM for every (STATE, PARTITION, EXITCODE) group.
by_state_partition_exit = jobs.groupby(['STATE', 'PARTITION', 'EXITCODE'])
diff_mem_group = by_state_partition_exit['DIFFMEM'].describe()

# Disabled: month-of-completion column (uses the .dt accessor on END).
# jobs['MONTH'] = jobs['END'].dt.month

# Displayed result of the cell: mean USEDMEM for each JOBID.
usedmem_by_job = jobs.groupby('JOBID')['USEDMEM']
usedmem_by_job.mean()

JOBID
30284664        56500.83
30387199            0.06
30503582_484    25031.82
30503586_868    13656.57
30504814_323    12643.80
                  ...   
32923970          343.00
32923977            0.09
32923987            0.09
32923988            0.09
32923989          393.07
Name: USEDMEM, Length: 999349, dtype: float64

In [12]:
# Show every row whose JOBID contains the given fragment. The match is a
# substring search, so IDs that merely embed the digits also match.
job_id_fragment = '30853133'
jobs.loc[jobs['JOBID'].str.contains(job_id_fragment)]

Unnamed: 0,JOBID,STATE,COMPLETE,BEGIN,END,REQMEM,Mc_Mn,USEDMEM,DIFFMEM,REQTIME,USEDTIME,NODES,CPUS,PARTITION,EXITCODE
1,30853133,COMPLETED,1,2021-08-06 11:36:09,2021-09-05 11:36:32,262144,Mn,20604.62,241539.38,30-00:00:00,30-00:00:23,1,1,cgw-platypus,0:0
