In [1]:
import pandas as pd
from datetime import datetime as dt
import numpy as np
import re

In [2]:
# Load the raw slurm wrapper log into a DataFrame.
# The file has no header row, and fields are separated by ' - '. A separator
# longer than one character needs the python parsing engine.
# For a quick partial load while experimenting, add e.g. nrows=10000.
ce6 = pd.read_csv(
    '../data/slurm_wrapper_ce6.log',
    sep=' - ',
    header=None,
    engine='python',
)

In [3]:
# set column names
colnames = ['END', 'USER', 'RETRY', 'TIME', 'RETURNCODE', 'COMMAND']
ce6.columns = colnames

from carb_funcs import optional_ms

# Parse the END column with the project helper optional_ms (defined in
# carb_funcs, not shown here; presumably it accepts timestamps with or without
# a fractional-seconds part -- confirm against the helper).
ce6['END'] = optional_ms(ce6['END'])

# The numeric fields appear to carry their own label as text (e.g. 'user',
# 'time'); remove that label and cast what remains to the listed type.
numeric_fields = {
    'TIME': ('time', float),
    'USER': ('user', int),
    'RETRY': ('retry', int),
    'RETURNCODE': ('returncode', float),
}
for column, (label, cast) in numeric_fields.items():
    ce6[column] = ce6[column].str.replace(label, '', regex=False).astype(cast)

# Strip only the leading 'command' label. Removing every occurrence of the
# word would also corrupt any path or argument that happens to contain it.
command_text = ce6['COMMAND'].str.replace(r'^\s*command', '', regex=True)

# Drop quotes, square brackets and spaces, then split on commas so that each
# row becomes a list of argument strings.
strip_chars = re.compile(r"['\[\] ]")
ce6['COMMAND'] = [strip_chars.sub('', row).split(',') for row in command_text]

# Display the updated DataFrame
ce6.head(2)

Unnamed: 0,END,USER,RETRY,TIME,RETURNCODE,COMMAND
0,2020-10-16 10:37:44.163454,9202,0,0.084954,0.0,"[/usr/bin/scontrol, show, job, 24997187]"
1,2020-10-16 10:37:44.206654,9202,0,0.089431,0.0,"[/usr/bin/scontrol, show, job, 24997190]"


In [4]:
# Sanity check that every row has a timestamp: sort_values() places missing
# values last by default, so the final element would be NaT if any END value
# were missing. Otherwise this prints the latest timestamp in the log.
print(
    ce6['END']
    .sort_values()
    .iat[-1]
)

2021-10-07 22:00:20.919558


In [None]:
# Slice out the records that satisfy the "unresponsive" conditions:
# USER 9204, RETURNCODE 1.0 and TIME above 15 ...
ce6_filtered = ce6.loc[
    ce6['USER'].eq(9204)
    & ce6['RETURNCODE'].eq(1.0)
    & ce6['TIME'].gt(15)
]

# ... and, of those, only rows where some element of the COMMAND list
# contains the substring 'sbatch'.
ce6_unresponsive_df = ce6_filtered[
    ce6_filtered['COMMAND'].map(
        lambda args: any('sbatch' in arg for arg in args)
    )
]
display(ce6_unresponsive_df.head())

Unnamed: 0,END,USER,RETRY,TIME,RETURNCODE,COMMAND
36913,2020-10-18 06:16:25.392946,9204,0,20.037672,1.0,"[/usr/bin/sbatch, /tmp/condor_g_scratch.0x5572..."
37605,2020-10-18 06:38:44.172473,9204,0,20.038736,1.0,"[/usr/bin/sbatch, /tmp/condor_g_scratch.0x5572..."
39075,2020-10-18 07:47:32.241050,9204,0,20.018348,1.0,"[/usr/bin/sbatch, /tmp/condor_g_scratch.0x5572..."
39356,2020-10-18 08:08:49.366063,9204,0,20.030497,1.0,"[/usr/bin/sbatch, /tmp/condor_g_scratch.0x5572..."
40072,2020-10-18 08:57:22.419398,9204,0,20.038508,1.0,"[/usr/bin/sbatch, /tmp/condor_g_scratch.0x5572..."


In [8]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() emitted a stray "None". Call it on its own, then print the shape.
ce6_unresponsive_df.info()
print(ce6_unresponsive_df.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 1485 entries, 36913 to 4766868
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   END         1485 non-null   datetime64[ns]
 1   USER        1485 non-null   int64         
 2   RETRY       1485 non-null   int64         
 3   TIME        1485 non-null   float64       
 4   RETURNCODE  1485 non-null   float64       
 5   COMMAND     1485 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 81.2+ KB
None (1485, 6)


In [6]:
# Persist the unresponsive records without the row index.
# Note: COMMAND holds Python lists, which CSV stores as their string
# representation, so that column must be re-parsed when the file is read back.
ce6_unresponsive_df.to_csv(
    path_or_buf='../data/ce6_unresponsive.csv',
    index=False,
)