In [31]:
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import statsmodels.formula.api as smf 

In [2]:
print(f'Current working directory is {os.getcwd()}')

Current working directory is /Users/drewrichard/Documents/1 projects/nss/nss_projects/accre-carbonara/notebooks


In [3]:
# Import data
jobs = pd.read_csv("../data/fullsample.csv",
                    # nrows = 10000
                    )
ce5 = pd.read_csv('../data/ce5_unresponsive.csv')
ce6 = pd.read_csv('../data/ce6_unresponsive.csv')
jobs = jobs[jobs['END'] != 'Unknown']

# Convert dates to datetime objects
jobs['END'] = pd.to_datetime(jobs['END'])
jobs['BEGIN'] = pd.to_datetime(jobs['BEGIN'])

# String manipulation of memory columns
jobs['USEDMEM'] = jobs['USEDMEM'].str[:-1]
jobs.insert(5, 'Mc_Mn', jobs['REQMEM'].str[-2:])
jobs['REQMEM']  = jobs['REQMEM'].str[:-2]

# Convert to numeric values for easier manipulation, and create difference col
jobs['USEDMEM'] = pd.to_numeric(jobs['USEDMEM'])
jobs['REQMEM'] = pd.to_numeric(jobs['REQMEM'])
jobs['DIFFMEM'] = jobs['REQMEM'] - jobs['USEDMEM']

# Create boolean column for status = COMPLETE
jobs.insert(2, 'COMPLETE', jobs['STATE'] == 'COMPLETED')
jobs['COMPLETE'] = jobs['COMPLETE'].astype(int)

# Create columns to assign failure, job counts, and concat into one df
ce56 = pd.concat([ce5, ce6])
jobs['FAILED'] = 0
jobs['JOBCOUNT'] = 1
ce56['JOBCOUNT'] = 0
ce56['FAILED'] = 1
jobs_ce56 = pd.concat([jobs, ce56])

# Eliminate milliseconds in desired datetime column 'END' 
jobs_ce56['END'] = pd.to_datetime(jobs_ce56['END']).dt.floor('s')
jobs_ce56.head(5)

Unnamed: 0,JOBID,STATE,COMPLETE,BEGIN,END,REQMEM,Mc_Mn,USEDMEM,REQTIME,USEDTIME,...,PARTITION,EXITCODE,DIFFMEM,FAILED,JOBCOUNT,USER,RETRY,TIME,RETURNCODE,COMMAND
1,30853133,COMPLETED,1.0,2021-08-06 11:36:09,2021-09-05 11:36:32,262144.0,Mn,20604.62,30-00:00:00,30-00:00:23,...,cgw-platypus,0:0,241539.38,0,1,,,,,
2,30858137,COMPLETED,1.0,2021-08-06 19:04:39,2021-09-05 19:04:53,204800.0,Mn,57553.77,30-00:00:00,30-00:00:14,...,cgw-tbi01,0:0,147246.23,0,1,,,,,
3,30935078,COMPLETED,1.0,2021-08-09 16:52:51,2021-09-07 20:52:55,65536.0,Mn,20577.96,29-04:00:00,29-04:00:04,...,cgw-platypus,0:0,44958.04,0,1,,,,,
4,31364111_2,COMPLETED,1.0,2021-08-17 07:45:07,2021-09-10 16:45:24,16384.0,Mn,9733.43,24-09:00:00,24-09:00:17,...,production,0:0,6650.57,0,1,,,,,
5,31364111_3,COMPLETED,1.0,2021-08-17 07:45:07,2021-09-06 16:17:34,16384.0,Mn,9708.04,24-09:00:00,20-08:32:27,...,production,0:0,6675.96,0,1,,,,,


In [4]:
ce56.shape

(3296, 8)

In [5]:
# Set the time window in minutes
x = 20
time_delta = pd.Timedelta(minutes=x)

# Extract rows where FAILED == 1
failed_rows = jobs_ce56[jobs_ce56['FAILED'] == 1][['END']].sort_values('END')

# Sort the main DataFrame by 'END'
jobs_ce56 = jobs_ce56.sort_values('END')

# Convert 'END' to numpy arrays for fast computation
end_times = jobs_ce56['END'].to_numpy()
failed_end_times = failed_rows['END'].to_numpy()

# Use NumPy searchsorted to find rows in the time range
start_idx = np.searchsorted(end_times, failed_end_times - time_delta, side='left')
end_idx = np.searchsorted(end_times, failed_end_times, side='right')

# Collect indices for all matching rows
matching_indices = np.concatenate([np.arange(start, end) for start, end in zip(start_idx, end_idx)])

# Extract the matching rows from the original DataFrame
result = jobs_ce56.iloc[np.unique(matching_indices)].copy()

# Identify missing indicators
result['group'] = result['FAILED']==1

# Shift the group assignment up by one
result['group'] = result['group'].shift(-1, fill_value=False).cumsum()

# Sum the USEDMEM values for each group
totalmem = result.groupby('group')['USEDMEM'].sum()

# Display result
display(result)

Unnamed: 0,JOBID,STATE,COMPLETE,BEGIN,END,REQMEM,Mc_Mn,USEDMEM,REQTIME,USEDTIME,...,EXITCODE,DIFFMEM,FAILED,JOBCOUNT,USER,RETRY,TIME,RETURNCODE,COMMAND,group
7054741,25041599,COMPLETED,1.0,2020-10-18 05:39:54,2020-10-18 05:57:39,61440.0,Mn,4450.38,5-00:00:00,00:17:45,...,0:0,56989.62,0,1,,,,,,0
7054510,25041077,COMPLETED,1.0,2020-10-18 05:38:00,2020-10-18 05:58:37,21880.0,Mn,1.26,2-00:00:00,00:20:37,...,0:0,21878.74,0,1,,,,,,0
7051604,25033847_909,COMPLETED,1.0,2020-10-17 20:53:43,2020-10-18 05:58:42,6144.0,Mn,1646.39,8-08:00:00,09:04:59,...,0:0,4497.61,0,1,,,,,,0
7054506,25041073,COMPLETED,1.0,2020-10-18 05:38:00,2020-10-18 05:58:44,21875.0,Mn,1.60,2-00:00:00,00:20:44,...,0:0,21873.40,0,1,,,,,,0
7054508,25041075,COMPLETED,1.0,2020-10-18 05:38:21,2020-10-18 05:59:09,21875.0,Mn,1.80,2-00:00:00,00:20:48,...,0:0,21873.20,0,1,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918975,32891997,COMPLETED,1.0,2021-10-01 19:38:34,2021-10-06 15:23:05,6144.0,Mn,243.17,5-00:00:00,4-19:44:31,...,0:0,5900.83,0,1,,,,,,3295
927022,32920902,COMPLETED,1.0,2021-10-06 15:23:36,2021-10-06 15:25:21,2000.0,Mn,513.23,2-00:00:00,00:01:45,...,0:0,1486.77,0,1,,,,,,3295
926285,32914842,COMPLETED,1.0,2021-10-04 16:16:53,2021-10-06 15:27:32,21875.0,Mn,0.09,2-00:00:00,1-23:10:39,...,0:0,21874.91,0,1,,,,,,3295
927026,32920924,COMPLETED,1.0,2021-10-06 15:30:36,2021-10-06 15:33:18,2000.0,Mn,392.43,2-00:00:00,00:02:42,...,0:0,1607.57,0,1,,,,,,3296


In [6]:
totalmem = pd.DataFrame(totalmem).reset_index()
totalmem

Unnamed: 0,group,USEDMEM
0,0,16292.69
1,1,184696.93
2,2,149351.37
3,3,1.63
4,4,174758.03
...,...,...
3292,3292,874523.56
3293,3293,433097.63
3294,3294,398128.59
3295,3295,2925.49


In [7]:
fig = px.scatter(totalmem,
           x=totalmem['group'],
           y=totalmem['USEDMEM'],
           opacity=0.25,
           labels={'group':'Group Number', 'USEDMEM':'Total Used Memory (MB/node)'},
           title='It is unlikely that memory use is the direct cause of Unresponsive indicator'
           )
# Calculate average y value
avg_y = totalmem['USEDMEM'].mean()

# Add horizontal line at average y
fig.add_hline(
    y=avg_y,
    line_dash="dot",
    line_color="skyblue",
    label=dict(
        text=f"average memory used={avg_y:.2f} MBpN",
        textposition='start',
        font=dict(size=10, color="RED"),
        yanchor="top",
    ),
)
# fig.add_vrect(x0=20000, x1=40000, 
#               annotation_text="Fairly little memory used", annotation_position="top left",
#               fillcolor="green", opacity=0.25, line_width=0)

In [8]:
px.histogram(totalmem['USEDMEM'], nbins=200)

In [9]:
nodes = result.value_counts('Mc_Mn')
display(nodes)

Mc_Mn
Mn    394305
Mc    102507
Name: count, dtype: int64

In [10]:
totaldiffmem = result.groupby('group')['DIFFMEM'].sum()
totaldiffmem = pd.DataFrame(totaldiffmem).reset_index()
totaldiffmem

Unnamed: 0,group,DIFFMEM
0,0,1032299.31
1,1,1332866.07
2,2,966108.63
3,3,21876.37
4,4,1429049.97
...,...,...
3292,3292,3634208.44
3293,3293,414842.37
3294,3294,388274.41
3295,3295,61909.51


In [11]:
fig2 = px.scatter(totaldiffmem,
           x=totaldiffmem['group'],
           y=totaldiffmem['DIFFMEM'],
           opacity=0.25,
           labels={'group':'Group Number', 'DIFFMEM':'Difference in memory (MB/node)'},
           #title='It is unlikely that memory use is the direct cause of Unresponsive indicator'
           )
# Calculate average y value
avg_y = totaldiffmem['DIFFMEM'].mean()

# Add horizontal line at average y
fig2.add_hline(
    y=avg_y,
    line_dash="dot",
    line_color="skyblue",
    label=dict(
        text=f"average difference in memory used={avg_y:.2f} MBpN",
        textposition='start',
        font=dict(size=10, color="red"),
        yanchor="top",
    ),
)
# fig2.add_vrect(x0=20000, x1=40000, 
#               annotation_text='little difference in<br>memory used', annotation_position="top left",
#               fillcolor="green", opacity=0.25, line_width=0)

In [12]:
jobcounts = result.groupby('group')['JOBCOUNT'].sum()
jobcounts = pd.DataFrame(jobcounts).reset_index()
jobcounts

Unnamed: 0,group,JOBCOUNT
0,0,58
1,1,79
2,2,70
3,3,1
4,4,79
...,...,...
3292,3292,154
3293,3293,202
3294,3294,235
3295,3295,21


In [13]:
fig3 = px.scatter(jobcounts,
           x=jobcounts['group'],
           y=jobcounts['JOBCOUNT'],
           opacity=0.25,
           labels={'group':'Group Number', 'JOBCOUNT':'number of jobs'},
           title='It is unlikely that number of jobs is the direct cause of Unresponsive indicator'
           )
# Calculate average y value
avg_y = jobcounts['JOBCOUNT'].mean()

# Add horizontal line at average y
fig3.add_hline(
    y=avg_y,
    line_dash="dot",
    line_color="skyblue",
    label=dict(
        text=f"average number of jobs={avg_y:.2f}",
        textposition='start',
        font=dict(size=10, color="red"),
        yanchor="top",
    ),
)
# fig3.add_vrect(x0=20081, x1=40285, 
#               annotation_text="Very few jobs running", annotation_position="top left",
#               fillcolor="green", opacity=0.25, line_width=0)

In [14]:
# Sum Nodes for each group
nodesum = result.groupby('group')['NODES'].sum()
nodesum = pd.DataFrame(nodesum).reset_index()

# Plot sum of nodes used in 20min before failure
fig4 = px.scatter(nodesum,
           x=nodesum['group'],
           y=nodesum['NODES'],
           opacity=0.25,
           labels={'group':'Group Number', 'NODES':'Nodes used'},
           #title='It is unlikely that memory use is the direct cause of Unresponsive indicator'
           )
# Calculate average y value
avg_y = nodesum['NODES'].mean()

# Add horizontal line at average y
fig4.add_hline(
    y=avg_y,
    line_dash="dot",
    line_color="skyblue",
    label=dict(
        text=f"average number of nodes used={avg_y:.2f}",
        textposition='start',
        font=dict(size=10, color="RED"),
        yanchor="top",
    ),
)
# fig4.add_vrect(x0=20000, x1=40000, 
#               annotation_text="Fairly few nodes used", annotation_position="top left",
#               fillcolor="green", opacity=0.25, line_width=0)

In [15]:
# Sum the CPU values for each group
cpusum = result.groupby('group')['CPUS'].sum()
cpusum = pd.DataFrame(cpusum).reset_index()

# Plot total CPUs used in 20min before failure
fig5 = px.scatter(cpusum,
           x=cpusum['group'],
           y=cpusum['CPUS'],
           opacity=0.25,
           labels={'group':'Group Number', 'NODES':'Nodes used'},
           #title='It is unlikely that memory use is the direct cause of Unresponsive indicator'
           )
# Calculate average y value
avg_y = cpusum['CPUS'].mean()

# Add horizontal line at average y
fig5.add_hline(
    y=avg_y,
    line_dash="dot",
    line_color="skyblue",
    label=dict(
        text=f"average number of CPUs used={avg_y:.2f}",
        textposition='start',
        font=dict(size=10, color="RED"),
        yanchor="top",
    ),
)
# fig5.add_vrect(x0=20000, x1=40000, 
#               annotation_text="Fairly few CPUs used", annotation_position="top left",
#               fillcolor="green", opacity=0.25, line_width=0)

In [16]:
fig_multi = make_subplots(rows=5, cols=1, shared_xaxes=True)

fig_multi.add_trace(
    go.Scatter(x=totalmem['group'],
               y=totalmem['USEDMEM'],
               name='USEDMEM',
               opacity=0.5
    ),
    row=1,
    col=1
)

fig_multi.add_trace(
    go.Scatter(x=totaldiffmem['group'],
               y=totaldiffmem['DIFFMEM'],
               name='DIFFMEM',
               opacity=0.5
    ),
    row=2,
    col=1
)

fig_multi.add_trace(
    go.Scatter(x=jobcounts['group'],
               y=jobcounts['JOBCOUNT'],
               name='JOBCOUNT',
               opacity=0.5
    ),
    row=3,
    col=1
)

fig_multi.add_trace(
    go.Scatter(x=nodesum['group'],
               y=nodesum['NODES'],
               opacity=0.5,
               name='NODES',
    ),
    row=4,
    col=1
)

fig_multi.add_trace(
    go.Scatter(x=cpusum['group'],
               y=cpusum['CPUS'],
               opacity=0.5,
               name='CPUs',
    ),
    row=5,
    col=1
)

# Update xaxis properties
fig_multi.update_xaxes(title_text="Group Number")

# Update yaxis properties
fig_multi.update_yaxes(
    title_text="Mem Used", 
    row=1, col=1, 
    title_standoff=20,
    title_font=dict(size=20, family='Arial'),
)

fig_multi.update_yaxes(
    title_text="Diff in Mem", 
    row=2, col=1, 
    title_standoff=20,
    title_font=dict(size=20, family='Arial'),
)

fig_multi.update_yaxes(
    title_text="# Jobs", 
    row=3, col=1, 
    title_standoff=20,
    title_font=dict(size=20, family='Arial'),
)

fig_multi.update_yaxes(
    title_text="# Nodes", 
    row=4, col=1, 
    title_standoff=20,
    title_font=dict(size=20, family='Arial'),
)

fig_multi.update_yaxes(
    title_text="# CPUs", 
    row=5, col=1, 
    title_standoff=20,
    title_font=dict(size=20, family='Arial'),
)

# Highligth notable zone of minimal activity where failure still reported
# fig_multi.add_vrect(
#     x0=20000, x1=40000, 
#     annotation_text="Notable Zone", 
#     annotation_position="top left",
#     fillcolor="green", 
#     opacity=0.25, 
#     line_width=0
# )

fig_multi.update_layout(title_text="Memory, Job number, Node & CPU use unlikely to directly cause ACCRE to become unresponsive", 
                showlegend=True,
                margin=dict(l=100, r=50, t=100, b=100),
                height=1000)
fig_multi.show()

In [17]:
funzone = result[(result['group']>=20000) & (result['group']<=40000)].sort_values(['END'])
display(funzone.head(3), funzone.tail(3))
funzone['STATE'].value_counts()

Unnamed: 0,JOBID,STATE,COMPLETE,BEGIN,END,REQMEM,Mc_Mn,USEDMEM,REQTIME,USEDTIME,...,EXITCODE,DIFFMEM,FAILED,JOBCOUNT,USER,RETRY,TIME,RETURNCODE,COMMAND,group


Unnamed: 0,JOBID,STATE,COMPLETE,BEGIN,END,REQMEM,Mc_Mn,USEDMEM,REQTIME,USEDTIME,...,EXITCODE,DIFFMEM,FAILED,JOBCOUNT,USER,RETRY,TIME,RETURNCODE,COMMAND,group


Series([], Name: count, dtype: int64)

In [27]:
logistic_df = pd.DataFrame({'GROUP':totalmem['group'], 'USEDMEM':totalmem['USEDMEM'], 'DIFFMEM':totaldiffmem['DIFFMEM'], 'JOBCOUNT':jobcounts['JOBCOUNT'], 'NODES':nodesum['NODES'], 'CPUs':cpusum['CPUS']})

In [30]:
logistic_df['FAILED'] = 1
logistic_df

Unnamed: 0,GROUP,USEDMEM,DIFFMEM,JOBCOUNT,NODES,CPUs,FAILED
0,0,16292.69,1032299.31,58,58.0,330.0,1
1,1,184696.93,1332866.07,79,79.0,377.0,1
2,2,149351.37,966108.63,70,70.0,486.0,1
3,3,1.63,21876.37,1,1.0,8.0,1
4,4,174758.03,1429049.97,79,79.0,462.0,1
...,...,...,...,...,...,...,...
3292,3292,874523.56,3634208.44,154,154.0,488.0,1
3293,3293,433097.63,414842.37,202,202.0,224.0,1
3294,3294,398128.59,388274.41,235,235.0,298.0,1
3295,3295,2925.49,61909.51,21,21.0,28.0,1


In [None]:
# logit_model = smf.logit('FAILED ~ ')

3297

randomly select 20minute blocks to incude nonfailures 

In [None]:
dates = result[result['FAILED']==1]['END']
dates

0      2020-10-18 06:16:25
1      2020-10-18 06:38:44
0      2020-10-18 06:53:44
1      2020-10-18 06:54:04
2      2020-10-18 07:47:25
               ...        
1807   2021-09-24 18:14:35
1808   2021-09-24 19:13:14
1809   2021-10-02 08:14:16
1810   2021-10-02 18:29:08
1484   2021-10-06 15:39:20
Name: END, Length: 3296, dtype: datetime64[ns]

ValueError: Length mismatch: Expected 3297 rows, received array of length 3296