Next, use the two log files to determine the time intervals (hours) when the scheduler was unresponsive. Do this by looking for records of the "sbatch" command from user 9204 that have a return code of 1 and an execution time of ~20 seconds (at least more than 15). These are commands for which the scheduler timed out while responding.

3. Calculate some descriptive statistics about how often the scheduler was unresponsive, how long these periods of time were, and create a time series plot of when the scheduler was having difficulties.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [None]:
# Read in both slurm wrapper logs, flag the scheduler test records in each,
# and combine them into one frame indexed and sorted by timestamp.
files = ['data/slurm_wrapper_ce6.log', 'data/slurm_wrapper_ce5.log']
column_names = ['Timestamp',
                'User',
                'Retry Attempt',
                'Execution Time',
                'Returncode',
                'Command']

frames = []
for value in files:
    # Read the file named by the loop variable so that each log is loaded
    # exactly once. The fields are separated by the literal ' - ', and a
    # multi-character separator needs the python parsing engine.
    df = pd.read_csv(value,
                     sep = ' - ',
                     names = column_names,
                     engine = 'python')

    # Strip the 'time ' label so the execution time can be compared numerically.
    df['Execution Time'] = (
        df['Execution Time'].replace('time ', '', regex = True).astype('float64')
    )
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format = '%Y-%m-%d %H:%M:%S.%f')

    # A test record is an sbatch command issued by user 9204; a row without
    # a command is treated as not being a test.
    is_test = (
        (df['User'] == 'user 9204')
        & df['Command'].str.contains('sbatch', na = False)
    )
    # A test is unresponsive when it returned code 1 with an execution time above 15.
    df['Unresponsive'] = (
        is_test
        & (df['Returncode'] == 'returncode 1')
        & (df['Execution Time'] > 15)
    )
    df['Testing'] = is_test
    frames.append(df)

# Stable sort keeps the file order of records that share a timestamp.
all_dfs = (
    pd.concat(objs = frames, axis = 0)
    .set_index('Timestamp')
    .sort_index(axis = 0, kind = 'mergesort')
)

In [None]:
# Overall share of scheduler tests that came back unresponsive.
unresponsive_flags = all_dfs['Unresponsive']
testing_flags = all_dfs['Testing']

# Keep only the rows where each flag is set; their lengths are the counts.
unresponsive_series = unresponsive_flags[unresponsive_flags]
testing_series = testing_flags[testing_flags]

unresponsive_total = len(unresponsive_series)
testing_total = len(testing_series)
overall_unresponsive_percent = round(unresponsive_total / testing_total, 9)
print("Scheduler was unresponsive for {:.3%} of all tests.".format(overall_unresponsive_percent))

In [None]:
# Hourly totals of test submissions and unresponsive results.
relevant_columns = ['Testing', 'Unresponsive']

# One pass over the hourly bins sums both flag columns at once; resetting
# the index turns the bin start back into a regular 'Timestamp' column.
hourly_bins = all_dfs.groupby(pd.Grouper(freq = 'H'))
hour_dfs = hourly_bins[relevant_columns].sum().reset_index()

# Share of each hour's tests that were unresponsive.
hour_dfs['Unresponsive_Proportion'] = hour_dfs['Unresponsive'].div(hour_dfs['Testing'])

In [None]:
# Resample responsiveness counts by day, week and month.

def _responsiveness_by(frequency):
    """Return per-period responsiveness totals built from ``hour_dfs``.

    ``frequency`` is a pandas offset alias ('D', 'W', 'M'). The result is
    indexed by period and holds the summed 'Unresponsive' and 'Testing'
    counts, the proportion of tests in the period that were unresponsive,
    and two min-max columns initialised to 0.0 (filled in when plotting).
    """
    # Columns are selected with a list: subsetting a resampler with a bare
    # tuple of names is not supported by current pandas.
    totals = hour_dfs.resample(frequency, on = 'Timestamp')[['Unresponsive', 'Testing']].sum()
    # Recompute the proportion from the period totals. Adding up the hourly
    # proportions would not give the share of the period's tests that were
    # unresponsive. Periods without any tests come out as NaN.
    totals['Unresponsive_Proportion'] = totals['Unresponsive'] / totals['Testing']
    totals['UnresponsiveMinMax'] = 0.0
    totals['UnresponsiveProportionMinMax'] = 0.0
    return totals[['Unresponsive',
                   'Unresponsive_Proportion',
                   'Testing',
                   'UnresponsiveMinMax',
                   'UnresponsiveProportionMinMax']]

responsiveness_by_day = _responsiveness_by('D')
responsiveness_by_week = _responsiveness_by('W')
responsiveness_by_month = _responsiveness_by('M')

list_of_dfs = [responsiveness_by_day,
               responsiveness_by_week,
               responsiveness_by_month]

In [None]:
# Draw one line plot per resampling period (day, week, month) showing the
# min-max scaled unresponsive counts and unresponsive proportions over time.
# Possibly reduce xtick number.
facecolor = 'white'
fig = plt.figure(figsize = (15, 10), dpi = 400)
axes = fig.subplots(nrows = 3)
plt.subplots_adjust(hspace = 0.5)

x_labels = ['Day', 'Week', 'Month']
y_labels = ['Relative UC Per Day',
            'Relative UC Per Week',
            'Relative UC Per Month']

for ax, period_df, x_label, y_label in zip(axes, list_of_dfs, x_labels, y_labels):
    counts = period_df['Unresponsive']
    proportions = period_df['Unresponsive_Proportion']

    # Rescale both series to the 0-1 range so they can share one y-axis.
    period_df['UnresponsiveMinMax'] = (
        (counts - counts.min()) / (counts.max() - counts.min())
    )
    period_df['UnresponsiveProportionMinMax'] = (
        (proportions - proportions.min()) / (proportions.max() - proportions.min())
    )

    ax.plot(period_df.index,
            period_df['UnresponsiveMinMax'],
            linewidth = 2,
            color = 'red')
    ax.plot(period_df.index,
            period_df['UnresponsiveProportionMinMax'],
            linewidth = 2,
            color = 'dodgerblue',
            alpha = 0.5)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%D'))
    ax.legend(['Counts','Proportion of Testing'])
    plt.setp(ax.get_xticklabels(), rotation = 90, fontsize = 8)
    ax.set_ylim(-0.05,1.05)

# Tick placement differs per panel: fortnightly major ticks for the daily and
# weekly panels, and one tick per month-end for the monthly panel.
day_ax, week_ax, month_ax = axes
day_ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval = 2))
day_ax.xaxis.set_minor_locator(mdates.WeekdayLocator(interval = 1))
week_ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval = 2))
month_ax.set_xticks(responsiveness_by_month.index)

day_ax.set_title('Unresponsiveness Counts and Proportion Over Time');

In [None]:
# Plot histograms of the unresponsive and testing counts per day for
# December 2020 through January 2021 (the window selected by `criteria`).
facecolor = 'white'
fig = plt.figure(figsize = (15, 5),
                 dpi = 400)

# Daily rows from 2020-12-01 up to and including 2021-01-31.
criteria = (
    (responsiveness_by_day.index >= '2020-12-01') &
    (responsiveness_by_day.index <= '2021-1-31')
)

colors = ['orange',
          'blue']

for column, color in zip(['Unresponsive', 'Testing'], colors):
    daily_counts = responsiveness_by_day[column][criteria]
    plt.hist(daily_counts,
             bins = 20,
             color = color,
             edgecolor = 'black',
             linewidth = 1.2)

plt.legend(['UC','TC'])
plt.xlabel('UC/TC')
plt.ylabel('Counts')
# The title names the months that `criteria` actually selects.
plt.title('Unresponsiveness and Testing Counts per Day during December 2020 and January 2021');

In [None]:
# Create a dataframe of testing and unresponsive counts summed over
# three-minute bins, then collect every run of two or more consecutive bins
# that each contain at least one unresponsive result.
# '3min' is the same three-minute frequency as the older '3T' alias.
concat_dfs_2 = (
    all_dfs.groupby(pd.Grouper(freq = '3min'))[relevant_columns]
    .sum()
    .reset_index()
)

bin_starts = concat_dfs_2['Timestamp'].tolist()
bin_counts = concat_dfs_2['Unresponsive'].tolist()
bin_width = pd.Timedelta(minutes = 3)

# For each run: start of its first bin, start of the first empty bin after
# it, and the total number of unresponsive results it contains.
begin = []
end = []
sum_runs = []

run_start = None  # position of the first bin of the run currently open
# The trailing 0 is a sentinel that closes a run still open at the last bin,
# so `begin`, `end` and `sum_runs` always end up with the same length.
for position, count in enumerate(bin_counts + [0]):
    if count != 0:
        if run_start is None:
            run_start = position
        continue
    # An isolated single bin is not a run of consecutive bins; skip it.
    if run_start is not None and position - run_start >= 2:
        begin.append(bin_starts[run_start])
        if position < len(bin_starts):
            end.append(bin_starts[position])
        else:
            # The run reached the final bin: it ends one bin width later.
            end.append(bin_starts[-1] + bin_width)
        sum_runs.append(sum(bin_counts[run_start:position]))
    run_start = None

In [None]:
# Show intervals dataframe.
consecutive = pd.DataFrame({'Begin': begin,
                            'End': end,
                            'Number of Unresponsive Counts': sum_runs})

# Length of each interval in whole minutes. total_seconds() covers the full
# duration; the `.seconds` attribute leaves out the days component, so an
# interval of a day or more would be under-reported.
interval = pd.to_timedelta(consecutive['End'] - consecutive['Begin'])
consecutive['Interval in Minutes'] = (interval.dt.total_seconds() / 60).astype('int64')

# Possibly extend initial background color to begin and end columns.
cm = sns.color_palette('rocket', as_cmap = True)
# Display the first 41 intervals, shading both numeric columns by value.
consecutive[0:41][['Number of Unresponsive Counts','Interval in Minutes']].style.background_gradient(cmap = cm)