Next, use the two log files to determine the time intervals (hours) when the scheduler was unresponsive. Do this by looking for records of "sbatch" commands issued by user 9204 that have return code 1 and an execution time of roughly 20 seconds (anything above 15 seconds). These are commands where the scheduler timed out while responding.

3. Calculate some descriptive statistics about how often the scheduler was unresponsive, how long these periods of time were, and create a time series plot of when the scheduler was having difficulties.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Polygon
import datetime as dt
import seaborn as sns
from sqlalchemy import *

In [None]:
# Read both slurm wrapper logs, derive per-record flags, and combine them
# into one frame (`all_dfs`) indexed and sorted by timestamp.
files = ['data/slurm_wrapper_ce6.log', 'data/slurm_wrapper_ce5.log']
# Server label for each entry of `files`, in the same order.
servers = ['ce6', 'ce5']

# The account whose sbatch calls are treated as scheduler tests.
test_user = 'user 9204'
# A failed test that ran longer than this many seconds counts as a timeout.
timeout_seconds = 15

frames = []
for path, server in zip(files, servers):
    df = pd.read_csv(
        path,
        sep=' - ',
        names=['DATETIME', 'USER', 'RETRY', 'TIMELAPS', 'RETURNCODE', 'COMMAND'],
        engine='python',  # multi-character separators need the python engine
    )

    # TIMELAPS carries a literal 'time ' prefix; strip it and keep the number.
    df['TIMELAPS'] = (
        df['TIMELAPS'].str.replace('time ', '', regex=False).astype('float64')
    )
    df['DATETIME'] = pd.to_datetime(df['DATETIME'], format='%Y-%m-%d %H:%M:%S.%f')

    df['DATE'] = df['DATETIME'].dt.date
    df['TIME'] = df['DATETIME'].dt.time

    is_test_user = df['USER'] == test_user
    is_sbatch = df['COMMAND'].str.contains('sbatch', na=False)

    # A test is any sbatch from the test user; it is "unresponsive" when it
    # also failed (return code 1) and ran longer than the timeout threshold.
    df['TESTING'] = is_test_user & is_sbatch
    df['UNRESPONSIVE'] = (
        df['TESTING']
        & (df['RETURNCODE'] == 'returncode 1')
        & (df['TIMELAPS'] > timeout_seconds)
    )

    df['SERVER'] = server

    # First quoted run of two or more digits in the command text.
    df['JOBID'] = df['COMMAND'].str.extract(r"'(\d{2,})'", expand=False)

    # Exact comparison, consistent with the flags above; a substring match
    # would also exclude any other user whose id merely starts with 9204.
    df['USERCOUNT'] = ~is_test_user

    frames.append(df)

# Stable sort so records sharing a timestamp keep their file order.
all_dfs = (
    pd.concat(frames, axis=0)
    .set_index('DATETIME')
    .sort_index(axis=0, kind='mergesort')
)

In [None]:
# Share of all scheduler tests that were flagged as unresponsive.
is_unresponsive = all_dfs['UNRESPONSIVE'].eq(True)
is_test = all_dfs['TESTING'].eq(True)

unresponsive_series = all_dfs['UNRESPONSIVE'][is_unresponsive]
testing_series = all_dfs['TESTING'][is_test]

unresponsive_count = len(unresponsive_series)
testing_count = len(testing_series)
overall_unresponsive_percent = round(unresponsive_count / testing_count, 9)

summary_template = "Scheduler was unresponsive for {:.3%} of all tests."
print(summary_template.format(overall_unresponsive_percent))

In [None]:
# Hourly totals of tests, unresponsive tests, and non-test user commands.
relevant_columns = ['TESTING', 'UNRESPONSIVE', 'USERCOUNT']

# One pass over the hourly bins for all three flags; summing a boolean
# column counts its True values. DATETIME becomes an ordinary column again.
hour_dfs = (
    all_dfs.groupby(pd.Grouper(freq='H'))[relevant_columns]
    .sum()
    .reset_index()
)

# Fraction of each hour's tests that were unresponsive.
hour_dfs['UNRESPONSIVEPROPORTION'] = (
    hour_dfs['UNRESPONSIVE'] / hour_dfs['TESTING']
)

In [None]:
# Resample the hourly counts and proportion by day, week and month, and add
# a min-max scaled copy of each column so they can share one plot axis.

# Resampling rules for day, week and month, in that order.
# NOTE(review): recent pandas releases deprecate 'M' in favour of 'ME';
# confirm against the installed version before changing it.
list_of_samples = ['D', 'W', 'M']

# Column order matters: later cells read these columns by position.
summed_columns = ['UNRESPONSIVE', 'UNRESPONSIVEPROPORTION', 'TESTING', 'USERCOUNT']

new_list_of_dfs = []
for rule in list_of_samples:
    # Select columns with a list; tuple-style selection on a resampler is
    # rejected by current pandas.
    # NOTE(review): UNRESPONSIVEPROPORTION here is the SUM of the hourly
    # proportions, not UNRESPONSIVE / TESTING for the period - confirm that
    # this is the intended quantity.
    resampled = hour_dfs.resample(rule, on='DATETIME')[summed_columns].sum()

    for column in summed_columns:
        lowest = resampled[column].min()
        highest = resampled[column].max()
        # Scale to [0, 1]; a constant column yields NaN (0 / 0).
        resampled[column + 'MINMAX'] = (resampled[column] - lowest) / (highest - lowest)

    new_list_of_dfs.append(resampled)

responsiveness_by_day, responsiveness_by_week, responsiveness_by_month = new_list_of_dfs

list_of_dfs = [responsiveness_by_day,
               responsiveness_by_week,
               responsiveness_by_month]

In [None]:
# Line graphs of the scaled unresponsive counts, proportion and test counts
# per day, week and month (one subplot each).
# Possibly reduce xtick number.
facecolor = 'white'
fig = plt.figure(figsize=(15, 12), dpi=400)
axes = fig.subplots(nrows=3)
plt.subplots_adjust(hspace=0.5)

x_labels = ['Day', 'Week', 'Month']
y_labels = ['Relative Values Per Day',
            'Relative Values Per Week',
            'Relative Values Per Month']

# Column to draw and its line styling, in drawing (and legend) order.
line_specs = [
    ('UNRESPONSIVEMINMAX', {'color': 'red'}),
    ('UNRESPONSIVEPROPORTIONMINMAX', {'color': 'dodgerblue', 'alpha': 0.5}),
    ('TESTINGMINMAX', {'color': 'green', 'alpha': 0.5}),
]
legend_entries = ['Counts', 'Proportion of Testing', 'Testing']

for axis, frame, x_label, y_label in zip(axes, list_of_dfs, x_labels, y_labels):
    for column, line_style in line_specs:
        axis.plot(frame.index, frame[column], linewidth=2, **line_style)
    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%D'))
    axis.legend(legend_entries, loc='upper right', bbox_to_anchor=(1, 1))
    plt.setp(axis.get_xticklabels(), rotation=90, fontsize=8)
    # Small margin around the [0, 1] range of the scaled values.
    axis.set_ylim(-0.05, 1.05)

daily_axis, weekly_axis, monthly_axis = axes[0], axes[1], axes[2]

daily_axis.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
daily_axis.xaxis.set_minor_locator(mdates.WeekdayLocator(interval=1))
weekly_axis.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
# One tick per monthly data point.
monthly_axis.set_xticks(responsiveness_by_month.index)

daily_axis.set_title('Unresponsiveness and Testing Counts and Proportion Over Time');

In [None]:
# Plot ECDFs of the daily unresponsive counts (UC) and testing counts (TC)
# for November through December 2020.
facecolor = 'white'
fig = plt.figure(figsize=(15, 5),
                 dpi=400)

# Days from 2020-11-01 up to, but not including, 2021-01-01, so the data
# matches the period named in the title.
criteria = (
    (responsiveness_by_day.index >= '2020-11-01')
    & (responsiveness_by_day.index < '2021-01-01')
)

colors = ['orange',
          'blue',
          'red']

for color, column in zip(colors, ['UNRESPONSIVE', 'TESTING']):
    sns.ecdfplot(responsiveness_by_day[column][criteria], color=color, linewidth=2)

plt.legend(['UC', 'TC'])
plt.xlabel('UC/TC')
# An ECDF's vertical axis is the cumulative proportion of days, not a count.
plt.ylabel('Proportion')
plt.title('Unresponsiveness and Testing Counts per Day during November and December 2020');

In [None]:
# Bin the flags into three-minute increments, then find every run of two or
# more consecutive bins that each contain at least one unresponsive test.
# For each run record when it began, when it ended, and its total count.
bin_width = '3min'

concat_dfs_2 = (
    all_dfs.groupby(pd.Grouper(freq=bin_width))[relevant_columns]
    .sum()
    .reset_index()
)

begin = []      # timestamp of the first bin of each run
end = []        # timestamp of the first bin AFTER each run
sum_runs = []   # total unresponsive count over each run

run_start = None
run_bins = 0
run_total = 0
last_stamp = None

for stamp, count in zip(concat_dfs_2['DATETIME'], concat_dfs_2['UNRESPONSIVE']):
    if count != 0:
        if run_start is None:
            run_start = stamp
        run_bins += 1
        run_total += count
    else:
        # A zero bin closes the current run; isolated single bins are not
        # "consecutive" and are dropped.
        if run_bins >= 2:
            begin.append(run_start)
            end.append(stamp)
            sum_runs.append(run_total)
        run_start, run_bins, run_total = None, 0, 0
    last_stamp = stamp

# A run still open at the last bin ends one bin width after it, which keeps
# begin/end/sum_runs the same length.
if run_bins >= 2:
    begin.append(run_start)
    end.append(last_stamp + pd.Timedelta(bin_width))
    sum_runs.append(run_total)

In [None]:
# Build and display the table of consecutive unresponsive runs.
consecutive = pd.DataFrame({'BEGIN': begin,
                            'END': end,
                            'UNRESPONSIVE': sum_runs})

# Run length in whole minutes. total_seconds() keeps the days component,
# which Timedelta.seconds would silently drop for runs longer than a day.
run_length = pd.to_timedelta(consecutive['END'] - consecutive['BEGIN'])
consecutive['INTERVAL'] = (run_length.dt.total_seconds() // 60).astype('int64')

# Possibly extend initial background color to begin and end columns.
cm = sns.color_palette('Blues', as_cmap=True)


def hightlight_timestamp(row):
    """Return per-cell CSS for one table row.

    The BEGIN and END cells are highlighted when the row's INTERVAL exceeds
    6 or its UNRESPONSIVE count is at least 6; every other cell gets an
    empty style string.
    """
    flagged = (row['INTERVAL'] > 6) or (row['UNRESPONSIVE'] >= 6)
    highlight = "background-color: orange; font-weight: bold;" if flagged else ""
    return [highlight if column in ('BEGIN', 'END') else "" for column in row.index]


# Present the styled table, restricted to runs with more than 4 unresponsive
# tests. This stays the last expression so the notebook renders it.
(
    consecutive.loc[consecutive['UNRESPONSIVE'] > 4]
    .style
    .apply(hightlight_timestamp, axis=1)
    .background_gradient(cmap=cm)
    .set_table_styles(
        [{'selector': 'th.col_heading',
          'props': 'text-align: center; background-color: #00008B; color: white;'}],
        overwrite=False)
)

In [None]:
# Calendar heatmap based on modified version of https://stackoverflow.com/a/61277350.

# Row slots of one calendar month (at most six partial weeks).
weeks = [
    1, 2, 3,
    4, 5, 6,
]
# Column headers drawn along the top of each calendar.
days = [
    'Su',
    'Mo',
    'Tu',
    'We',
    'Th',
    'Fr',
    'Sa',
]
# Panel titles for the calendars.
month_names = [
    'November',
    'December',
]

def split_months(df, indices_of_desired_values, months=(11, 12)):
    """Lay out daily rows of *df* on one 6 x 7 calendar grid per month.

    Columns of each grid run Sunday (0) through Saturday (6); rows are the
    weeks of the month. Each cell position is computed from the date itself,
    so *df* may start on any day and may have gaps.

    Args:
        df: DataFrame whose index holds one timestamp per day.
        indices_of_desired_values: positions, as seen by ``df.itertuples()``
            (0 is the index, 1 the first column, ...), of the values to place
            on the calendar.
        months: month numbers to build grids for. Rows belonging to any
            other month are ignored.

    Returns:
        ``(day_nums, list_of_day_vals)``. ``day_nums`` maps month number to
        a grid holding the day of the month. ``list_of_day_vals`` has one
        such dict per entry of *indices_of_desired_values*, holding that
        value for each day. Cells with no day are NaN.
    """
    blank = np.full((6, 7), np.nan)

    day_nums = {month: blank.copy() for month in months}
    list_of_day_vals = [
        {month: blank.copy() for month in months}
        for _ in indices_of_desired_values
    ]

    for row in df.itertuples():
        stamp = row[0]
        month = stamp.month
        if month not in day_nums:
            continue

        # pandas counts Monday as 0; shift so that Sunday is column 0.
        calendar_column = (stamp.dayofweek + 1) % 7
        # Column occupied by the 1st of this month, then the week row.
        first_column = (calendar_column - (stamp.day - 1)) % 7
        calendar_row = (stamp.day - 1 + first_column) // 7

        day_nums[month][calendar_row, calendar_column] = stamp.day
        for day_vals, position in zip(list_of_day_vals, indices_of_desired_values):
            day_vals[month][calendar_row, calendar_column] = row[position]

    return day_nums, list_of_day_vals


def create_calendar_heatmap(day_nums, day_vals):
    """Draw a 2 x 2 grid of calendar heatmaps.

    Each row of panels uses one entry of *day_vals*; within a row the left
    panel shows month 11 and the right panel month 12. Every populated cell
    is labelled with its value and, in a small corner box, its day number.

    Args:
        day_nums: dict mapping month number to a grid of day numbers, NaN
            where the grid has no day.
        day_vals: sequence of dicts, each mapping month number to a grid of
            the values to colour and print.
    """
    figure, panels = plt.subplots(2, 2, figsize=(10, 8), dpi=400)
    plt.subplots_adjust(wspace=0.1, hspace=0.1)

    # (position in day_vals, offset from month 11) for each panel, in the
    # row-major order in which the panels are visited.
    panel_layout = [(0, 0), (0, 1), (1, 0), (1, 1)]

    for axs, (value_set, month_offset) in zip(panels.flat, panel_layout):
        month_key = month_offset + 11
        value_grid = day_vals[value_set][month_key]

        axs.imshow(value_grid, cmap='coolwarm')
        axs.set_title(month_names[month_offset], fontsize=10)

        axs.set_xticks(np.arange(len(days)))
        axs.set_xticklabels(days, fontsize=10, color='black')

        axs.set_yticklabels([])

        axs.tick_params(axis='both', which='both', length=0)
        axs.xaxis.tick_top()

        # Hide the frame by painting every spine white.
        for side in ('left', 'right', 'bottom', 'top'):
            axs.spines[side].set_color('white')

        for w, d in np.ndindex(len(weeks), len(days)):
            day_val = value_grid[w, d]
            day_num = day_nums[month_key][w, d]

            # Cells without a day number stay empty.
            if np.isnan(day_num):
                continue

            # Value, slightly below the centre of the cell.
            axs.text(d, w + 0.3,
                     f"{day_val:,.0f}",
                     ha="center",
                     va="center",
                     fontsize=6,
                     color="w",
                     fontweight='bold',
                     alpha=1)

            # Day number, tucked into the top-right corner.
            axs.text(d + 0.475,
                     w - 0.35,
                     f"{day_num:0.0f}",
                     ha="right",
                     va="center",
                     fontsize=5,
                     color="#003333",
                     fontweight='bold',
                     alpha=1)

            # White outline around the whole cell.
            cell_outline = ((d - 0.5, w + 0.5),
                            (d + 0.5, w + 0.5),
                            (d + 0.5, w - 0.5),
                            (d - 0.5, w - 0.5))
            axs.add_artist(Polygon(cell_outline, fc='none', edgecolor='w'))

            # Small filled white box behind the day number.
            corner_box = ((d + 0.25, w - 0.5),
                          (d + 0.5, w - 0.5),
                          (d + 0.5, w - 0.25),
                          (d + 0.25, w - 0.25))
            axs.add_artist(Polygon(corner_box, fc='w'))

    plt.suptitle('Calendar of Unresponsive Counts for November and December, then User Counts',
                 fontsize=10)

# Call functions and plot.
# Select November and December 2020 by date rather than by row position, so
# the selection no longer depends on which day the log data happens to start.
# Positions 1 and 4 (as seen by itertuples) are the UNRESPONSIVE and
# USERCOUNT columns.
calendar_days = responsiveness_by_day.loc['2020-11-01':'2020-12-31']
day_nums, day_vals = split_months(calendar_days, [1, 4])
create_calendar_heatmap(day_nums, day_vals)

In [None]:
# Scatter plot of daily user command submissions against unresponsive
# counts, coloured by calendar month. Days with 10 or fewer user commands
# are left out.
scatter_responsiveness_by_day = responsiveness_by_day.loc[responsiveness_by_day['USERCOUNT'] > 10]

# TODO: replace the numeric month legend labels with month names.

fig, ax = plt.subplots(figsize=(8, 4), dpi=500)
pl = ax.scatter(scatter_responsiveness_by_day['USERCOUNT'],
                scatter_responsiveness_by_day['UNRESPONSIVE'],
                c=scatter_responsiveness_by_day.index.month,
                cmap='tab20',
                alpha=0.8)

# Build the legend from the collection returned by ax.scatter (`pl`); the
# previous code referenced an undefined name here.
first_legend = ax.legend(*pl.legend_elements(), loc='upper right')
ax.add_artist(first_legend)

plt.xlabel('User Command Submissions')
plt.ylabel('Unresponsive Counts')
plt.title('Scatter Plot Comparing User Command Submissions and Unresponsive Counts', fontsize=10)
plt.show()