Next, use the two log files to determine the time intervals (hours) when the scheduler was unresponsive. Do this by looking for records of the "sbatch" command from user 9204 that have a return code of 1 and an execution time of roughly 20 seconds (anything over 15 seconds). These are commands for which the scheduler timed out before responding.

3. Calculate some descriptive statistics about how often the scheduler was unresponsive, how long these periods of time were, and create a time series plot of when the scheduler was having difficulties.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mtick
import seaborn as sns
import datetime as dt
from scipy.stats import poisson, nbinom, norm
import statsmodels.api as sm
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP, Logit
import statsmodels.discrete.count_model as sdcm
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 101)

In [None]:
# Read both slurm wrapper logs, derive per-record features, and combine them
# into a single frame (all_dfs) indexed and sorted by timestamp.
files = ['data/slurm_wrapper_ce6.log', 'data/slurm_wrapper_ce5.log']
# Server label for each entry of `files`, in the same order.
servers = ['ce6', 'ce5']

frames = []
for value, server in zip(files, servers):
    df = pd.read_csv(
        value,
        sep=' - ',  # multi-character separator, hence the python engine
        names=['DATETIME',
               'USER',
               'RETRY',
               'TIMELAPS',
               'RETURNCODE',
               'COMMAND'],
        engine='python',
    )

    # Strip the 'time ' label so the elapsed time can be parsed as a float.
    df['TIMELAPS'] = (
        df['TIMELAPS'].replace('time ', '', regex=True).astype('float64')
    )
    df['DATETIME'] = pd.to_datetime(df['DATETIME'],
                                    format='%Y-%m-%d %H:%M:%S.%f')

    df['DATE'] = df['DATETIME'].dt.date
    df['TIME'] = df['DATETIME'].dt.time

    # A record counts as unresponsive when it returned code 1 after more
    # than 15 seconds.
    # NOTE(review): the task description also restricts this to "sbatch"
    # commands from user 9204; no such filter is applied here -- confirm
    # whether USER / COMMAND should be part of this condition.
    df['UNRESPONSIVE'] = (
        (df['RETURNCODE'] == 'returncode 1') & (df['TIMELAPS'] > 15)
    )

    df['SERVER'] = server

    # First single-quoted run of two or more digits in the command text.
    # Raw string: '\d' is not a valid escape in a normal string literal.
    df['JOBID'] = df['COMMAND'].str.extract(r"'(\d{2,})'", expand=False)

    # Constant flag so that summing it later yields a command count.
    df['COMMANDCOUNT'] = True

    frames.append(df)

# Concatenate once instead of re-copying the accumulated frame per file.
all_dfs = pd.concat(objs=frames, axis=0)
# mergesort is stable, so records with equal timestamps keep file order.
all_dfs = all_dfs.set_index('DATETIME').sort_index(axis=0, kind='mergesort')

In [None]:
# Hourly totals: number of commands and number of unresponsive commands.
# Column selection uses a list; tuple selection on a groupby is rejected by
# current pandas.
proportion_df = (
    all_dfs
    .groupby(pd.Grouper(freq='1H'))[['COMMANDCOUNT', 'UNRESPONSIVE']]
    .sum()
    .rename(columns={'COMMANDCOUNT': 'TOTALCOMMANDS',
                     'UNRESPONSIVE': 'TOTALUNRESPONSIVE'})
)

proportion_df['PROPORTION'] = (
    proportion_df['TOTALUNRESPONSIVE'] / proportion_df['TOTALCOMMANDS']
).round(2)

# Hours with more than 15,000 commands of which under 40% were unresponsive
# are excluded from the regression data below.
removable_dates = (
    proportion_df.loc[(proportion_df['TOTALCOMMANDS'] > 15000)
                      & (proportion_df['PROPORTION'] < 0.4)].index
)

removable_date_hour = [
    dt.datetime(year=i.year, month=i.month, day=i.day, hour=i.hour)
    for i in removable_dates
]

# Work on a copy so the assignments below never write into all_dfs.
logit_df = all_dfs[['UNRESPONSIVE', 'COMMANDCOUNT']].copy()
# Number of commands logged in the trailing one-hour window of each record.
logit_df['COMMANDTOTAL'] = logit_df['COMMANDCOUNT'].rolling('1H').sum()
# 0/1 response; a plain cast also works when the flag never varies.
logit_df['UNRESPONSIVE'] = logit_df['UNRESPONSIVE'].astype(int)
logit_df = logit_df.loc[logit_df['COMMANDTOTAL'] > 0]
logit_df = logit_df.replace([np.inf, -np.inf], np.nan)
logit_df = logit_df.dropna()

logit_df = logit_df.reset_index()

# Flag every record whose hour bin is one of the removable hours. Comparing
# the floored timestamp handles all removable hours at once (the previous
# per-hour loop overwrote the flag on each pass, so only the last hour was
# dropped) and also leaves the data intact when no hour is removable.
logit_df['REMOVE'] = logit_df['DATETIME'].dt.floor('H').isin(removable_dates)

logit_df = logit_df.loc[~logit_df['REMOVE']]

# One uniform draw per row, split into disjoint bands so that no record is
# in both the training set (~30%) and the test set (~30%).
split_draw = np.random.rand(len(logit_df))
mask = split_draw < 0.3
logit_train_df = logit_df[mask]

mask = (split_draw >= 0.3) & (split_draw < 0.6)
logit_test_df = logit_df[mask]

logit_model_exog = sm.add_constant(np.log(logit_train_df['COMMANDTOTAL']))

# NOTE(review): the offset is the regressor itself scaled by 1/1000, and the
# later predict() calls do not pass an offset -- confirm this is intended.
logit_model = Logit(endog=logit_train_df['UNRESPONSIVE'],
                    exog=logit_model_exog,
                    offset=np.log(logit_train_df['COMMANDTOTAL'])/1000).fit()

print('\n\n' + 'Logistic Regression Model' + '\n')
print(logit_model.summary())

In [None]:
# Scatter the test-set observations (hourly command total vs. unresponsive
# flag) and overlay the probability curve predicted by the fitted model.
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

x = logit_test_df['COMMANDTOTAL']
y = logit_test_df['UNRESPONSIVE']
ax.scatter(x, y, color='k', alpha=0.05)

# The model was fitted on log(COMMANDTOTAL), so the prediction grid is built
# on the log scale and mapped back with exp() for plotting.
log_totals = np.log(x)
x_predict_range = np.linspace(start=log_totals.min(),
                              stop=log_totals.max(),
                              num=len(x))

y = logit_model.predict(sm.add_constant(x_predict_range))
ax.plot(np.exp(x_predict_range), y, color='red')


def _with_thousands_separator(tick_value, _tick_position):
    """Format an x tick as an integer with comma thousands separators."""
    return format(int(tick_value), ',')


ax.xaxis.set_major_formatter(mtick.FuncFormatter(_with_thousands_separator))
ax.set_xlabel('User Command Submissions Per Hour', fontsize=10)
ax.set_ylabel('Probability of User Command Unresponsiveness During Such Hour',
              fontsize=10);
#plt.savefig('images/LogitScatterRegression.png', dpi = 300, transparent = True);

In [None]:
from sklearn.calibration import calibration_curve

# Calibration plot for the test set: binned predicted probabilities on the
# x axis against the observed values on the y axis, with a dashed diagonal
# for reference.
test_exog = sm.add_constant(np.log(logit_test_df['COMMANDTOTAL']))
pred = logit_model.predict(test_exog)
logreg_y, logreg_x = calibration_curve(logit_test_df['UNRESPONSIVE'],
                                       pred,
                                       n_bins=30)

plt.figure(figsize=(10, 6), dpi=100)
plt.scatter(logreg_x, logreg_y)
plt.plot(logreg_x, logreg_y)
plt.plot([0, 1], [0, 1], linestyle='--')

# Same tick positions (steps of 0.05) and the same [0, 1] range on both axes.
tick_positions = np.arange(0, 1.05, 0.05)
plt.xticks(tick_positions)
plt.yticks(tick_positions)
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('Predicted')
plt.ylabel('Observed');