In [571]:
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import chi2
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [111]:
pd.set_option('mode.chained_assignment', None)

In [2]:
print(f'Current working directory is {os.getcwd()}')

Current working directory is C:\Users\cavin\Documents\NSS_Projects\ACCR-project-clj\accre-carbonara\notebooks


In [3]:
# Import data
jobs = pd.read_csv("../data/fullsample.csv",
                    # nrows = 1000000
                    )
ce5 = pd.read_csv('../data/ce5_unresponsive.csv')
ce6 = pd.read_csv('../data/ce6_unresponsive.csv')
jobs = jobs[jobs['END'] != 'Unknown']

# Convert dates to datetime objects
jobs['END'] = pd.to_datetime(jobs['END'])
jobs['BEGIN'] = pd.to_datetime(jobs['BEGIN'])

# String manipulation of memory columns
jobs['USEDMEM'] = jobs['USEDMEM'].str[:-1]
jobs.insert(5, 'Mc_Mn', jobs['REQMEM'].str[-2:])
jobs['REQMEM']  = jobs['REQMEM'].str[:-2]

# Convert to numeric values for easier manipulation, and create difference col
jobs['USEDMEM'] = pd.to_numeric(jobs['USEDMEM'])
jobs['REQMEM'] = pd.to_numeric(jobs['REQMEM'])
jobs['DIFFMEM'] = jobs['REQMEM'] - jobs['USEDMEM']

# Create boolean column for status = COMPLETE
jobs.insert(2, 'COMPLETE', jobs['STATE'] == 'COMPLETED')
jobs['COMPLETE'] = jobs['COMPLETE'].astype(int)

# Create columns to assign failure, job counts, and concat into one df
ce56 = pd.concat([ce5, ce6])
jobs['FAILED'] = 0
jobs['JOBCOUNT'] = 1
ce56['JOBCOUNT'] = 0
ce56['FAILED'] = 1
jobs_ce56 = pd.concat([jobs, ce56])
ce56.to_csv('../data/ce56.csv', index=False)
# Eliminate milliseconds in desired datetime column 'END' 
jobs_ce56['END'] = pd.to_datetime(jobs_ce56['END']).dt.floor('5min')

In [460]:
df = jobs_ce56[['END', 'JOBID', 'FAILED', 'NODES', 'USEDMEM', 'CPUS', 'PARTITION']]

In [497]:
df1 = df[df['FAILED'] == 0]
df1 = (
    df1
    .groupby('END')['FAILED']
    .value_counts()
    .reset_index()
)
df2 = df[df['FAILED'] == 1]
df2 = (
    df2
    .groupby('END')['FAILED']
    .value_counts()
    .reset_index()
)
reg_df = pd.concat([df1, df2])
reg_df = (
    reg_df
    .sort_values('END')
    .set_index('END')
)
reg_df['slurm_crashes'] = reg_df[reg_df['FAILED'] == 1]['count']
reg_df['completed_jobs'] = reg_df[reg_df['FAILED'] == 0]['count']
reg_df = reg_df.drop(columns = ['count'])

In [547]:
reg_df.sample(20)

Unnamed: 0_level_0,FAILED,slurm_crashes,completed_jobs
END,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-05-17 00:50:00,0,,44.0
2021-03-20 06:00:00,0,,42.0
2020-10-07 09:40:00,0,,35.0
2021-05-17 14:55:00,0,,6.0
2021-04-18 14:45:00,0,,46.0
2021-01-31 12:05:00,0,,45.0
2021-01-26 03:05:00,0,,25.0
2021-02-22 21:25:00,0,,195.0
2021-05-13 04:10:00,0,,7.0
2021-02-04 18:45:00,0,,55.0


In [549]:
reg_df.loc['2021-05-11 19:40:00']

Unnamed: 0_level_0,FAILED,slurm_crashes,completed_jobs
END,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-05-11 19:40:00,0,1.0,27.0
2021-05-11 19:40:00,1,1.0,27.0


In [597]:
model = smf.logit('FAILED ~ slurm_crashes',
                    data = reg_df).fit()

Optimization terminated successfully.
         Current function value: 0.693119
         Iterations 3


In [599]:
model.params

Intercept       -0.007547
slurm_crashes    0.011953
dtype: float64

In [587]:
def logistic(x):
    return 1 / (1 + np.exp(-x))

In [607]:
crashes = 50

logit_p = model.params['Intercept'] + model.params['slurm_crashes']*crashes

print(f'Estimated Probability: {logistic(logit_p)}')

Estimated Probability: 0.643394328774461


In [567]:
slim_model = smf.logit('FAILED ~ 1', data = reg_df).fit()
full_model = smf.logit('FAILED ~ slurm_crashes + completed_jobs', data = reg_df).fit()

Optimization terminated successfully.
         Current function value: 0.102193
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1


In [569]:
G2 = -2 * (slim_model.llf - full_model.llf)
G2

15788.900156026033

In [577]:
df3 = full_model.df_model - slim_model.df_model
df3

2.0

In [581]:
p = chi2.sf(G2, df = df3)
p

0.0