In [None]:
import os
# os.chdir('..')
print(f'Current working directory is {os.getcwd()}')

Current working directory is c:\Users\blond\Documents\NssProjects\accre-green-bean-casserole


In [11]:
import pandas as pd
from scripts.get import log_to_df, df_to_datelist
from collections import Counter
import numpy as np
from tqdm import tqdm

In [5]:
jobs = pd.read_csv("data/fullsample.csv")

In [6]:
jobs = jobs[jobs['END'] != 'Unknown']
jobs = jobs[jobs['STATE'] == 'COMPLETED']
jobs['BEGIN'] = pd.to_datetime(jobs['BEGIN'])
jobs['END'] = pd.to_datetime(jobs['END'])
jobs['REQTIME'] = pd.to_timedelta(jobs['REQTIME'])
jobs['USEDTIME'] = pd.to_timedelta(jobs['USEDTIME'])

In [7]:
ce5 = log_to_df('data/slurm_wrapper_ce5.log')
ce6 = log_to_df('data/slurm_wrapper_ce6.log')

errors_ce6 = df_to_datelist(ce6)
errors_ce5 = df_to_datelist(ce5)
all_errors = errors_ce5 + errors_ce6

In [8]:
def count_jobs_before_interr(all_errors_func = all_errors, jobs_func = jobs, typeTime = 'h', countTime = 1, on = 'END'):
    """
    Calculates the number of jobs occurring within a specified time window 
    relative to each error timestamp, based on the relationship specified 
    (BEGIN, DURING, END, or ALL). Returns a DataFrame where each row corresponds 
    to an error and the number of jobs meeting the specified criteria.

    Parameters:
    ----------
    all_errors : pd.Series or iterable
        A list or Series of error timestamps. Each timestamp is used as 
        a reference point to count the jobs within the specified time window.

    jobs : pd.DataFrame
        A DataFrame containing job details with at least the following columns:
        - 'BEGIN': The start times of jobs.
        - 'END': The end times of jobs.

    typeTime : str, optional
        The unit of time for the countTime parameter. Accepted values are:
        - 'm': Minutes
        - 'h': Hours (default)
        - 'd': Days

    countTime : float, optional
        The size of the time window in the units specified by typeTime. 
        For example:
        - countTime=1 with typeTime='h' means a 1-hour window.
        - countTime=30 with typeTime='m' means a 30-minute window.

    on : str, optional
        Defines the relationship between the jobs and the error timestamp. 
        Accepted values are:
        - 'BEGIN': Count jobs whose start times fall within the time window 
                   before the error.
        - 'DURING': Count jobs that were active (spanning) during the error.
        - 'END': Count jobs whose end times fall within the time window 
                 before the error. (Default)
        - 'ALL': Generates a DataFrame with counts for all relationships:
            - 'Start Count': Number of jobs starting within the time window.
            - 'During Count': Number of jobs spanning the error timestamp.
            - 'End Count': Number of jobs ending within the time window.

    Returns:
    -------
    pd.DataFrame
        - For 'BEGIN', 'DURING', or 'END': A DataFrame where each row corresponds 
          to an error and its associated count of jobs based on the specified criteria.
        - For 'ALL': A DataFrame with columns 'Interruption Time', 'Start Count', 
          'During Count', and 'End Count'.

    Notes:
    -----
    - If an invalid value for `on` is provided, the function defaults to 'END' 
      and prints a warning message.
    - The 'ALL' option adds comprehensive job counts across all specified 
      relationships to the error timestamps.
    """
    
    
    
#     error_min_time = all_errors.min() - pd.Timedelta(hours=time_hours)
#         error_max_time = all_errors.max()

#          Filter jobs within the global range
#         jobs_filtered = jobs[(jobs['BEGIN'] <= error_max_time) & (jobs['END'] >= error_min_time)]
    
    time_dict = {
        'm': 60,
        'h': 1,
        'd': 1/24    
    }
    time_hours = countTime / time_dict[typeTime]
    error_min_time = min(all_errors) - pd.Timedelta(hours=time_hours)
    error_max_time = max(all_errors)
    on = on.strip().upper()
    errors_array = np.array(all_errors_func)
    all_errors_func = sorted(all_errors_func)
    
    if on == 'BEGIN':
        
        jobs_copy = jobs_func.copy(deep = True)
        jobs_copy = jobs_copy[(jobs_copy['BEGIN'] <= error_max_time) & (jobs_copy['BEGIN'] >= error_min_time)]
        jobs_copy = jobs_copy.sort_values('BEGIN')
        job_counts_for_interrupt = {}
        last_error_date = all_errors_func[0]
        for i, error in enumerate(tqdm(all_errors_func, desc="Processing Errors")):
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            if (error - last_error_date).days >= 30:
                jobs_copy = jobs_copy[jobs_copy['BEGIN'] >= hour_less_than_given]
                print(last_error_date)
                last_error_date = error
                
            
            count = ((jobs_copy['BEGIN'] > hour_less_than_given) & (jobs_copy['BEGIN'] <= error)).sum()
            job_counts_for_interrupt[error] = count
    
    elif on == 'DURING':
        
        jobs_copy = jobs_func.copy(deep = True)
        jobs_copy = jobs_copy[(jobs_copy['BEGIN'] <= error_max_time) & (jobs_copy['END'] >= error_min_time)]
        jobs_copy = jobs_copy.sort_values('END')
        job_counts_for_interrupt = {}
        last_error_date = all_errors_func[0]
        for i, error in enumerate(tqdm(all_errors_func, desc="Processing Errors")):
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            if (error - last_error_date).days >= 30:
                jobs_copy = jobs_copy[jobs_copy['END'] >= error]
                print(last_error_date)
                last_error_date = error
            #hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            count = ((jobs_copy['END'] > error) & (jobs_copy['BEGIN'] < error)).sum()
            job_counts_for_interrupt[error] = count
            
    elif on == 'END':
        
        jobs_copy = jobs_func.copy(deep = True)
        jobs_copy = jobs_copy[(jobs_copy['END'] <= error_max_time) & (jobs_copy['END'] >= error_min_time)]
        jobs_copy = jobs_copy.sort_values('END')
        job_counts_for_interrupt = {}
        last_error_date = all_errors_func[0]
        for i, error in enumerate(tqdm(all_errors_func, desc="Processing Errors")):
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            if (error - last_error_date).days >= 30:
                jobs_copy = jobs_copy[jobs_copy['END'] >= hour_less_than_given]
                print(last_error_date)
                last_error_date = error
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            count = ((jobs_copy['END'] > hour_less_than_given) & (jobs_copy['END'] <= error)).sum()
            job_counts_for_interrupt[error] = count
        
    elif on == 'ALL':
        
        jobs_copy = jobs_func.copy(deep = True)
        jobs_copy = jobs_copy[((jobs_copy['END'] <= error_max_time) & (jobs_copy['END'] >= error_min_time)) | ((jobs_copy['BEGIN'] <= error_max_time) & (jobs_copy['BEGIN'] >= error_min_time)) | ((jobs_copy['BEGIN'] <= error_max_time) & (jobs_copy['END'] >= error_min_time))]
        jobs_copy = jobs_copy.sort_values('END')
        
        job_counts_for_interrupt_begin = {}
        job_counts_for_interrupt_during = {}
        job_counts_for_interrupt_end = {}
        last_error_date = all_errors_func[0]

        for i, error in enumerate(tqdm(all_errors_func, desc="Processing Errors")):
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            if (error - last_error_date).days >= 30:
                jobs_copy = jobs_copy[jobs_copy['END'] >= hour_less_than_given]
                print(last_error_date)
                last_error_date = error

            countbegin = ((jobs_copy['BEGIN'] > hour_less_than_given) & (jobs_copy['BEGIN'] <= error)).sum()
            countduring = ((jobs_copy['END'] > error) & (jobs_copy['BEGIN'] < error)).sum()
            countend = ((jobs_copy['END'] > hour_less_than_given) & (jobs_copy['END'] <= error)).sum()
            
            job_counts_for_interrupt_begin[error] = countbegin
            job_counts_for_interrupt_during[error] = countduring
            job_counts_for_interrupt_end[error] = countend
            
        df1 =  pd.DataFrame(job_counts_for_interrupt_begin.items())
        #df.rename(columns={'A': 'a', 'B': 'c'}, inplace=True)
        df1.rename(columns = {1:'Start Count', 0:'Interruption Time'}, inplace = True)
        df1['During Count'] = job_counts_for_interrupt_during.values()
        df1['End Count'] = job_counts_for_interrupt_end.values()
        
        return df1
        
            
    else:
        
        
        
        print(f'Your "ON" variable of "{on}" was not found to be (BEGIN, END, DURING, or ALL), so defaulted to END.')
        
        jobs_copy = jobs_func.copy(deep = True)
        jobs_copy = jobs_copy[(jobs_copy['END'] <= error_max_time) & (jobs_copy['END'] >= error_min_time)]
        jobs_copy = jobs_copy.sort_values('END')
        job_counts_for_interrupt = {}
        last_error_date = all_errors_func[0]
        for i, error in enumerate(tqdm(all_errors_func, desc="Processing Errors")):
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            if (error - last_error_date).days >= 30:
                jobs_copy = jobs_copy[jobs_copy['END'] >= hour_less_than_given]
                print(last_error_date)
                last_error_date = error
            hour_less_than_given = error - pd.Timedelta(hours=time_hours)
            count = ((jobs_copy['END'] > hour_less_than_given) & (jobs_copy['END'] <= error)).sum()
            job_counts_for_interrupt[error] = count

        
    
    
    return pd.DataFrame(job_counts_for_interrupt.items())

In [9]:
jobs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7375084 entries, 1 to 7395884
Data columns (total 12 columns):
 #   Column     Dtype          
---  ------     -----          
 0   JOBID      object         
 1   STATE      object         
 2   BEGIN      datetime64[ns] 
 3   END        datetime64[ns] 
 4   REQMEM     object         
 5   USEDMEM    object         
 6   REQTIME    timedelta64[ns]
 7   USEDTIME   timedelta64[ns]
 8   NODES      int64          
 9   CPUS       int64          
 10  PARTITION  object         
 11  EXITCODE   object         
dtypes: datetime64[ns](2), int64(2), object(6), timedelta64[ns](2)
memory usage: 731.5+ MB


In [None]:
Every_Fifteen_Minutes_Errors_df = count_jobs_before_interr(typeTime = 'm', countTime = 15)

Processing Errors:   3%|▎         | 93/3296 [00:07<09:03,  5.89it/s]

2020-10-18 06:16:25.392946


Processing Errors:  35%|███▍      | 1146/3296 [01:07<05:50,  6.13it/s]

2020-11-18 15:03:14.439449


Processing Errors:  49%|████▉     | 1616/3296 [01:32<03:13,  8.66it/s]

2020-12-18 15:18:59.450549


Processing Errors:  53%|█████▎    | 1735/3296 [01:38<02:59,  8.71it/s]

2021-01-17 16:28:49.469932


Processing Errors:  63%|██████▎   | 2062/3296 [01:53<02:10,  9.42it/s]

2021-02-16 16:58:11.049951


Processing Errors:  67%|██████▋   | 2193/3296 [01:58<01:21, 13.60it/s]

2021-03-19 08:29:39.070946


Processing Errors:  79%|███████▉  | 2613/3296 [02:14<00:35, 19.45it/s]

2021-04-18 14:59:17.312041


Processing Errors:  85%|████████▍ | 2792/3296 [02:20<00:24, 20.59it/s]

2021-05-20 08:10:47.902061


Processing Errors:  92%|█████████▏| 3047/3296 [02:28<00:13, 18.47it/s]

2021-06-19 11:57:52.170544


Processing Errors:  95%|█████████▌| 3139/3296 [02:30<00:05, 28.13it/s]

2021-07-19 12:36:10.601915


Processing Errors:  99%|█████████▉| 3267/3296 [02:32<00:00, 85.94it/s]

2021-08-20 12:03:28.102533


Processing Errors: 100%|██████████| 3296/3296 [02:33<00:00, 21.54it/s] 


In [13]:
Every_Fifteen_Minutes_Errors_df[0] = pd.to_datetime(Every_Fifteen_Minutes_Errors_df[0])
Every_Fifteen_Minutes_Errors_df.set_index(0, inplace=True)
Every_Fifteen_Minutes_Errors_df.reset_index(inplace=True)

In [15]:
Group_By_Day_df= Every_Fifteen_Minutes_Errors_df.groupby(pd.Grouper(key=0, freq='15min')).sum()

Group_By_Day_df[Group_By_Day_df[1] > 100].sort_values(1, ascending=False).reset_index()

Unnamed: 0,0,1
0,2021-06-28 09:45:00,9890
1,2020-12-11 09:45:00,6712
2,2020-12-11 09:00:00,6696
3,2020-11-21 18:30:00,5747
4,2021-08-03 21:00:00,4587
...,...,...
1280,2020-11-23 11:30:00,101
1281,2021-02-06 08:15:00,101
1282,2021-09-09 11:00:00,101
1283,2021-03-21 16:15:00,101


In [16]:
import scipy.stats as stats

stats.describe(Group_By_Day_df)

DescribeResult(nobs=33926, minmax=(array([0], dtype=int64), array([9890], dtype=int64)), mean=array([16.67956729]), variance=array([19293.52387844]), skewness=array([27.57957353]), kurtosis=array([1306.70503834]))

In [17]:
def count_jobs_before_interr_mean(all_errors_func = all_errors, jobs_func = jobs, typeTime = 'h', countTime = 1, on = 'END'):
    
    time_dict = {
        'm': 60,
        'h': 1,
        'd': 1/24    
    }
    time_hours = countTime / time_dict[typeTime]
    error_min_time = min(all_errors_func) - pd.Timedelta(hours=time_hours)
    error_max_time = max(all_errors_func)
    
    jobs_copy = jobs_func.copy(deep = True)
    jobs_copy = jobs_copy[(jobs_copy['END'] <= error_max_time) & (jobs_copy['END'] >= error_min_time)]
    jobs_copy = jobs_copy.sort_values('END')
    job_counts_for_interrupt = {}
    last_error_date = all_errors_func[0]
    
    for error in tqdm(sorted(all_errors_func), desc="Processing Errors"):
        hour_less_than_given = error - pd.Timedelta(hours=time_hours)
        if (error - last_error_date).days >= 30:
            jobs_copy = jobs_copy[jobs_copy['END'] >= hour_less_than_given]
            last_error_date = error
            
        count = ((jobs_copy['END'] > hour_less_than_given) & (jobs_copy['END'] <= error)).sum()
        job_counts_for_interrupt[error] = count
    
    result_df = pd.DataFrame(job_counts_for_interrupt.items())
    mean_jobs = result_df[1].mean()
    
    print(f"\nAverage Number of Jobs Completed in {countTime}{typeTime} Before Error Ocurred: {mean_jobs:.2f}")
    
    return result_df

In [19]:
count_jobs_before_interr_mean(typeTime = 'm', countTime = 15)

Processing Errors: 100%|██████████| 3296/3296 [02:21<00:00, 23.28it/s] 



Average Number of Jobs Completed in 15m Before Error Ocurred: 171.68


Unnamed: 0,0,1
0,2020-10-18 06:16:25.392946,51
1,2020-10-18 06:38:44.172473,67
2,2020-10-18 06:53:44.272915,70
3,2020-10-18 06:54:04.322412,70
4,2020-10-18 07:47:25.825172,67
...,...,...
3291,2021-09-24 18:14:35.862916,150
3292,2021-09-24 19:13:14.894282,123
3293,2021-10-02 08:14:16.557499,130
3294,2021-10-02 18:29:08.267199,216
