In [220]:
import os
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

import warnings
warnings.filterwarnings("ignore")

# Set Up The Data

## Read in the Errors

In [221]:
project_name = 'allstate_log_github'
data_directory = '/mnt/data/' + project_name + '/'

errors_df = pd.read_csv(data_directory+'error_analysis_all_time.csv')

In [222]:
errors_df.columns

Index(['Error', 'Line_Number', 'Context', 'Error_Type', 'File_Path',
       'Date_Time', 'Node', 'execution_id'],
      dtype='object')

In [223]:
errors_df.Error_Type.value_counts()

cluster    35526
domino     17838
user          48
Name: Error_Type, dtype: int64

In [224]:
errors_df = errors_df[['Context', 'Error_Type']]

## Extract random lines for non-error support bundles and create a "no error" dataframe

In [225]:
support_bundles_directory = data_directory + 'support-bundles/'
support_bundles = os.listdir(support_bundles_directory)
support_bundles = [x for x in support_bundles if '.zip' not in x]

In [226]:
len(support_bundles)

5549

In [227]:
resource_usage_directory = data_directory + 'resource-usage-by-day/'
resource_usage_csv_files = os.listdir(resource_usage_directory)
resource_usage_df = pd.concat([pd.read_csv(os.path.join(resource_usage_directory, f)) for f in resource_usage_csv_files], ignore_index=True)

In [228]:
resource_usage_df.head()

Unnamed: 0,run_id,run_title,command,status,starting_user,project_owner_username,project_name,total_run_duration_s_,run_duration_within_reporting_period_s_,hardware_tier,...,completed_timestamp,compute_cluster_type,master_hardware_tier,master_hardware_tier_cost_per_minute,worker_count,worker_hardware_tier,worker_hardware_tier_cost_per_minute,date,run_type,time_to_boot_s
0,63714cc09196ec342a497c35,echo test,echo test,Succeeded,bryce_beddard,bryce_beddard,testAPI,5.0,5,small-k8s,...,1668369640346,,,,,,,2022-11-13,batch,35
1,6370c0209196ec342a496c3e,Data Warehouse Daily Run,'/repos/domino_dw/DW_ETL.py',Succeeded,griffin_young,griffin_young,DW-Prod,6680.0,6680,small-k8s,...,1668340317363,,,,,,,2022-11-13,batch,36
2,640cddc0c4253f28887e0bbf,test.sh,test.sh,Failed,blake_moore,blake_moore,Finch,2.0,2,small-k8s,...,1678564826519,,,,,,,2023-03-11,batch,24
3,627ad072ee0eb12322fc991d,alireza_mounesisohi's JupyterLab session,[JupyterLab session ],Stopped,alireza_mounesisohi,alireza_mounesisohi,IMM-regression,8324.0,8324,small-k8s,...,1652224265867,,,,,,,2022-05-10,notebook,20
4,627abbfbd78c3b4fc50c85c5,Sahil-test,[vscode session ],Stopped,sahil.rahi,sahil.rahi,Test1,1007.0,1007,gpu-V100,...,1652211711849,,,,,,,2022-05-10,notebook,21


In [229]:
no_errors = resource_usage_df[resource_usage_df['status'].isin(['Running', 'Succeeded'])]

In [230]:
support_bundles = list(set(support_bundles) & set(no_errors.run_id))

In [231]:
def find_abs_path(support_bundle_path, support_bundles_directory=support_bundles_directory):
    all_paths = os.listdir(support_bundles_directory + support_bundle_path)
    all_paths = [os.path.abspath(support_bundles_directory + support_bundle_path + '/'+f) for f in all_paths]
    return(all_paths)
 
#find_abs_path(support_bundles[0])
successful_paths = [find_abs_path(target_support_bundle) for target_support_bundle in support_bundles]
successful_paths = [path for support_bundle in successful_paths for path in support_bundle]

In [232]:
def open_all_files(paths):
    with open(paths, 'r') as f:
        raw_content = f.readlines()

    raw_content = [line.strip() for line in raw_content]
    
    return(raw_content)

all_content = [open_all_files(path) for path in successful_paths]
all_content = [line for file in all_content for line in file]

In [233]:
all_content_save = all_content
all_content = list(filter(lambda word: len(word) > 10, all_content))

In [234]:
wanted_num_non_errors = errors_df.shape[0]
indices = random.sample(range(len(all_content)), wanted_num_non_errors)
target_content = [all_content[idx] for idx in indices]

In [235]:
no_errors_df = pd.DataFrame({"Context" : target_content, "Error_Type" : ["none" for i in target_content]})

## Combine the errors with the non-errors and create training/testing data

In [236]:
all_data = pd.concat([errors_df, no_errors_df])
all_data.columns = ['text', 'label']
all_data.shape

(106824, 2)

In [237]:
df_train, df_test = train_test_split(all_data, test_size=0.1, stratify=all_data["label"])

In [238]:
all_data.shape

(106824, 2)

In [239]:
def stratified_sample(df, stratifying_col, n):
    def sample_func(group):
        if len(group) < int(n/df[stratifying_col].nunique()):
            return group.sample(int(n/df[stratifying_col].nunique()), replace=True)
        else:
            return group.sample(int(n/df[stratifying_col].nunique()))
    return df.groupby(stratifying_col, group_keys=False).apply(sample_func)

# Get stratified sample with target number of rows
n = 10000
all_data_small = stratified_sample(all_data, 'label', n)

df_train_small, df_test_small = train_test_split(all_data_small, test_size=0.1, stratify=all_data_small["label"])

dir_name = os.path.join(data_directory, 'classification_data')

if not os.path.exists(dir_name):
    os.makedirs(dir_name)
   

df_train_small.to_csv(os.path.join(dir_name, "train_small.csv"), index=False)
df_test_small.to_csv(os.path.join(dir_name, "test_small.csv"), index=False)

In [240]:
df_train['label'].value_counts()

none       48071
cluster    31973
domino     16054
user          43
Name: label, dtype: int64

In [241]:
dir_name = os.path.join(data_directory, 'classification_data')

if not os.path.exists(dir_name):
    os.makedirs(dir_name)
    

df_train.to_csv(os.path.join(dir_name, "train.csv"), index=False)
df_test.to_csv(os.path.join(dir_name, "test.csv"), index=False)

## Groom the data into something that HuggingFace can take in

In [254]:
dir_name = os.path.join(data_directory, 'classification_data')

df_train = pd.read_csv(os.path.join(dir_name, "train_small.csv"))
df_test = pd.read_csv(os.path.join(dir_name, "test_small.csv"))

In [246]:
label_to_text

{0: 'domino', 1: 'user', 2: 'none', 3: 'cluster'}

In [256]:
unique_labels = df_train['label'].unique()
#label_to_text = {idx:unique_labels[idx] for idx in range(len(unique_labels))}
#text_to_label = {unique_labels[idx]:idx for idx in range(len(unique_labels))}
label_to_text = {0: 'none', 1: 'cluster', 2: 'domino', 3: 'user'}
text_to_label = {'none': 0, 'cluster':1, 'domino':2, 'user':3}

In [257]:
df_train['label'] = df_train['label'].apply(lambda x: text_to_label[x])
df_test['label'] = df_test['label'].apply(lambda x: text_to_label[x])

In [258]:
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

dataset_logs = DatasetDict({
        'train':dataset_train,
        'test':dataset_test
})

In [259]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(df):
    return tokenizer(df['text'], padding="max_length", truncation=True)

tokenized_datasets_logs = dataset_logs.map(tokenize_function)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [260]:
tokenized_datasets_logs = tokenized_datasets_logs.remove_columns(["text"])
tokenized_datasets_logs = tokenized_datasets_logs.rename_column("label", "labels")
tokenized_datasets_logs.set_format("torch")

In [261]:
train_dataset_logs = tokenized_datasets_logs["train"]
eval_dataset_logs = tokenized_datasets_logs["test"]

In [262]:
from torch.utils.data import DataLoader

train_dataloader_logs = DataLoader(train_dataset_logs, shuffle=True, batch_size=8)
eval_dataloader_logs = DataLoader(eval_dataset_logs, batch_size=8)

## Begin to Build the Model

In [263]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=4)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [264]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [265]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader_logs)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [267]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [268]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader_logs:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3375 [00:00<?, ?it/s]

In [269]:
!pip install evaluate

[33mDEPRECATION: feast 0.20.2 has a non-standard dependency specifier PyYAML>=5.4.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of feast or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: feast 0.20.2 has a non-standard dependency specifier tenacity>=7.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of feast or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: feast 0.20.2 has a non-standard dependency specifier dask<2022.02.0,>=2021.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of feast or contact the author to suggest tha

In [270]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader_logs:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 1.0}

In [271]:
from transformers import pipeline

model = model.to(device)
device_num = 0 if torch.cuda.is_available() else -1
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device_num)
classifier(df_train.text[10])

[{'label': 'LABEL_3', 'score': 0.9999047517776489}]

## TEST: Now find an error bundle which showed no errors and see if we can find something...

In [75]:
error_execution_ids = resource_usage_df.run_id[resource_usage_df["status"].isin(["Failed", "Error"])]

In [79]:
project_name = 'allstate_log_github'
data_directory = '/mnt/data/' + project_name + '/'

errors_df = pd.read_csv(data_directory+'error_analysis_all_time.csv')

In [91]:
import re

caught_errors = [re.sub(r"\/|\.zip", "", error) for error in errors_df.execution_id]

In [98]:
missed_errors = list(set(error_execution_ids) - set(caught_errors))

In [160]:
missed_errors[1002]

'63637adcc037d60af817bd83'

In [172]:
def find_abs_path(execution_id, support_bundles_directory=support_bundles_directory):
    all_paths = os.listdir(support_bundles_directory + execution_id)
    all_paths = [os.path.abspath(support_bundles_directory + execution_id + '/'+f) for f in all_paths]
    return(all_paths)

target_paths = find_abs_path(missed_errors[1060])

In [173]:
def open_all_files(path):
    with open(path, 'r') as f:
        raw_content = f.readlines()

    raw_content = [line.strip() for line in raw_content]
    
    return(raw_content)

all_content = [open_all_files(path) for path in target_paths]
all_content = [line for file in all_content for line in file]

In [None]:
desired_labels = ['LABEL_1', 'LABEL_2', 'LABEL_3']
error_analysis = dict()

for execution_id in missed_errors:
    target_paths = find_abs_path(execution_id)
    print("target_paths:\n", '\n'.join(target_paths))
    
    all_content = [open_all_files(path) for path in target_paths]
    if len(all_content) == 0:
        print("Execution Id " + execution_id + " does not have associated bundle files. This could be due to errors in the ways they were downloaded via API")
    all_content = [line for file in all_content for line in file]
    
    pred = classifier(all_content, max_length=512, truncation=True)
    
    indices = [i for i, p in enumerate(pred) if p['label'] in desired_labels]
    
    print("Execution Id: " + execution_id)
    print("Indices with Caught Errors: ["+ ', '.join(map(str, indices)) + ']')
    
    caught_errors = '\n'.join([all_content[i] for i in indices])
    print("Caught Errors: ", caught_errors)
    
    error_analysis[execution_id] = dict()
    error_analysis[execution_id]['num_errors'] = len(indices)
    error_analysis[execution_id]['caught_errors'] = caught_errors
    
    print("\n\n")

target_paths:
 /mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/performance-metrics.csv
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/nucleus-frontend-7d8bfd9496-xjsml-nucleus-frontend.log
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/describe-pods.log
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/saga.json
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/nucleus-frontend-7d8bfd9496-n6cxk-nucleus-frontend.log
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/execution.log
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/nginx.log
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/run-history.json
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/nucleus-dispatcher-786f69d4d6-wwdb5-nucleus-dispatcher.log
/mnt/data/allstate_log_github/support-bundles/64f89884fdd3bd2b95f4b6c2/logjam-202309

In [210]:
execution_id = '652c0e24b9c768436d7c1971'

target_paths = find_abs_path(execution_id)
all_content = [open_all_files(path) for path in target_paths]
all_content = [line for file in all_content for line in file]
pred = classifier(all_content)
#indices = [i for i, p in enumerate(pred) if p['label'] in desired_labels]
#error_analysis[i] = indices
#print("Execution Id: " + i)
#print("Indices with Caught Errors: ["+ ', '.join(indices) + ']')

In [202]:
indices = [i for i, p in enumerate(pred) if p['label'] in desired_labels]
print("Indices with Caught Errors: ["+ ', '.join(map(str, indices)) + ']')

Indices with Caught Errors: [13, 14, 15, 16, 17, 123, 124, 125, 126, 127]


In [201]:
indices

[13, 14, 15, 16, 17, 123, 124, 125, 126, 127]

In [183]:
tst = ['1', '2', '3']
print("Test ["+ ', '.join(tst) + ']')

Test [1, 2, 3]


## Now let's save out our model so we don't lose that good processing power!