In [77]:
import os
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

import warnings
warnings.filterwarnings("ignore")

# Set Up The Data

## Read in the Errors

In [3]:
# All inputs and outputs of this notebook live under one per-project data directory.
project_name = 'allstate_log_github'
data_directory = '/mnt/data/' + project_name + '/'

# Load the error-analysis CSV (one row per detected error).
error_csv_path = os.path.join(data_directory, 'error_analysis_all_time.csv')
errors_df = pd.read_csv(error_csv_path)

In [4]:
errors_df.columns

Index(['Error', 'Line_Number', 'Context', 'Error_Type', 'File_Path',
       'Date_Time', 'Node', 'execution_id'],
      dtype='object')

In [5]:
errors_df.Error_Type.value_counts()

cluster    35526
domino     17838
user          48
Name: Error_Type, dtype: int64

In [6]:
# Keep only the text ('Context') and its class ('Error_Type'); the other columns are not used below.
errors_df = errors_df.loc[:, ['Context', 'Error_Type']]

## Extract random lines from non-error support bundles and create a "no error" dataframe

In [7]:
# Each entry of the support-bundles directory is named after a run id;
# entries whose name contains '.zip' are skipped.
support_bundles_directory = data_directory + 'support-bundles/'
support_bundles = [
    entry
    for entry in os.listdir(support_bundles_directory)
    if '.zip' not in entry
]

In [8]:
len(support_bundles)

5548

In [9]:
resource_usage_directory = data_directory + 'resource-usage-by-day/'

# Only regular *.csv files are read: os.listdir also returns sub-directories and
# stray files (e.g. a Jupyter `.ipynb_checkpoints` folder), which would make
# read_csv fail. Sorting makes the row order of the combined frame repeatable.
resource_usage_csv_files = sorted(
    f
    for f in os.listdir(resource_usage_directory)
    if f.lower().endswith('.csv')
    and os.path.isfile(os.path.join(resource_usage_directory, f))
)

# Stack the per-file frames into one frame with a fresh 0..N-1 index.
resource_usage_df = pd.concat(
    [pd.read_csv(os.path.join(resource_usage_directory, f)) for f in resource_usage_csv_files],
    ignore_index=True,
)

In [10]:
resource_usage_df.head()

Unnamed: 0,run_id,run_title,command,status,starting_user,project_owner_username,project_name,total_run_duration_s_,run_duration_within_reporting_period_s_,hardware_tier,...,completed_timestamp,compute_cluster_type,master_hardware_tier,master_hardware_tier_cost_per_minute,worker_count,worker_hardware_tier,worker_hardware_tier_cost_per_minute,date,run_type,time_to_boot_s
0,63714cc09196ec342a497c35,echo test,echo test,Succeeded,bryce_beddard,bryce_beddard,testAPI,5.0,5,small-k8s,...,1668369640346,,,,,,,2022-11-13,batch,35
1,6370c0209196ec342a496c3e,Data Warehouse Daily Run,'/repos/domino_dw/DW_ETL.py',Succeeded,griffin_young,griffin_young,DW-Prod,6680.0,6680,small-k8s,...,1668340317363,,,,,,,2022-11-13,batch,36
2,640cddc0c4253f28887e0bbf,test.sh,test.sh,Failed,blake_moore,blake_moore,Finch,2.0,2,small-k8s,...,1678564826519,,,,,,,2023-03-11,batch,24
3,627ad072ee0eb12322fc991d,alireza_mounesisohi's JupyterLab session,[JupyterLab session ],Stopped,alireza_mounesisohi,alireza_mounesisohi,IMM-regression,8324.0,8324,small-k8s,...,1652224265867,,,,,,,2022-05-10,notebook,20
4,627abbfbd78c3b4fc50c85c5,Sahil-test,[vscode session ],Stopped,sahil.rahi,sahil.rahi,Test1,1007.0,1007,gpu-V100,...,1652211711849,,,,,,,2022-05-10,notebook,21


In [11]:
# Runs whose status is 'Running' or 'Succeeded' are treated as the error-free population.
no_errors = resource_usage_df.loc[
    lambda runs: runs['status'].isin(['Running', 'Succeeded'])
]

In [12]:
# Keep only the bundles that belong to an error-free run. The intersection is
# sorted because set iteration order for strings changes between interpreter
# runs, and everything sampled downstream depends on this order.
support_bundles = sorted(set(support_bundles) & set(no_errors.run_id))

In [13]:
def find_abs_path(support_bundle_path, support_bundles_directory=support_bundles_directory):
    """Return the absolute paths of the files directly inside one support bundle.

    Parameters
    ----------
    support_bundle_path : str
        Name of the bundle directory, relative to ``support_bundles_directory``.
    support_bundles_directory : str
        Directory that contains all bundles. Defaults to the notebook-level
        ``support_bundles_directory`` as it was when this function was defined.

    Returns
    -------
    list of str
        Absolute paths of the regular files in the bundle, sorted by file name.
        Sub-directories are skipped because the result is passed to ``open()``.

    Raises
    ------
    OSError
        If the bundle directory does not exist or cannot be listed.
    """
    # os.path.join works whether or not the base directory ends with a separator.
    bundle_dir = os.path.join(support_bundles_directory, support_bundle_path)
    # os.listdir order is arbitrary; sort so repeated runs see the same sequence.
    candidates = (
        os.path.abspath(os.path.join(bundle_dir, name))
        for name in sorted(os.listdir(bundle_dir))
    )
    return [path for path in candidates if os.path.isfile(path)]
 
# One flat list holding the absolute path of every entry in every selected bundle.
successful_paths = [
    path
    for target_support_bundle in support_bundles
    for path in find_abs_path(target_support_bundle)
]

In [14]:
def open_all_files(paths):
    """Read one text file and return its lines with surrounding whitespace removed.

    Parameters
    ----------
    paths : str
        Path of a single file (the parameter name is plural, but exactly one
        path is opened per call).

    Returns
    -------
    list of str
        One entry per line of the file, each passed through ``str.strip()``.
        Blank lines are kept as empty strings.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 and replace undecodable bytes, so a single
    # stray byte in a log does not abort the whole load with UnicodeDecodeError.
    with open(paths, 'r', encoding='utf-8', errors='replace') as f:
        return [line.strip() for line in f]

# Read every file and collect all of their stripped lines into one flat list.
all_content = [
    line
    for path in successful_paths
    for line in open_all_files(path)
]

In [21]:
# Keep a handle on the unfiltered lines, then drop lines of 10 characters or fewer.
all_content_save = all_content
all_content = [line for line in all_content_save if len(line) > 10]

In [22]:
# Draw exactly as many "no error" lines as there are error rows.
wanted_num_non_errors = errors_df.shape[0]

# A dedicated, seeded generator makes the draw repeatable and keeps re-running
# this cell from changing the sample (the global `random` state is untouched).
# NOTE(review): this is only reproducible if `all_content` is built in the same
# order on every run.
rng = random.Random(42)
indices = rng.sample(range(len(all_content)), wanted_num_non_errors)
target_content = [all_content[idx] for idx in indices]

In [23]:
# Same two columns as errors_df; every sampled line gets the class "none".
no_errors_df = pd.DataFrame({
    "Context": target_content,
    "Error_Type": ["none"] * len(target_content),
})

## Combine the errors with the non-errors and create training/testing data

In [31]:
# ignore_index=True gives the combined frame a unique 0..N-1 index; without it
# both inputs keep their own labels and every label appears twice.
# Renaming by column name (not by position) cannot silently mislabel a column.
all_data = (
    pd.concat([errors_df, no_errors_df], ignore_index=True)
    .rename(columns={'Context': 'text', 'Error_Type': 'label'})
)
all_data.shape

(106824, 2)

In [33]:
# 90/10 split that preserves the class proportions; the fixed random_state makes
# the split (and the CSVs written from it) identical on every run.
df_train, df_test = train_test_split(
    all_data,
    test_size=0.1,
    stratify=all_data["label"],
    random_state=42,
)

In [53]:
all_data.shape

(106824, 2)

In [55]:
# NOTE(review): exploratory leftover. This only builds a GroupBy object and displays
# its repr; nothing is assigned, so the cell has no effect on later cells.
all_data.groupby('label', group_keys=False)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3d93d25700>

In [60]:
def stratified_sample(df, stratifying_col, n, random_state=None):
    """Draw the same number of rows from every class of ``stratifying_col``.

    Each class contributes ``int(n / number_of_classes)`` rows. Classes with
    fewer rows than that are sampled with replacement (rows repeat); all other
    classes are sampled without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    stratifying_col : str
        Column whose distinct values define the classes.
    n : int
        Target total number of rows. The result has
        ``int(n / k) * k`` rows for ``k`` classes, which can be slightly below ``n``.
    random_state : int or numpy.random.Generator/RandomState, optional
        Forwarded to ``DataFrame.sample`` for every class. ``None`` (the default)
        keeps the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns and their original index
        labels, grouped class by class in sorted class order.
    """
    n_groups = df[stratifying_col].nunique()
    if n_groups == 0:
        # Nothing to sample from: return an empty frame with the same columns.
        return df.iloc[0:0]

    # Computed once instead of once (or twice) per group.
    per_group = int(n / n_groups)

    # Iterating the groups and concatenating keeps the stratifying column in the
    # result without depending on how groupby.apply treats grouping columns.
    samples = [
        group.sample(per_group, replace=len(group) < per_group, random_state=random_state)
        for _, group in df.groupby(stratifying_col)
    ]
    return pd.concat(samples)

# Get stratified sample with target number of rows
n = 1000
all_data_small = stratified_sample(all_data, 'label', n)

# 90/10 split of the small sample; the fixed random_state makes the split repeatable.
df_train_small, df_test_small = train_test_split(
    all_data_small,
    test_size=0.1,
    stratify=all_data_small["label"],
    random_state=42,
)

dir_name = os.path.join(data_directory, 'classification_data')

# exist_ok=True creates the directory only when needed, without the
# check-then-create gap of a separate os.path.exists test.
os.makedirs(dir_name, exist_ok=True)

# Persist the small split without the index column.
df_train_small.to_csv(os.path.join(dir_name, "train_small.csv"), index=False)
df_test_small.to_csv(os.path.join(dir_name, "test_small.csv"), index=False)

In [35]:
df_train['label'].value_counts()

none       48071
cluster    31973
domino     16054
user          43
Name: label, dtype: int64

In [36]:
dir_name = os.path.join(data_directory, 'classification_data')

# exist_ok=True creates the directory only when needed, without the
# check-then-create gap of a separate os.path.exists test.
os.makedirs(dir_name, exist_ok=True)

# Persist the full split without the index column.
df_train.to_csv(os.path.join(dir_name, "train.csv"), index=False)
df_test.to_csv(os.path.join(dir_name, "test.csv"), index=False)

## Convert the data into a format that Hugging Face `datasets` can ingest

In [94]:
# Reload the small split from disk.
# NOTE(review): this rebinds df_train / df_test, which until here held the full-size
# split; everything below therefore works on the small sample only.
df_train = pd.read_csv(os.path.join(dir_name, "train_small.csv"))
df_test = pd.read_csv(os.path.join(dir_name, "test_small.csv"))

In [98]:
# Number of distinct classes in the training data. Computed from df_train directly:
# `unique_labels` is only defined in a later cell, so referring to it here fails
# when the notebook is run top to bottom on a fresh kernel.
df_train['label'].nunique()

4

In [102]:
# Sort the class names so the integer id of each class is the same on every run;
# Series.unique() returns values in order of first appearance, which depends on
# how the rows happen to be ordered in the training CSV.
unique_labels = sorted(df_train['label'].unique())

# id -> class name, and the inverse class name -> id.
label_to_text = dict(enumerate(unique_labels))
text_to_label = {label: idx for idx, label in label_to_text.items()}

In [106]:
# Replace each class name with its integer id in both splits.
# A class name missing from text_to_label raises KeyError.
for frame in (df_train, df_test):
    frame['label'] = frame['label'].apply(text_to_label.__getitem__)

In [109]:
# Wrap both frames as Hugging Face datasets and bundle them under 'train' / 'test'.
dataset_train, dataset_test = map(Dataset.from_pandas, (df_train, df_test))

dataset_logs = DatasetDict({'train': dataset_train, 'test': dataset_test})

In [110]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(df):
    """Tokenize the 'text' field of a dataset example or batch.

    ``df`` is whatever ``DatasetDict.map`` passes in (a mapping with a 'text'
    entry), not a pandas DataFrame. Every sequence is padded to the tokenizer's
    maximum length and truncated if longer, so all outputs have the same length.
    """
    return tokenizer(df['text'], padding="max_length", truncation=True)

# batched=True hands the tokenizer many rows per call instead of one row at a time;
# with fixed-length padding the resulting columns are the same, only faster.
tokenized_datasets_logs = dataset_logs.map(tokenize_function, batched=True)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [111]:
# Drop the raw text, rename the target column to "labels", and switch the
# datasets to return torch tensors.
tokenized_datasets_logs = (
    tokenized_datasets_logs
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets_logs.set_format("torch")

In [136]:
# Pull the two splits out of the DatasetDict; the "test" split is used for evaluation.
train_dataset_logs, eval_dataset_logs = (
    tokenized_datasets_logs[split] for split in ("train", "test")
)

In [138]:
from torch.utils.data import DataLoader

# Batches of 8 for both loaders; only the training loader shuffles.
train_dataloader_logs = DataLoader(
    train_dataset_logs,
    batch_size=8,
    shuffle=True,
)
eval_dataloader_logs = DataLoader(
    eval_dataset_logs,
    batch_size=8,
)

## Begin to Build the Model

In [132]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the label mapping instead of a hard-coded 4,
# and store the id <-> class-name mapping in the model config so predicted ids
# can be turned back into class names.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(label_to_text),
    id2label=label_to_text,
    label2id=text_to_label,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [133]:
from torch.optim import AdamW

# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [134]:
from transformers import get_scheduler

# One scheduler step per batch: total steps = batches per epoch x epochs.
num_epochs = 3
num_training_steps = len(train_dataloader_logs) * num_epochs

# "linear" schedule with no warm-up steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [120]:
import torch

# Use the GPU when one is available, otherwise the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [139]:
# Sanity check: print the first batch the training loader yields (nothing if it is empty).
first_batch = next(iter(train_dataloader_logs), None)
if first_batch is not None:
    print(first_batch)

{'labels': tensor([2, 2, 1, 1, 2, 3, 1, 1]), 'input_ids': tensor([[  101, 17881,  1495,  ...,     0,     0,     0],
        [  101, 17881,  1495,  ...,     0,     0,     0],
        [  101,  2962,   131,  ...,     0,     0,     0],
        ...,
        [  101,   196,   107,  ...,  1545,  1559,   102],
        [  101,   118, 12461,  ...,     0,     0,     0],
        [  101, 17881,  1495,  ...,   117,  1888,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}


In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before iterating.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader_logs:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch is passed as keyword arguments; it includes a "labels" entry,
        # which is presumably why the output exposes a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Order matters: apply the gradients, advance the learning-rate schedule,
        # then clear the gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/339 [00:00<?, ?it/s]