In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
cd = os.getcwd()
datapath = os.path.join(cd,'defectors')

In [3]:
rand_train_file_path = os.path.join(datapath, 'line_bug_prediction_splits', 'time', 'train.parquet.gzip')
rand_val_file_path = os.path.join(datapath, 'line_bug_prediction_splits', 'time', 'val.parquet.gzip')
rand_test_file_path = os.path.join(datapath, 'line_bug_prediction_splits', 'time', 'test.parquet.gzip')
df_rand_train = pd.read_parquet(rand_train_file_path, engine='pyarrow')
df_rand_val = pd.read_parquet(rand_val_file_path, engine='pyarrow')
df_rand_test = pd.read_parquet(rand_test_file_path, engine='pyarrow')

We only care about the 'content' and 'lines' features.

In [4]:
df_rand_train_cleaned = df_rand_train[['content', 'lines']]
df_rand_val_cleaned = df_rand_val[['content', 'lines']]
df_rand_test_cleaned = df_rand_test[['content', 'lines']]

In [5]:
len(df_rand_train_cleaned), len(df_rand_val_cleaned), len(df_rand_test_cleaned)

(185369, 10000, 10000)

In [6]:
df_rand_train_cleaned.loc[:, 'content'] = df_rand_train_cleaned['content'].apply(lambda x: x.decode('latin1') if isinstance(x, bytes) else x)
df_rand_val_cleaned.loc[:, 'content'] = df_rand_val_cleaned['content'].apply(lambda x: x.decode('latin1') if isinstance(x, bytes) else x)
df_rand_test_cleaned.loc[:, 'content'] = df_rand_test_cleaned['content'].apply(lambda x: x.decode('latin1') if isinstance(x, bytes) else x)

In [7]:
df_rand_train_cleaned['content'] = df_rand_train_cleaned['content'].astype('string')
df_rand_val_cleaned['content'] = df_rand_val_cleaned['content'].astype('string')
df_rand_test_cleaned['content'] = df_rand_test_cleaned['content'].astype('string')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rand_train_cleaned['content'] = df_rand_train_cleaned['content'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rand_val_cleaned['content'] = df_rand_val_cleaned['content'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rand_test_cleaned['content'] = df_ra

In [8]:
df_rand_train.loc[:, 'content'] = df_rand_train_cleaned['lines'].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) else x)
df_rand_val.loc[:, 'content'] = df_rand_val_cleaned['lines'].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) else x)
df_rand_test.loc[:, 'content'] = df_rand_test_cleaned['lines'].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) else x)

In [9]:
len(df_rand_train_cleaned), len(df_rand_val_cleaned), len(df_rand_test_cleaned)

(185369, 10000, 10000)

Drop rows with null 'content'

In [10]:
df_rand_train_cleaned['content'].isnull().sum(), df_rand_val_cleaned['content'].isnull().sum(), df_rand_test_cleaned['content'].isnull().sum()

(2689, 232, 44)

In [11]:
df_rand_train_cleaned = df_rand_train_cleaned.dropna(subset=['content'])
df_rand_val_cleaned = df_rand_val_cleaned.dropna(subset=['content'])
df_rand_test_cleaned = df_rand_test_cleaned.dropna(subset=['content'])

In [12]:
len(df_rand_train_cleaned), len(df_rand_val_cleaned), len(df_rand_test_cleaned)

(182680, 9768, 9956)

In [13]:
(df_rand_train_cleaned['content'] == '').sum(), (df_rand_val_cleaned['content'] == '').sum(), (df_rand_test_cleaned['content'] == '').sum()

(298, 21, 15)

In [14]:
df_rand_test_cleaned = df_rand_test_cleaned[df_rand_test_cleaned['content'] != '']
df_rand_train_cleaned = df_rand_train_cleaned[df_rand_train_cleaned['content'] != '']
df_rand_val_cleaned = df_rand_val_cleaned[df_rand_val_cleaned['content'] != '']

In [15]:
len(df_rand_train_cleaned), len(df_rand_val_cleaned), len(df_rand_test_cleaned)

(182382, 9747, 9941)

In [16]:
df_rand_train_cleaned['content'].duplicated().sum(), df_rand_val_cleaned['content'].duplicated().sum(), df_rand_test_cleaned['content'].duplicated().sum()

(2860, 127, 268)

In [17]:
def combine_as_set(series):
    combined_set = set()
    for item in series:
        combined_set.update(item)
    return tuple(combined_set)

In [18]:
for index, row in df_rand_test_cleaned.iterrows():
    if len(row['content']) == 0:  # Example condition
        print(row)

In [19]:
df_rand_train_cleaned = df_rand_train_cleaned.groupby('content', as_index=False).agg({'lines': combine_as_set})
df_rand_val_cleaned = df_rand_val_cleaned.groupby('content', as_index=False).agg({'lines': combine_as_set})
df_rand_test_cleaned = df_rand_test_cleaned.groupby('content', as_index=False).agg({'lines': combine_as_set})

In [20]:
len(df_rand_train_cleaned), len(df_rand_val_cleaned), len(df_rand_test_cleaned)

(179522, 9620, 9673)

We remve all empty liens and adjust the bug line numbers accordingly.

In [21]:
def remove_empty_lines_and_adjust(df):
    """
    Remove empty lines from the 'content' column and adjust the line numbers in the 'lines' column.
    """

    new_lines = []
    new_contents = []

    for index, row in df.iterrows():
        content = row['content']
        bug_lines = row['lines']

        # Split the content into lines
        lines = content.split('\n')

        # Remove empty lines and keep track of the new line numbers
        new_content_lines = []
        line_mapping = {}
        new_line_number = 0

        for old_line_number, line in enumerate(lines):
            if line.strip():  # Check if the line is not empty
                new_content_lines.append(line)
                line_mapping[old_line_number] = new_line_number
                new_line_number += 1

        # Adjust the bug line numbers
        new_bug_lines = [line_mapping[old_line_number] for old_line_number in bug_lines if old_line_number in line_mapping]

        # Join the new content lines into a single string
        new_content = '\n'.join(new_content_lines)
        new_contents.append(new_content)
        new_lines.append(new_bug_lines)
    new_df = pd.DataFrame({'content': new_contents, 'lines': new_lines})
    return new_df

In [22]:
df_rand_train_cleaned = remove_empty_lines_and_adjust(df_rand_train_cleaned)
df_rand_val_cleaned = remove_empty_lines_and_adjust(df_rand_val_cleaned)
df_rand_test_cleaned = remove_empty_lines_and_adjust(df_rand_test_cleaned)

In [23]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
random_seed = 42
torch.manual_seed(random_seed)

<torch._C.Generator at 0x16512d7ebf0>

In [25]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [26]:
def prepare_data(full_code, buggy_lines, window_size=30):
    lines = full_code.split("\n")
    labels = [1 if i + 1 in buggy_lines else 0 for i in range(len(lines))]
    return lines, labels

In [27]:
def split_into_chunks(lines, window_size=30, overlap=15):
    """
    Splits a list of lines into overlapping chunks of a specified window size.
    """
    chunks = []
    for i in range(0, len(lines), window_size - overlap):
        chunk = lines[i:i + window_size]
        chunks.append(chunk)
    return chunks

In [28]:
class ChunkCodeDataset(Dataset):
    def __init__(self, dataframe, retrained_tokenizer_name="bert-base-uncased"):
        self.dataframe = dataframe
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        code = self.dataframe.iloc[idx]['content']
        if isinstance(code, bytes):
            code = code.decode('utf-8')
        buggy_lines = self.dataframe.iloc[idx]['lines']
        lines, labels = prepare_data(code, buggy_lines)
        chunks = split_into_chunks(lines)

         # Tokenize each chunk
        tokenized_chunks, chunk_labels = [], []
        for chunk in chunks:
            concatenated_lines = " [SEP] ".join(chunk)
            tokenized = self.tokenizer(
                concatenated_lines,
                padding='max_length',
                truncation=True,
                max_length=512,
                return_tensors="pt" # Return PyTorch tensors
            )
            tokenized_chunks.append(tokenized)
            
            # Extract corresponding labels for this chunk
            start_idx = lines.index(chunk[0])
            end_idx = start_idx + len(chunk)
            chunk_label = labels[start_idx:end_idx]
            if len(chunk_label) < 30:
                chunk_label.extend([0] * (30 - len(chunk_label)))
            chunk_labels.append(torch.tensor(chunk_label))

        return tokenized_chunks, chunk_labels

In [None]:
class ChunkLineClassifier(torch.nn.Module):
    def __init__(self, pretrained_model_name="bert-base-uncased", hidden_size=768):
        super().__init__()
        self.codebert = BertModel.from_pretrained(pretrained_model_name)
        self.line_classifier = torch.nn.Linear(hidden_size, 1)  # Binary classification per line

    def forward(self, input_ids, attention_mask):
        outputs = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)
        # Aggregate embeddings for each line
        sep_token_id = self.codebert.config.sep_token_id
        batch_line_outputs = []
        for batch_idx in range(input_ids.size(0)):
            # Find the positions of all [SEP] tokens (Line breaks) in chunk
            sep_positions = (input_ids[batch_idx] == sep_token_id).nonzero(as_tuple=True)[0]
            line_embs = []
            start_idx = 0
            for sep_idx in sep_positions:
                line_emb = token_embeddings[batch_idx, start_idx:sep_idx].mean(dim=0)  # Mean-pooling
                line_embs.append(line_emb)
                start_idx = sep_idx + 1
            break
            batch_line_outputs.append(torch.stack(line_embs))  # (num_lines_in_chunk, hidden_size)
        # Apply line-level classifier
        line_probs = [torch.sigmoid(self.line_classifier(line_embs)) for line_embs in batch_line_outputs]
        return line_probs  # List of (num_lines_in_chunk, 1) tensors

In [30]:
def aggregate_predictions(chunk_predictions, line_count):
    # Initialize list to store probabilities for each line
    line_probs = [[] for _ in range(line_count)]
    for chunk_idx, predictions in enumerate(chunk_predictions):
        for line_idx, prob in enumerate(predictions):
            global_line_idx = chunk_idx * (30 - 15) + line_idx
            if global_line_idx < line_count:
                line_probs[global_line_idx].append(prob)
    # Average probabilities for each line
    return [sum(probs) / len(probs) if probs else 0 for probs in line_probs]

In [31]:
classifier = ChunkLineClassifier()
optimizer = optim.Adam(classifier.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [None]:
def train(classifier, dataloader):
    classifier.train()
    losses = []
    for epoch in range(1):  # Example: 3 epochs
        for batch in dataloader:
            chunks, labels = zip(*batch)
            chunks = chunks[0]
            labels = labels[0]
            input_ids_list = [chunk['input_ids'][0] for chunk in chunks]
            attention_mask_list = [chunk['attention_mask'][0] for chunk in chunks]
            # Stack the tensors
            input_ids_stacked = torch.stack(input_ids_list).to(device)
            attention_mask_stacked = torch.stack(attention_mask_list).to(device)
            labels_stacked = torch.stack(labels).to(device)
            outputs = classifier(input_ids_stacked, attention_mask_stacked)
            loss = torch.nn.BCELoss()(outputs, labels_stacked)  # Binary cross-entropy loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            break
            losses.append(loss.item())
    return losses

In [33]:
dataset = ChunkCodeDataset(df_rand_train_cleaned[['content', 'lines']])
batch_size = 1
subset_dataset = Subset(dataset, range(10))
dataloader = DataLoader(subset_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)

In [34]:
train(classifier, dataloader)

torch.Size([1, 512, 768])


AttributeError: 'NoneType' object has no attribute 'shape'

In [111]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, classifier, dataloader):
    model.eval()
    classifier.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            code, buggy_lines = zip(*batch)
            code = code[0]
            tokenized_lines, num_lines = tokenize_lines(code, tokenizer)
            with torch.no_grad():
                output = model(**tokenized_lines)
            last_hidden_states = output.last_hidden_state
            # Use the [CLS] token representation
            logits = classifier(last_hidden_states[:, 0, :])
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            buggy_lines = buggy_lines[0]
            labels = prepare_labels(num_lines, buggy_lines).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='binary')
    recall = recall_score(all_labels, all_preds, average='binary')
    f1 = f1_score(all_labels, all_preds, average='binary')

    return accuracy, precision, recall, f1, all_preds, all_labels

In [112]:
val_dataset = CodeDataset(df_rand_val_cleaned[['content', 'lines']])
val_subset_dataset = Subset(val_dataset, range(10))
batch_size = 1
val_dataloader = DataLoader(val_subset_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)

NameError: name 'CodeDataset' is not defined

In [None]:
losses = train(model, classifier, dataloader)

Training:   1%|          | 1/100 [00:18<30:39, 18.59s/it]


KeyboardInterrupt: 

In [None]:
accuracy, precision, recall, f1, all_preds, all_labels = evaluate(model, classifier, val_dataloader)

Validation: 100%|██████████| 10/10 [03:48<00:00, 22.88s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
