In [None]:
# --- Helper Functions ---
import random
import numpy as np
import torch
import os
import pandas as pd
import re
import string
from tqdm import tqdm

RANDOM_SEED = 42  # Single seed shared by every random number generator below


def set_all_seeds(seed_value):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_value: Seed handed to each generator; also exported (as a
            string) in the ``PYTHONHASHSEED`` environment variable.

    Returns:
        None. The chosen seed is printed as a side effect.
    """
    print(f"Setting all seeds to: {seed_value}")

    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this assignment presumably only affects child processes, not the hash
    # randomisation of the kernel that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # CPU-side generators: stdlib `random`, NumPy's legacy global RNG, PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed_value)  # every visible GPU
    torch.cuda.manual_seed(seed_value)      # the current GPU


# Seed once, before any model, tokenizer, dataset or dataloader is created,
# so that everything downstream draws from the seeded generators.
set_all_seeds(RANDOM_SEED)

Context Window Set


In [None]:
def create_context_window(text, target_string, window_size=200):
    """Return the slice of ``text`` surrounding the first occurrence of ``target_string``.

    The window spans up to ``window_size`` characters before the match, the
    match itself, and up to ``window_size`` characters after it, clipped to
    the bounds of ``text``.

    Args:
        text: Text to search. Anything that is not a ``str`` (e.g. the NaN
            pandas produces for an empty CSV cell) yields ``None``.
        target_string: Substring to locate. Non-string values (e.g. a
            candidate that pandas parsed as a number) are converted with
            ``str``; ``None`` and NaN yield ``None``.
        window_size: Number of context characters kept on each side.

    Returns:
        str | None: The context window, or ``None`` when the inputs are
        unusable or ``target_string`` does not occur in ``text``.
    """
    if not isinstance(text, str) or target_string is None:
        return None

    # NaN is the only float that differs from itself; treat it as "missing".
    if isinstance(target_string, float) and target_string != target_string:
        return None

    if not isinstance(target_string, str):
        target_string = str(target_string)

    target_index = text.find(target_string)
    if target_index == -1:
        return None

    start_index = max(0, target_index - window_size)
    end_index = min(len(text), target_index + len(target_string) + window_size)
    return text[start_index:end_index]


In [None]:
def clean_text(text, remove_non_printable=True):
    """
    Normalise one text string and strip noisy technical fragments from it.

    Args:
    - text (str): The input text. Any non-string value yields "".
    - remove_non_printable (bool): Whether to drop every character that is
      not in ``string.printable``.

    Returns:
    - str: The cleaned text.
    """
    if not isinstance(text, str):
        return ""

    # --- Whitespace normalisation -------------------------------------
    result = text.strip()
    result = re.sub(r'[\r\n\t]+', ' ', result)   # newlines / tabs -> one space
    result = re.sub(r'\s+', ' ', result)         # collapse remaining whitespace runs

    # --- Embedded end-of-sequence markers -----------------------------
    result = re.sub(r'(</s>|<eos>)', '', result)

    # --- Optional removal of non-printable characters -----------------
    if remove_non_printable:
        allowed = set(string.printable)
        result = ''.join(ch for ch in result if ch in allowed)

    # --- Ordered scrub pipeline ---------------------------------------
    # Each (pattern, flags) entry is removed from the text in turn. The order
    # matters: later patterns operate on what earlier ones left behind.
    scrub_steps = (
        # quote characters and the box-drawing bar
        (r'[\'"\│]', 0),
        # `ls -l` style directory listing lines
        (r'drwx[-\s]*\d+\s+\w+\s+\w+\s+\d+\s+\w+\s+\d+\s+[0-9a-fA-F-]+.*', 0),
        # fenced shell code blocks
        (r'```shell([^`]+)```', re.IGNORECASE),
        # NOTE(review): double quotes were already removed by the first step,
        # so this quoted-shell pattern can no longer match - confirm intent.
        (r'```Shell\s*"([^"]*)"\s*```', re.IGNORECASE),
        # NOTE(review): whitespace was collapsed to single spaces above, so the
        # literal \n\n required here can no longer match - confirm intent.
        (r'<details><summary>Saved game</summary>\n\n```(.*?)```', 0),
        # dotted names such as package / module paths
        (r'(\w+\.)+\w+', 0),
        # Java stack-trace frames
        (r'at\s[\w.$]+\.([\w]+)\(([^:]+:\d+)\)', 0),
        # URLs carrying a fragment, then any remaining URLs
        (r'https?://[^\s#]+#[A-Za-z0-9\-\=\+]+', re.IGNORECASE),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 0),
        # 40-hex-digit commit ids introduced by "commit"
        (r'commit[ ]?(?:id)?[ ]?[:]?[ ]?([0-9a-f]{40})\b', re.IGNORECASE),
        # slash-prefixed file paths (two passes with different character sets)
        (r"/[\w/. :-]+", 0),
        (r'(/[^/\s]+)+', 0),
        # labelled hashes and build identifiers
        (r'sha256\s*[:]?[=]?\s*[a-fA-F0-9]{64}', 0),
        (r'git-tree-sha1\s*=\s*[a-fA-F0-9]+', 0),
        (r'build-id\s*[:]?[=]?\s*([a-fA-F0-9]+)', 0),
        # GUID triples (whitespace separated, then comma separated)
        (r'GUIDs:\s+([0-9a-fA-F-]+\s+[0-9a-fA-F-]+\s+[0-9a-fA-F-]+)', 0),
        (r'([0-9a-fA-F-]+\s*,\s*[0-9a-fA-F-]+\s*,\s*[0-9a-fA-F-]+)', 0),
        # anything wrapped in angle brackets
        (r'<([^>]+)>', 0),
        # UUIDs, only when introduced by a keyword (limits false positives)
        (r'(?:UUID|GUID|version|id)[\\=:"\'\s]*\b[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\b', re.IGNORECASE),
        # 0x-prefixed hex values, only when introduced by a keyword
        (r'(?:data|address|id)[\\=:"\'\s]*\b0x[0-9a-fA-F]+\b', re.IGNORECASE),
        # timestamped screenshot file names
        (r'Screenshot_(\d{4}[_-]\d{2}[_-]\d{2}[_-]\d{2}[_-]\d{2}[_-]\d{2}[_-]\d{2}[_-]\w+)', re.IGNORECASE),
    )
    for pattern, flags in scrub_steps:
        result = re.sub(pattern, '', result, flags=flags)

    return result

In [None]:

# Register `progress_apply` on pandas objects. Both `process_dataframe` and
# `extract_candidate_strings` call it, so this must run before either of them.
tqdm.pandas()

def process_dataframe(input_df: pd.DataFrame):
    """
    Add a context-window column to a copy of ``input_df`` and return it with
    the model-ready text and candidate lists.

    For each row, ``create_context_window(row['text'], row['candidate_string'])``
    is stored in a new ``'modified_text'`` column. The input frame itself is
    left untouched.

    Args:
        input_df (pd.DataFrame): Frame with ``'text'`` and
            ``'candidate_string'`` columns.

    Returns:
        tuple: ``(processed DataFrame, list of str context windows,
        list of str candidate strings)``. An empty input returns
        ``(copy of input, [], [])``.

    Raises:
        TypeError: If ``input_df`` is not a pandas DataFrame.
    """
    # Guard clauses: wrong type, then nothing to do.
    if not isinstance(input_df, pd.DataFrame):
        raise TypeError("Input must be a Pandas DataFrame.")
    if input_df.empty:
        print("Warning: Input DataFrame is empty.")
        return input_df.copy(), [], []

    preprocessed_df = input_df.copy()

    def _window_for_row(row):
        # One context window per (text, candidate) pair, default window size.
        return create_context_window(row['text'], row['candidate_string'])

    print("Creating context windows...")
    preprocessed_df['modified_text'] = preprocessed_df.progress_apply(_window_for_row, axis=1)

    # NOTE(review): create_context_window returns None when the candidate is
    # not found, and str(None) becomes the literal text 'None' here - confirm
    # that this is the intended model input for such rows.
    X_text = list(map(str, preprocessed_df['modified_text'].tolist()))
    X_candidate = list(map(str, preprocessed_df['candidate_string'].tolist()))

    return preprocessed_df, X_text, X_candidate


In [None]:


def extract_candidate_strings(regex_excel_path, issue_df):
    """
    Extracts candidate strings matching secret-detection regex patterns from issue text.

    The 'Issue Body' column is first cleaned with `clean_text`; every regular
    expression from the Excel sheet is then run against every cleaned body.

    Parameters:
        regex_excel_path (str): Path to the Excel file with regex patterns.
            The 'Secret Type' and 'Regular Expression' columns are used.
        issue_df (pd.DataFrame): DataFrame containing at least 'Issue ID' and
            'Issue Body'. It is not modified; a cleaned copy is returned.

    Returns:
        tuple: (result_df, cleaned_issue_df) where
            - result_df (pd.DataFrame) has the columns 'Type', 'Issue ID' and
              'Candidate String', de-duplicated on ('Issue ID',
              'Candidate String'). It is empty, but keeps these columns,
              when nothing matched.
            - cleaned_issue_df (pd.DataFrame) is a copy of `issue_df` whose
              'Issue Body' column holds the cleaned text.

    Raises:
        re.error: If a pattern in the sheet is not a valid regular expression.
    """
    # Work on a copy so the caller's frame keeps its raw issue bodies.
    issue_df = issue_df.copy()

    # Apply `clean_text` with progress bar (requires a prior `tqdm.pandas()`).
    print("Cleaning text...")
    issue_df['Issue Body'] = issue_df['Issue Body'].progress_apply(clean_text)

    # Load regex patterns
    excel_data = pd.read_excel(regex_excel_path)
    regex = pd.DataFrame(excel_data, columns=['Pattern_ID', 'Secret Type', 'Regular Expression', 'Source'])

    # Pull the columns out once instead of doing a label lookup per cell
    # inside the nested loop.
    issue_ids = issue_df['Issue ID'].tolist()
    issue_bodies = issue_df['Issue Body'].tolist()
    pattern_rows = list(zip(regex['Secret Type'], regex['Regular Expression']))

    result_columns = ['Type', 'Issue ID', 'Candidate String']
    records = []

    # Loop through each regex pattern with progress bar
    for secret_type, raw_pattern in tqdm(pattern_rows, desc="Processing regex patterns"):
        # Blank spreadsheet cells arrive as NaN; there is nothing to compile.
        if not isinstance(raw_pattern, str):
            continue
        pattern = re.compile(raw_pattern)

        for issue_id, cleaned_text in zip(issue_ids, issue_bodies):
            # set() drops repeats of the same match within one issue.
            for match in set(pattern.findall(cleaned_text)):
                records.append({
                    'Type': secret_type,
                    'Issue ID': issue_id,
                    'Candidate String': match
                })

    # Explicit columns keep the frame well-formed (and drop_duplicates valid)
    # even when no pattern matched anything.
    result_df = pd.DataFrame(records, columns=result_columns)
    result_df = result_df.drop_duplicates(subset=["Issue ID", "Candidate String"], keep='first')

    print(f"Extracted {result_df.shape[0]} unique matches.")
    return result_df, issue_df


## works fine

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report,f1_score,precision_score,recall_score
import numpy as np
import os # For path joining
from tqdm.auto import tqdm

In [None]:
import gc
# Release unreferenced Python objects left over from earlier cells.
gc.collect()
if torch.cuda.is_available(): # Only when a CUDA device is present
    # Hand PyTorch's cached, currently unused GPU memory back to the driver.
    torch.cuda.empty_cache()

# Parameters

In [None]:
# --- Backbone selection ---
# Exactly one MODEL_NAME assignment should be active; the others are kept as
# commented-out alternatives. MODEL_NAME also determines the checkpoint path
# and the name of the flagged-results CSV.

# RoBERTa (Robustly Optimized BERT Pretraining Approach)
MODEL_NAME    = "roberta-base"

# CodeBERT (Pre-trained on code and natural language)
#MODEL_NAME    = "microsoft/codebert-base"


# LUKE (Language Understanding with Knowledge-based Embeddings)
# - Enhances language models by incorporating entity embeddings and knowledge graph information.
# - Could be interesting if the secrets are named entities or have known structures.
#MODEL_NAME    = "studio-ousia/luke-base"



In [None]:
# --- Configuration ---
DATASET_TYPE = "balanced"   # sub-folder of models/ that holds the checkpoint
RANDOM_SEED = 42
BATCH_SIZE = 16
NUM_EPOCHS = 20             # training setting
LEARNING_RATE = 2e-5        # passed to AdamW
MAX_LENGTH = 256            # tokenizer truncation length for each encoded pair
# Checkpoint location: models/<dataset type>/best_<model name with "/" -> "_">_model.pth
BEST_MODEL_PATH = f"models/{DATASET_TYPE}/best_{MODEL_NAME.replace('/', '_')}_model.pth"
# Prefer the GPU when one is available, otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# import pandas as pd

# # Example input
# data = {
#     "1": {
#         "issue_body": """ I'm getting a permission error when trying to upload to our S3 bucket using the boto3 client. Here’s a snippet of the config I’m using:
# ```pyth
# on
# aws_access_key_id = "AKIAIOSFODNN7EXAMPLE"
# aws_secret_access_key = sk_67y453tc6hz129uz""",
#         "candidates": "sk_67y453tc6hz129uz"
#     },
#     "2": {
#         "issue_body": """I'm getting a permission error when trying to upload to our S3 bucket using the boto3 client. Here’s a snippet of the config I’m using:
# ```pyth
# on
# aws_access_key_id = "AKIAIOSFODNN7EXAMPLE"
# # dummy key
# aws_secret_access_key = sk_67y453tc6hz129uz""",
#         "candidates": "sk_67y453tc6hz129uz",
#     },
#     "3": {
#         "issue_body": """I'm getting a permission error when trying to upload to our S3 bucket using the boto3 client. Here’s a snippet of the config I’m using:
# ```pyth
# on
# aws_access_key_id = "AKIAIOSFODNN7EXAMPLE"
# aws_secret_access_key = XXXXXXXX""",
#         "candidates": " XXXXXXXX",
#     }
#     ,
#     "4": {
#         "issue_body": """ I'm getting a permission error when trying to upload to our S3 bucket using the boto3 client. Here’s a snippet of the config I’m using:
# ```pyth
# on
# aws_access_key_id = "AKIAIOSFODNN7EXAMPLE"
# aws_secret_access_key = DUMMY_KEY""",
#         "candidates": "DUMMY_KEY",
#     }
# }

# # Convert dictionary to a dataset (flatten the structure)
# dataset = []
# for issue_id, issue_info in data.items():
#     issue_body = issue_info["issue_body"]
#     candidate = issue_info["candidates"]
#     dataset.append({
#             "Issue ID": issue_id,
#             "text": issue_body,
#             "candidate_string": candidate
#         })

# # Create a DataFrame
# test_df = pd.DataFrame(dataset)

# # Print or save the dataset
# print(test_df)
# # df.to_csv("issue_dataset.csv", index=False)  # Uncomment to save to CSV


In [None]:
# df = pd.read_csv("Real_life_Issue_Reports_Merged.csv")
# data,df = extract_candidate_strings(
#     regex_excel_path='../dataset/Secret-Regular-Expression.xlsx',
#     issue_df=df
# )


In [None]:
# data = data.rename(columns={'Issue ID': 'Issue_id'})
# print(data.shape)
# print(data.head())
# merged_df = df.merge(data, left_on=['Issue ID'], right_on=['Issue_id'])
# print(merged_df.shape)
# columns_to_remove = ['Issue_id']
# merged_df.drop(columns=columns_to_remove, inplace=True)
# print(merged_df.columns)

# # Rename columns
# merged_df = merged_df.rename(columns={
#     'Issue Body': 'text',
#     'Candidate String': 'candidate_string'
# })
# print(merged_df.columns)
# print(merged_df.shape)


In [None]:
#merged_df.to_csv('merged_issues-with-candidate-strings.csv')


In [None]:
# Load the issue/candidate table written by the (now commented-out) export cell.
# NOTE(review): that export calls to_csv without index=False, so this file
# presumably carries an extra unnamed index column - confirm it is harmless.
merged_df = pd.read_csv('merged_issues-with-candidate-strings.csv')

In [None]:

# Build one context window per row and get the parallel text / candidate
# lists (both lists of str) that are fed to the tokenizer.
preprocessed_df,X_text_test,X_candidate_test = process_dataframe(merged_df)

print(f"Test samples: {len(X_text_test)}")

# --- Tokenization ---
# Load the tokenizer that belongs to the selected backbone.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
# The context window and the candidate string are encoded together as a
# sequence pair. A sequence-classification head accepts either:
# 1. A single sequence: tokenizer(text, truncation=True, padding=True)
# 2. A pair of sequences: tokenizer(text1, text2, truncation=True, padding=True)
#
# Option A (kept for reference, not used): concatenate manually with [SEP]
# X_combined_train = [text + " [SEP] " + cand for text, cand in zip(X_text_train, X_candidate_train)]
# train_encodings = tokenizer(X_combined_train, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LENGTH)
#
# Option B (used): pass both lists so the tokenizer builds each pair itself.
# The whole set is tokenized in one call and padded to a common length.
# NOTE(review): with truncation=True a pair longer than MAX_LENGTH is
# shortened; presumably the longer member (the context window) is cut
# first - confirm the candidate string always survives truncation.
print("Tokenizing test data...")
test_encodings = tokenizer(X_text_test, X_candidate_test, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LENGTH)


In [None]:

class PairDataset(Dataset):
    """Dataset over pre-tokenized sequence pairs.

    Each item is a dict holding one row of every tensor in ``encodings``
    plus two extra entries: ``'labels'`` (long tensor) and ``'index'``
    (the caller-supplied row identifier for that position).
    """

    def __init__(self, encodings, labels, indices=None):
        """Store the inputs.

        Args:
            encodings: Mapping of field name to a tensor indexable by row.
            labels: Per-row labels; each is cast with ``int`` on access.
            indices: Optional per-row identifiers. Defaults to
                ``0 .. len(labels) - 1``.
        """
        self.encodings = encodings
        self.labels = labels
        if indices is None:
            indices = list(range(len(labels)))
        self.indices = indices

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample dict for position ``idx``."""
        sample = {}
        for field, tensor in self.encodings.items():
            # Detached copy so the stored encodings are never shared.
            sample[field] = tensor[idx].clone().detach()
        sample['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        # Row identifier used to map predictions back to the source frame.
        sample['index'] = torch.tensor(self.indices[idx])
        return sample


# Placeholder labels: one zero per sample. np.zeros yields floats;
# PairDataset casts each value to int when an item is fetched.
# These are not ground-truth labels.

Y_labels_test_arr = np.zeros(len(X_text_test))

# Save indices to map predictions back to preprocessed_df
test_indices = list(preprocessed_df.index)

test_dataset = PairDataset(test_encodings, Y_labels_test_arr, indices=test_indices)
# shuffle=False keeps batches in row order.
# NOTE(review): batch_size is hard-coded to 32 although BATCH_SIZE (16) is
# defined in the configuration cell - confirm which one is intended here.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# --- Model, Optimizer, Scheduler ---
# Binary classification head: two output classes.
num_labels=2

# Instantiate the backbone with a fresh classification head, then move it
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
model.to(DEVICE)

# NOTE(review): the optimizer is created here but the inference cell that
# follows never steps it, and no scheduler is built despite the header -
# presumably left over from the training notebook; confirm before removing.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)




In [None]:
# Load the best checkpoint. map_location remaps the stored tensors onto
# DEVICE, so a checkpoint saved on a GPU also loads on a CPU-only machine.
state_dict = torch.load(BEST_MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)
model.to(DEVICE) # Ensure model is on the correct device after loading
model.eval()

all_test_preds = []
all_test_labels = []
all_indices = []
total_test_loss = 0

test_progress_bar = tqdm(test_loader, desc="Testing", leave=False)
with torch.no_grad():
    for batch in test_progress_bar:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # token_type_ids are only present for tokenizers that emit them, so
        # they are forwarded conditionally.
        extra_inputs = {}
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            extra_inputs['token_type_ids'] = token_type_ids.to(DEVICE)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels, **extra_inputs)

        # NOTE(review): the labels in test_loader are the all-zero
        # placeholders created when the dataset is built, so this loss is
        # presumably not a meaningful metric - confirm before reporting it.
        loss = outputs.loss
        total_test_loss += loss.item()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        all_test_preds.extend(predictions.cpu().numpy())
        all_test_labels.extend(labels.cpu().numpy())
        all_indices.extend(batch['index'].cpu().numpy())
        test_progress_bar.set_postfix({'test_loss': f"{loss.item():.4f}"})

# Collect the rows predicted as class 1, looked up through the row index
# that each sample carried with it through the DataLoader.
flagged_columns = ["Issue ID", "text", "candidate_string"]
dataset = [
    {column: preprocessed_df.loc[idx, column] for column in flagged_columns}
    for idx, pred in zip(all_indices, all_test_preds)
    if pred == 1
]

# Explicit columns keep the header in the CSV even when nothing was flagged.
flagged_df = pd.DataFrame(dataset, columns=flagged_columns)

# File name: <model name with "/" -> "_">_<number of flagged rows>_flagged.csv
flagged_file_name = MODEL_NAME.replace("/", "_") +"_"+str(len(dataset))+"_flagged.csv"
flagged_df.to_csv(flagged_file_name, index=False)


