In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# ***GENERATING POSITIVE AND NEGATIVE REPORTS***

In [None]:
import tarfile
import os
import xml.etree.ElementTree as ET

def extract_tar_bz2(tar_path, extract_path):
    os.makedirs(extract_path, exist_ok=True)
    with tarfile.open(tar_path, "r:bz2") as tar:
        tar.extractall(path=extract_path)
    return extract_path

def parse_bug_reports(directory):
    positive_reports = []
    negative_reports = []

    for dir_root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".xml"):
                file_path = os.path.join(dir_root, file)
                tree = ET.parse(file_path)
                xml_root = tree.getroot()

                for bug in xml_root.findall("bug"):
                    bug_id = bug.find("bug_id").text if bug.find("bug_id") is not None else "Unknown"
                    description = bug.find("short_desc").text if bug.find("short_desc") is not None else "No description"
                    resolution = bug.find("resolution").text if bug.find("resolution") is not None else "UNRESOLVED"

                    report_text = f"Bug ID: {bug_id}\nDescription: {description}\nResolution: {resolution}\n\n"

                    if resolution in ["FIXED", "VERIFIED", "DUPLICATE"]:
                        positive_reports.append(report_text)
                    else:
                        negative_reports.append(report_text)

    return positive_reports, negative_reports

def save_reports(positive_reports, negative_reports, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, "positive_reports.txt"), "w", encoding="utf-8") as pos_file:
        pos_file.writelines(positive_reports)

    with open(os.path.join(output_dir, "negative_reports.txt"), "w", encoding="utf-8") as neg_file:
        neg_file.writelines(negative_reports)

def main():
    tar_path = "/content/drive/MyDrive/llvm.tar.bz2"
    extract_path = "/content/sample_data/extracted"
    output_dir = "/content/sample_data/out"

    print("Extracting tar.bz2 file...")
    extract_tar_bz2(tar_path, extract_path)

    print("Parsing bug reports...")
    positive_reports, negative_reports = parse_bug_reports(extract_path)

    print("Saving reports...")
    save_reports(positive_reports, negative_reports, output_dir)

    print("Processing complete! Positive and negative reports saved.")

if __name__ == "__main__":
    main()


Extracting tar.bz2 file...
Parsing bug reports...
Saving reports...
Processing complete! Positive and negative reports saved.


# ***TRAINING MODEL ON LLVM DATASET***

In [None]:
import tensorflow as tf
from keras import backend as K
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import numpy as np
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Input, concatenate
from keras.models import Model, load_model
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
import gc
import os
import json
from datetime import datetime
from itertools import islice
import multiprocessing
import tempfile
import psutil
import time


os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


NUM_WORKERS = min(2, max(1, multiprocessing.cpu_count() - 1))
DEFAULT_THRESHOLD = 0.3

def print_memory_usage(label):
    """Monitor memory usage during execution"""
    try:
        process = psutil.Process(os.getpid())
        memory_mb = process.memory_info().rss / (1024 * 1024)
        print(f"[MEMORY] {label}: {memory_mb:.2f} MB")
    except Exception as e:
        print(f"[STATUS] {label} (memory monitoring failed: {e})")

class TextClassificationDataset(Dataset):
    """Dataset class for handling text classification data"""
    def __init__(self, reports, tokenizer, max_length):
        self.reports = reports
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reports)

    def __getitem__(self, idx):
        text = " ".join(self.reports[idx][:self.max_length])
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {k: v.squeeze(0) for k, v in encoding.items()}

def load_reports(path, max_reports=None, chunk_size=1000):
    """Load reports with memory-efficient chunking"""
    reports = []
    with open(path, 'r', encoding='UTF-8') as infile:
        while True:
            if max_reports and len(reports) >= max_reports:
                break
            chunk_to_read = min(chunk_size, max_reports - len(reports)) if max_reports else chunk_size
            chunk = list(islice(infile, chunk_to_read))
            if not chunk:
                break
            reports.extend(chunk)
            if len(reports) % (chunk_size * 5) == 0:
                gc.collect()
    return reports

def process_reports(reports):
    """Process reports into word lists"""
    processed = []
    chunk_size = 5000
    for i in range(0, len(reports), chunk_size):
        chunk = reports[i:i+chunk_size]
        processed.extend([report.replace(' \n', '').split(' ') for report in chunk])
        if i + chunk_size < len(reports):
            gc.collect()
    return processed

def generate_embeddings_batch(model_dict, dataset, output_path, batch_size=16, max_length=30):
    """Generate embeddings in batches with memory optimization"""
    model = model_dict["model"]
    device = model_dict["device"]

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
        pin_memory=True if device.type == 'cuda' else False
    )

    embedding_dim = model.config.hidden_size
    total_samples = len(dataset)

    with open(output_path, 'w+b') as f:
        memmap = np.memmap(
            f,
            dtype=np.float16,
            mode='w+',
            shape=(total_samples, max_length, embedding_dim)
        )

        current_idx = 0
        for batch in tqdm(dataloader, desc="Generating embeddings"):
            batch = {k: v.to(device) for k, v in batch.items()}
            batch_size_actual = batch['input_ids'].size(0)
            with torch.no_grad():
                if device.type == 'cuda':
                    with torch.amp.autocast(device_type='cuda'):
                        outputs = model(**batch)
                else:
                    outputs = model(**batch)

                hidden_states = outputs.last_hidden_state
                embeddings = hidden_states.cpu().numpy().astype(np.float16)

                end_idx = min(current_idx + batch_size_actual, total_samples)
                memmap[current_idx:end_idx] = embeddings[:end_idx-current_idx]
                memmap.flush()
                current_idx = end_idx

                del outputs, hidden_states, embeddings
                batch = {k: v.detach().cpu() for k, v in batch.items()}
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                gc.collect()

    return output_path

def load_model_transformer(model_directory):
    """Load transformer model with memory optimization"""
    print("Loading transformer model...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_kwargs = {'low_cpu_mem_usage': True}
    if device.type == 'cuda':
        model_kwargs['torch_dtype'] = torch.float16
    tokenizer = AutoTokenizer.from_pretrained(model_directory)
    model = AutoModel.from_pretrained(model_directory, **model_kwargs).to(device)
    model.eval()
    if hasattr(model, 'gradient_checkpointing_enable'):
        model.gradient_checkpointing_enable()
    return {"model": model, "tokenizer": tokenizer, "device": device}

def get_samples(emb_path, padding_size, dimension):
    """Calculate number of samples in embedding file"""
    file_size = os.path.getsize(emb_path)
    return file_size // (padding_size * dimension * 2)

def CNN_preprocess(pos_path, neg_path, model_path, max_reports=None, padding_size=30, batch_size=16):
    """Preprocess data and generate embeddings"""
    print_memory_usage("Before loading reports")

    pos_reports = load_reports(pos_path, max_reports)
    print_memory_usage("After loading positive reports")
    pos_reports = process_reports(pos_reports)
    print_memory_usage("After processing positive reports")
    neg_reports = load_reports(neg_path, max_reports)
    print_memory_usage("After loading negative reports")
    neg_reports = process_reports(neg_reports)
    print_memory_usage("After processing negative reports")

    if not padding_size:
        sample_reports = pos_reports[:1000] + neg_reports[:1000]
        padding_size = min(max(len(r) for r in sample_reports), 100)
        del sample_reports
        gc.collect()

    print(f"Using padding size: {padding_size}")


    model_dict = load_model_transformer(model_path)
    dimension = model_dict["model"].config.hidden_size
    print_memory_usage("After loading model")


    pos_emb_path = tempfile.mktemp()
    neg_emb_path = tempfile.mktemp()

    pos_dataset = TextClassificationDataset(pos_reports, model_dict["tokenizer"], padding_size)
    print_memory_usage("Before generating positive embeddings")
    generate_embeddings_batch(model_dict, pos_dataset, pos_emb_path, batch_size, padding_size)
    print_memory_usage("After generating positive embeddings")
    del pos_dataset, pos_reports
    gc.collect()

    neg_dataset = TextClassificationDataset(neg_reports, model_dict["tokenizer"], padding_size)
    print_memory_usage("Before generating negative embeddings")
    generate_embeddings_batch(model_dict, neg_dataset, neg_emb_path, batch_size, padding_size)
    print_memory_usage("After generating negative embeddings")
    del neg_dataset, neg_reports
    gc.collect()


    del model_dict["model"], model_dict["tokenizer"]
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    print_memory_usage("After clearing model")


    pos_samples = get_samples(pos_emb_path, padding_size, dimension)
    neg_samples = get_samples(neg_emb_path, padding_size, dimension)
    total_samples = pos_samples + neg_samples
    print(f"Positive samples: {pos_samples}, Negative samples: {neg_samples}")

    combined_emb_path = tempfile.mktemp()
    x_combined = np.memmap(combined_emb_path, dtype=np.float16, mode='w+',
                           shape=(total_samples, padding_size, dimension))

    try:

        chunk_size = 500
        print("Merging positive embeddings...")
        x_pos = np.memmap(pos_emb_path, dtype=np.float16, mode='r',
                          shape=(pos_samples, padding_size, dimension))
        for i in tqdm(range(0, pos_samples, chunk_size)):
            end = min(i + chunk_size, pos_samples)
            x_combined[i:end] = x_pos[i:end]
        del x_pos

        print("Merging negative embeddings...")
        neg_start = pos_samples
        x_neg = np.memmap(neg_emb_path, dtype=np.float16, mode='r',
                          shape=(neg_samples, padding_size, dimension))
        for i in tqdm(range(0, neg_samples, chunk_size)):
            end = min(i + chunk_size, neg_samples)
            x_combined[neg_start+i:neg_start+end] = x_neg[i:end]
        del x_neg

        x_combined.flush()
        print_memory_usage("After merging embeddings")


        y_combined_path = tempfile.mktemp()
        y_combined = np.memmap(y_combined_path, dtype=np.float32, mode='w+',
                               shape=(total_samples, 2))

        y_combined[:, 0] = 0
        y_combined[:, 1] = 1

        chunk_size = 5000
        for i in tqdm(range(0, pos_samples, chunk_size)):
            end = min(i + chunk_size, pos_samples)
            y_combined[i:end, 0] = 1
            y_combined[i:end, 1] = 0
        y_combined.flush()
        print_memory_usage("After creating labels")
    finally:
        os.remove(pos_emb_path)
        os.remove(neg_emb_path)
        gc.collect()

    return combined_emb_path, y_combined_path, padding_size, dimension, total_samples, {
        "pos_samples": pos_samples,
        "neg_samples": neg_samples
    }

class BalancedDataGenerator(tf.keras.utils.Sequence):
    """Data generator without SMOTE"""
    def __init__(self, x_path, y_path, indices, batch_size, padding_size, dimension):
        self.x_path = x_path
        self.y_path = y_path
        self.indices = indices
        self.batch_size = batch_size
        self.padding_size = padding_size
        self.dimension = dimension
        self.total_samples = get_samples(x_path, padding_size, dimension)

    def __len__(self):
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, idx):
        batch_indices = self.indices[idx*self.batch_size : (idx+1)*self.batch_size]
        x_mem = np.memmap(self.x_path, dtype=np.float16, mode='r',
                          shape=(self.total_samples, self.padding_size, self.dimension))
        y_mem = np.memmap(self.y_path, dtype=np.float32, mode='r',
                          shape=(self.total_samples, 2))
        x_batch = np.array(x_mem[batch_indices], dtype=np.float32)
        y_batch = np.array(y_mem[batch_indices], dtype=np.float32)
        return x_batch, y_batch

    def on_epoch_end(self):
        gc.collect()

class IntervalCheckpoint(tf.keras.callbacks.Callback):
    def __init__(self, checkpoint_dir, save_interval_seconds=300, max_to_keep=2):
        """
        Args:
            checkpoint_dir (str): Directory where checkpoints are saved.
            save_interval_seconds (int): Interval between saves in seconds.
            max_to_keep (int): Number of latest checkpoints to keep.
        """
        super().__init__()
        self.checkpoint_dir = checkpoint_dir
        self.save_interval_seconds = save_interval_seconds
        self.max_to_keep = max_to_keep
        self.last_save_time = time.time()
        self.checkpoints = []
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

    def on_epoch_end(self, epoch, logs=None):
        current_time = time.time()
        if current_time - self.last_save_time >= self.save_interval_seconds:
            checkpoint_filename = f"checkpoint_epoch_{epoch+1}_{int(current_time)}.h5"
            checkpoint_path = os.path.join(self.checkpoint_dir, checkpoint_filename)
            self.model.save(checkpoint_path)
            print(f"Saved checkpoint: {checkpoint_path}")
            self.checkpoints.append(checkpoint_path)
            if len(self.checkpoints) > self.max_to_keep:
                file_to_remove = self.checkpoints.pop(0)
                if os.path.exists(file_to_remove):
                    os.remove(file_to_remove)
                    print(f"Removed old checkpoint: {file_to_remove}")
            self.last_save_time = current_time

class EnhancedTextClassifier:
    """Enhanced CNN model with multi-scale features and F1 optimization"""
    def __init__(self, input_shape, class_weights=None):
        self.input_shape = input_shape
        self.class_weights = class_weights
        self.model = self._build_model()
        self._compile_model()

    def _build_model(self):
        input_layer = Input(shape=self.input_shape)

        branches = []
        for kernel_size in [2, 3, 5]:
            branch = Conv1D(64, kernel_size, padding='same', activation='relu')(input_layer)
            branch = BatchNormalization()(branch)
            branch = GlobalMaxPooling1D()(branch)
            branches.append(branch)
        x = concatenate(branches)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.5)(x)
        x = Dense(64, activation='relu')(x)
        x = Dropout(0.3)(x)
        outputs = Dense(2, activation='softmax')(x)
        return Model(inputs=input_layer, outputs=outputs)

    def _compile_model(self):
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            metrics=['accuracy', self._f1_score]
        )

    @staticmethod
    def _f1_score(y_true, y_pred):
        true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
        possible_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true, 0, 1)))
        predicted_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        recall = true_positives / (possible_positives + K.epsilon())
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    def train(self, train_gen, val_gen, epochs=10, checkpoint_dir="/content/drive/MyDrive/checkpoints"):
        callbacks = [
            EarlyStopping(monitor='val_f1_score', patience=3, mode='max', restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_f1_score', factor=0.2, patience=2, mode='max'),
            IntervalCheckpoint(checkpoint_dir=checkpoint_dir, save_interval_seconds=300, max_to_keep=2)
        ]
        self.history = self.model.fit(
            train_gen,
            validation_data=val_gen,
            epochs=epochs,
            callbacks=callbacks,
            class_weight=self.class_weights,
            verbose=1
        )

    def evaluate(self, test_gen, threshold=DEFAULT_THRESHOLD):
        y_true, y_pred = [], []
        for x_batch, y_batch in test_gen:
            preds = self.model.predict(x_batch)
            y_pred.extend((preds[:, 1] > threshold).astype(int))
            y_true.extend(np.argmax(y_batch, axis=1))
        return self._calculate_metrics(y_true, y_pred)

    def _calculate_metrics(self, y_true, y_pred):
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
        return {
            'accuracy': np.mean(np.array(y_true) == np.array(y_pred)),
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'cm': confusion_matrix(y_true, y_pred)
        }

def save_model_with_metadata(model, model_dir, metadata=None):
    """Save model with associated metadata"""
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_path = os.path.join(model_dir, "model.h5")
    model.save(model_path)
    print(f"Model saved to {model_path}")
    if metadata:
        metadata_path = os.path.join(model_dir, "metadata.json")
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=4)
        print(f"Model metadata saved to {metadata_path}")
    return model_path

def load_checkpoint_model(checkpoint_path):
    """Load model from checkpoint"""
    print(f"Loading model from checkpoint: {checkpoint_path}")
    model = load_model(checkpoint_path, custom_objects={
        '_f1_score': EnhancedTextClassifier._f1_score
    })
    print("Model successfully loaded from checkpoint")
    return model

def find_checkpoint(checkpoint_dir, specific_checkpoint=None):
    """Find the latest checkpoint or a specific one in the checkpoint directory"""
    if specific_checkpoint:
        checkpoint_path = os.path.join(checkpoint_dir, specific_checkpoint)
        if os.path.exists(checkpoint_path):
            return checkpoint_path
        else:
            raise FileNotFoundError(f"Specified checkpoint not found: {checkpoint_path}")


    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith('.h5')]
    if not checkpoints:
        raise FileNotFoundError(f"No checkpoints found in directory: {checkpoint_dir}")


    checkpoints.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True)
    latest_checkpoint = os.path.join(checkpoint_dir, checkpoints[0])
    return latest_checkpoint

def continue_training_from_checkpoint(checkpoint_path, new_pos_path, new_neg_path, model_path,
                                     batch_size=64, nb_epoch=10, model_save_dir=None):
    """Train existing model from checkpoint on new data"""
    print_memory_usage("Before continued training")


    model = load_checkpoint_model(checkpoint_path)


    print("Preprocessing new data...")
    x_path, y_path, padding_size, dimension, total_samples, dataset_metadata = CNN_preprocess(
        new_pos_path,
        new_neg_path,
        model_path,
        max_reports=None,
        batch_size=256
    )
    print_memory_usage("After preprocessing new data")


    pos_samples = dataset_metadata['pos_samples']
    neg_samples = dataset_metadata['neg_samples']
    class_weights = {
        0: (pos_samples + neg_samples) / (2 * neg_samples),
        1: (pos_samples + neg_samples) / (2 * pos_samples)
    }


    indices = np.random.permutation(total_samples)
    train_idx, test_idx = train_test_split(indices, test_size=0.2)
    train_gen = BalancedDataGenerator(x_path, y_path, train_idx, batch_size, padding_size, dimension)
    test_gen = BalancedDataGenerator(x_path, y_path, test_idx, batch_size, padding_size, dimension)


    input_shape = (padding_size, dimension)
    classifier = EnhancedTextClassifier(input_shape, class_weights)

    classifier.model = model


    classifier._compile_model()


    print("Continuing training from checkpoint...")
    checkpoint_dir = os.path.join(model_save_dir, "continued_checkpoints")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    classifier.train(train_gen, test_gen, epochs=nb_epoch, checkpoint_dir=checkpoint_dir)


    metrics = classifier.evaluate(test_gen, threshold=DEFAULT_THRESHOLD)


    result = {
        'model': classifier.model,
        'metrics': metrics,
        'history': classifier.history.history
    }


    if model_save_dir:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_dir = os.path.join(model_save_dir, f"continued_model_{timestamp}")
        metadata = {
            'timestamp': timestamp,
            'input_shape': input_shape,
            'original_checkpoint': checkpoint_path,
            'class_distribution': dataset_metadata,
            'optimal_threshold': DEFAULT_THRESHOLD,
            'metrics': metrics,
            'training_history': classifier.history.history
        }
        saved_model_path = save_model_with_metadata(classifier.model, save_dir, metadata)
        result['saved_model_path'] = saved_model_path

    print_memory_usage("After continued training")
    return result


if __name__ == "__main__":
    try:

        checkpoint_dir = "/content/drive/MyDrive/trained_models/continued_checkpoints"
        new_pos_path = "/content/sample_data/out/positive_reports.txt"
        new_neg_path = "/content/sample_data/out/negative_reports.txt"
        model_path = "/content/drive/MyDrive/sem_model_4"
        model_save_dir = "/content/drive/MyDrive/trained_models"


        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)
            print(f"Created directory for saving models: {model_save_dir}")


        continued_checkpoints_dir = os.path.join(model_save_dir, "continued_checkpoints")
        if not os.path.exists(continued_checkpoints_dir):
            os.makedirs(continued_checkpoints_dir)
            print(f"Created directory for continued checkpoints: {continued_checkpoints_dir}")


        specific_checkpoint = None


        checkpoint_path = find_checkpoint(checkpoint_dir, specific_checkpoint)
        print(f"Using checkpoint: {checkpoint_path}")


        results = continue_training_from_checkpoint(
            checkpoint_path=checkpoint_path,
            new_pos_path=new_pos_path,
            new_neg_path=new_neg_path,
            model_path=model_path,
            batch_size=64,
            nb_epoch=10,
            model_save_dir=model_save_dir
        )


        print("\nFinal Metrics after continued training:")
        print(f"Accuracy: {results['metrics']['accuracy']:.4f}")
        print(f"Precision (Positive class): {results['metrics']['precision']:.4f}")
        print(f"Recall (Positive class): {results['metrics']['recall']:.4f}")
        print(f"F1-Score: {results['metrics']['f1']:.4f}")
        print("\nConfusion Matrix:")
        print(results['metrics']['cm'])
        if 'saved_model_path' in results:
            print(f"\nUpdated model saved to: {results['saved_model_path']}")


        del results['model']
        K.clear_session()
        gc.collect()

    except Exception as e:
        print(f"Error occurred: {e}")
        import traceback
        traceback.print_exc()

    finally:

        try:
            if 'x_path' in locals() and os.path.exists(x_path):
                os.remove(x_path)
            if 'y_path' in locals() and os.path.exists(y_path):
                os.remove(y_path)
            print("Cleanup complete")
        except Exception as e:
            print(f"Error during cleanup: {e}")

    print("Continued training process completed successfully!")

Using checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_10_1744736100.h5
[MEMORY] Before continued training: 1116.02 MB
Loading model from checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_10_1744736100.h5




Model successfully loaded from checkpoint
Preprocessing new data...
[MEMORY] Before loading reports: 1122.52 MB
[MEMORY] After loading positive reports: 1127.67 MB
[MEMORY] After processing positive reports: 1160.66 MB
[MEMORY] After loading negative reports: 1160.92 MB
[MEMORY] After processing negative reports: 1184.64 MB
Using padding size: 30
Loading transformer model...
[MEMORY] After loading model: 1375.82 MB
[MEMORY] Before generating positive embeddings: 1375.82 MB


Generating embeddings: 100%|██████████| 337/337 [22:36<00:00,  4.03s/it]


[MEMORY] After generating positive embeddings: 2541.07 MB
[MEMORY] Before generating negative embeddings: 2522.12 MB


Generating embeddings: 100%|██████████| 260/260 [17:12<00:00,  3.97s/it]


[MEMORY] After generating negative embeddings: 2208.75 MB
[MEMORY] After clearing model: 2115.67 MB
Positive samples: 86048, Negative samples: 66380
Merging positive embeddings...


100%|██████████| 173/173 [00:17<00:00,  9.84it/s]


Merging negative embeddings...


100%|██████████| 133/133 [00:10<00:00, 12.24it/s]


[MEMORY] After merging embeddings: 5306.21 MB


100%|██████████| 18/18 [00:00<00:00, 35213.37it/s]

[MEMORY] After creating labels: 5307.20 MB





[MEMORY] After preprocessing new data: 2053.90 MB
Continuing training from checkpoint...


  self._warn_if_super_not_called()


Epoch 1/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 83ms/step - _f1_score: 0.6747 - accuracy: 0.6747 - loss: 0.5132 - val__f1_score: 0.6828 - val_accuracy: 0.6830 - val_loss: 0.5052 - learning_rate: 0.0010
Epoch 2/10
[1m   1/1906[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:57[0m 124ms/step - _f1_score: 0.6562 - accuracy: 0.6562 - loss: 0.4825

  current = self.get_monitor_value(logs)
  callback.on_epoch_end(epoch, logs)


[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - _f1_score: 0.6844 - accuracy: 0.6844 - loss: 0.5004



Saved checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_2_1744833886.h5
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 83ms/step - _f1_score: 0.6844 - accuracy: 0.6844 - loss: 0.5004 - val__f1_score: 0.6758 - val_accuracy: 0.6759 - val_loss: 0.5086 - learning_rate: 0.0010
Epoch 3/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 85ms/step - _f1_score: 0.6868 - accuracy: 0.6868 - loss: 0.4961 - val__f1_score: 0.6871 - val_accuracy: 0.6873 - val_loss: 0.5021 - learning_rate: 0.0010
Epoch 4/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - _f1_score: 0.6918 - accuracy: 0.6918 - loss: 0.4930



Saved checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_4_1744834207.h5
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 84ms/step - _f1_score: 0.6918 - accuracy: 0.6918 - loss: 0.4930 - val__f1_score: 0.6830 - val_accuracy: 0.6832 - val_loss: 0.5086 - learning_rate: 0.0010
Epoch 5/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 83ms/step - _f1_score: 0.6991 - accuracy: 0.6991 - loss: 0.4862 - val__f1_score: 0.6809 - val_accuracy: 0.6811 - val_loss: 0.5132 - learning_rate: 0.0010
Epoch 6/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - _f1_score: 0.7054 - accuracy: 0.7054 - loss: 0.4821



Saved checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_6_1744834523.h5
Removed old checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_2_1744833886.h5
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 82ms/step - _f1_score: 0.7054 - accuracy: 0.7054 - loss: 0.4821 - val__f1_score: 0.6886 - val_accuracy: 0.6889 - val_loss: 0.5056 - learning_rate: 0.0010
Epoch 7/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 87ms/step - _f1_score: 0.7101 - accuracy: 0.7101 - loss: 0.4750 - val__f1_score: 0.6774 - val_accuracy: 0.6776 - val_loss: 0.5840 - learning_rate: 0.0010
Epoch 8/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - _f1_score: 0.7172 - accuracy: 0.7172 - loss: 0.4669



Saved checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_8_1744834848.h5
Removed old checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_4_1744834207.h5
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 84ms/step - _f1_score: 0.7172 - accuracy: 0.7172 - loss: 0.4669 - val__f1_score: 0.6805 - val_accuracy: 0.6807 - val_loss: 0.5222 - learning_rate: 0.0010
Epoch 9/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 82ms/step - _f1_score: 0.7227 - accuracy: 0.7227 - loss: 0.4603 - val__f1_score: 0.6873 - val_accuracy: 0.6874 - val_loss: 0.5315 - learning_rate: 0.0010
Epoch 10/10
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - _f1_score: 0.7259 - accuracy: 0.7259 - loss: 0.4537



Saved checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_10_1744835164.h5
Removed old checkpoint: /content/drive/MyDrive/trained_models/continued_checkpoints/checkpoint_epoch_6_1744834523.h5
[1m1906/1906[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 84ms/step - _f1_score: 0.7259 - accuracy: 0.7259 - loss: 0.4537 - val__f1_score: 0.6025 - val_accuracy: 0.6025 - val_loss: 1.1489 - learning_rate: 0.0010
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

Traceback (most recent call last):
  File "<ipython-input-3-4e11fc80cf95>", line 554, in <cell line: 0>
    results = continue_training_from_checkpoint(
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-3-4e11fc80cf95>", line 497, in continue_training_from_checkpoint
    metrics = classifier.evaluate(test_gen, threshold=DEFAULT_THRESHOLD)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-3-4e11fc80cf95>", line 390, in evaluate
    preds = self.model.predict(x_batch)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/progbar.py", line 119, in update
    numdigits = int(math.log10(self.target)) + 1
                    ^^^^^^^^^^^^^^^^^^^^^^^
ValueError: math domain error
