In [12]:
#!/usr/bin/env python3
"""
Data Embeddings Generation Script

This script generates embeddings for bug and feature data using SBERT models with 
four different preprocessing approaches:
1. Bug data with filename: vocabulary filtering → stopwords/lemmatization → filename appending
2. Bug data without filename: stopwords/lemmatization → vocabulary filtering
3. Feature data with filename: vocabulary filtering → stopwords/lemmatization → filename appending  
4. Feature data without filename: stopwords/lemmatization → vocabulary filtering

The script adds two columns to each input CSV:
- 'with_filename_embeddings': Embeddings using with-filename preprocessing
- 'without_filename_embeddings': Embeddings using without-filename preprocessing

Usage:
    python generate_embeddings.py \
        --bug_csv_path "/path/to/bug/reference data.csv" \
        --feature_csv_path "/path/to/feature/reference data.csv" \
        --bug_with_filename_model "/path/to/bug/with filename/sbert bug.pt" \
        --bug_without_filename_model "/path/to/bug/without filename/sbert augmented bug.pt" \
        --feature_with_filename_model "/path/to/feature/with filename/sbert augmented feature.pt" \
        --feature_without_filename_model "/path/to/feature/without filename/sbert augmented features.pt" \
        --bug_with_filename_vocab "/path/to/bug/with filename/vocabulary.csv" \
        --bug_without_filename_vocab "/path/to/bug/without filename/vocabulary.csv" \
        --feature_with_filename_vocab "/path/to/feature/with filename/vocabulary.csv" \
        --feature_without_filename_vocab "/path/to/feature/without filename/vocabulary.csv"
"""

import argparse
import os
import sys
import re
import json
import time
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set

import torch
import numpy as np
import pandas as pd
import spacy
from spacy.cli import download
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Ensure reproducibility: seed the NumPy and PyTorch random number generators
# with a fixed value as soon as the module is loaded.
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    # Only touch the CUDA generators when a GPU is actually present.
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

class EmbeddingGenerator:
    """
    Generates embeddings for text data using SBERT models with different preprocessing approaches.
    Mirrors the exact preprocessing logic from the classification system.
    """
    
    def __init__(self, device: str = 'auto'):
        """
        Initialize the embedding generator.
        
        Args:
            device: Device to use ('auto', 'cuda', 'cpu')
        """
        # Set device
        if device == 'auto':
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)
            
        print(f"Using device: {self.device}")
        
        # Initialize spaCy
        self._init_nlp()
        
        # Storage for models and vocabularies
        self.models = {}
        self.vocabularies = {}
        
        print("EmbeddingGenerator initialized successfully")
    
    def _init_nlp(self):
        """Load the 'en_core_web_sm' spaCy pipeline, downloading it on first failure."""
        try:
            pipeline = spacy.load('en_core_web_sm')
        except OSError:
            # An OSError from spacy.load is treated as "model not installed":
            # fetch it once and retry the load.
            print("Downloading spaCy model 'en_core_web_sm'...")
            download('en_core_web_sm')
            pipeline = spacy.load('en_core_web_sm')
            status = "Downloaded and loaded spaCy model 'en_core_web_sm'"
        else:
            status = "Loaded spaCy model 'en_core_web_sm'"

        self.nlp = pipeline
        print(status)
    
    def load_vocabulary(self, vocab_path: str, key: str):
        """
        Load vocabulary from CSV file.
        
        Args:
            vocab_path: Path to vocabulary CSV file
            key: Key to store vocabulary under
        """
        print(f"Loading vocabulary from {vocab_path}")
        
        if not os.path.exists(vocab_path):
            raise FileNotFoundError(f"Vocabulary file not found: {vocab_path}")
        
        try:
            vocab_df = pd.read_csv(vocab_path)
            # Assuming vocabulary is in the first column
            vocab_words = set(vocab_df.iloc[:, 0].astype(str).str.lower().tolist())
            self.vocabularies[key] = vocab_words
            print(f"Loaded {len(vocab_words)} vocabulary words for {key}")
        except Exception as e:
            raise RuntimeError(f"Error loading vocabulary from {vocab_path}: {str(e)}")
    
    def load_sbert_model(self, model_path: str, key: str):
        """
        Load SBERT model from file.
        
        Args:
            model_path: Path to SBERT model file
            key: Key to store model under
        """
        print(f"Loading SBERT model from {model_path}")
        
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")
        
        try:
            # Load the model using SentenceTransformer with all-mpnet-base-v2 architecture
            model = SentenceTransformer('all-mpnet-base-v2')
            
            # Load the fine-tuned weights
            state_dict = torch.load(model_path, map_location=self.device)
            
            # Filter out classifier layers that don't belong to SentenceTransformer
            # Keep only the layers that are part of the sentence transformer
            filtered_state_dict = {}
            for key_name, value in state_dict.items():
                # Skip classifier layers
                if not key_name.startswith('classifier.'):
                    filtered_state_dict[key_name] = value
            
            # Load the filtered state dict with strict=False to allow missing keys
            model.load_state_dict(filtered_state_dict, strict=False)
            
            # Move to device and set to eval mode
            model = model.to(self.device)
            model.eval()
            
            self.models[key] = model
            print(f"Loaded SBERT model for {key}")
            
        except Exception as e:
            raise RuntimeError(f"Error loading SBERT model from {model_path}: {str(e)}")
    
    def _clean_text(self, text: str) -> str:
        """
        Basic text cleaning (steps 1-4 from classification preprocessing).
        
        Args:
            text: Input text to clean
            
        Returns:
            Cleaned text
        """
        if pd.isna(text) or text is None:
            return ""
        
        text = str(text)
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove line breaks
        text = text.replace('\r', ' ')
        text = text.replace('\n', ' ')
        
        # Remove non-alphanumeric characters (keeping spaces)
        text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
        
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def _preprocess_without_filename(self, title: str, description: str, comments: str, vocabulary_words: Set[str]) -> str:
        """
        Preprocess text for "without filename" variant.
        Uses title + description + comments (since no filename available)
        Order: stopwords/lemmatization → vocabulary filtering
        
        Args:
            title: Issue title
            description: Issue description  
            comments: Issue comments
            vocabulary_words: Set of vocabulary words for filtering
            
        Returns:
            Preprocessed text
        """
        # Convert to strings and handle NaN values
        title = str(title) if pd.notna(title) else ""
        description = str(description) if pd.notna(description) else ""
        comments = str(comments) if pd.notna(comments) else ""
        
        # 1-4. Basic cleaning - use title + description + comments (no filename available)
        all_text = self._clean_text(title + " " + description + " " + comments)
        
        if not all_text:
            return ""
        
        # 5. Remove stopwords and lemmatize FIRST
        doc = self.nlp(all_text)
        all_text = ' '.join([word.lemma_ for word in doc if not word.is_stop and word.lemma_.strip()])
        
        # 6. Filter words based on vocabulary
        words = all_text.split()
        filtered_words = [word for word in words if word in vocabulary_words]
        all_text = ' '.join(filtered_words)
        
        return all_text
    
    def _preprocess_with_filename(self, title: str, description: str, comments: str, filename: str, vocabulary_words: Set[str]) -> str:
        """
        Preprocess text for "with filename" variant.
        Uses title + description (excludes comments when filename is available)
        Order: vocabulary filtering → stopwords/lemmatization → filename appending
        
        Args:
            title: Issue title
            description: Issue description
            comments: Issue comments (ignored when filename is present)
            filename: Filename to append
            vocabulary_words: Set of vocabulary words for filtering
            
        Returns:
            Preprocessed text
        """
        # Convert to strings and handle NaN values
        title = str(title) if pd.notna(title) else ""
        description = str(description) if pd.notna(description) else ""
        comments = str(comments) if pd.notna(comments) else ""
        filename = str(filename) if pd.notna(filename) else ""
        
        # 1-4. Basic cleaning - use title + description (exclude comments when filename present)
        all_text = self._clean_text(title + " " + description)
        
        if not all_text:
            all_text = ""
        
        # 5. Filter words based on vocabulary FIRST
        words = all_text.split()
        filtered_words = [word for word in words if word in vocabulary_words]
        all_text = ' '.join(filtered_words)
        
        # 6. Remove stopwords and lemmatize
        if all_text:
            doc = self.nlp(all_text)
            all_text = ' '.join([word.lemma_ for word in doc if not word.is_stop and word.lemma_.strip()])
        
        # 7. Add filename to the end
        if filename and filename.strip() and filename.lower() != 'nan':
            cleaned_filename = self._clean_text(filename)
            if cleaned_filename:
                all_text += " " + cleaned_filename
        
        return all_text
    
    def generate_embeddings_batch(self, texts: List[str], model_key: str, batch_size: int = 32) -> List[List[float]]:
        """
        Generate embeddings for a batch of texts.
        
        Args:
            texts: List of preprocessed texts
            model_key: Key of the model to use
            batch_size: Batch size for processing
            
        Returns:
            List of embedding vectors
        """
        if model_key not in self.models:
            raise ValueError(f"Model {model_key} not loaded")
        
        model = self.models[model_key]
        all_embeddings = []
        
        # Process in batches
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Generating embeddings ({model_key})"):
            batch_texts = texts[i:i + batch_size]
            
            # Handle empty texts
            batch_texts = [text if text else " " for text in batch_texts]
            
            with torch.no_grad():
                batch_embeddings = model.encode(
                    batch_texts,
                    convert_to_tensor=True,
                    device=self.device,
                    show_progress_bar=False
                )
                
                # Convert to CPU and numpy
                batch_embeddings = batch_embeddings.cpu().numpy()
                all_embeddings.extend(batch_embeddings.tolist())
        
        return all_embeddings
    
    def process_dataset(self, csv_path: str, request_type: str, batch_size: int = 32) -> pd.DataFrame:
        """
        Process a dataset (bug or feature) and add embedding columns.
        
        Args:
            csv_path: Path to the CSV file
            request_type: 'bug' or 'feature'
            batch_size: Batch size for embedding generation
            
        Returns:
            DataFrame with added embedding columns
        """
        print(f"\nProcessing {request_type} dataset: {csv_path}")
        
        # Load data
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found: {csv_path}")
        
        df = pd.read_csv(csv_path)
        print(f"Loaded {len(df)} rows")
        
        # Get model and vocabulary keys
        with_filename_model_key = f"{request_type}_with_filename"
        without_filename_model_key = f"{request_type}_without_filename"
        with_filename_vocab_key = f"{request_type}_with_filename_vocab"
        without_filename_vocab_key = f"{request_type}_without_filename_vocab"
        
        # Check if models and vocabularies are loaded
        for key in [with_filename_model_key, without_filename_model_key]:
            if key not in self.models:
                raise ValueError(f"Model {key} not loaded")
        
        for key in [with_filename_vocab_key, without_filename_vocab_key]:
            if key not in self.vocabularies:
                raise ValueError(f"Vocabulary {key} not loaded")
        
        # Preprocess texts for both variants
        print("Preprocessing texts...")
        
        # Without filename preprocessing
        without_filename_texts = []
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing (without filename)"):
            text = self._preprocess_without_filename(
                row.get('title', ''),
                row.get('body', ''),
                row.get('all_comments', ''),
                self.vocabularies[without_filename_vocab_key]
            )
            without_filename_texts.append(text)
        
        # With filename preprocessing
        with_filename_texts = []
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing (with filename)"):
            text = self._preprocess_with_filename(
                row.get('title', ''),
                row.get('body', ''),
                row.get('all_comments', ''),
                row.get('filename', ''),
                self.vocabularies[with_filename_vocab_key]
            )
            with_filename_texts.append(text)
        
        # Generate embeddings
        print("Generating embeddings...")
        
        without_filename_embeddings = self.generate_embeddings_batch(
            without_filename_texts, 
            without_filename_model_key, 
            batch_size
        )
        
        with_filename_embeddings = self.generate_embeddings_batch(
            with_filename_texts, 
            with_filename_model_key, 
            batch_size
        )
        
        # Add embedding columns to dataframe
        print("Adding embedding columns to dataframe...")
        df['without_filename_embeddings'] = [str(emb) for emb in without_filename_embeddings]
        df['with_filename_embeddings'] = [str(emb) for emb in with_filename_embeddings]
        
        print(f"Successfully processed {request_type} dataset")
        return df


def _build_output_path(csv_path, suffix, output_dir):
    """Return ``output_dir/<stem><suffix><ext>`` derived from the input file name."""
    # splitext only touches the final extension, unlike str.replace('.csv', ...)
    # which rewrites every occurrence and adds no suffix to non-.csv names.
    stem, ext = os.path.splitext(os.path.basename(csv_path))
    return os.path.join(output_dir, f"{stem}{suffix}{ext}")


def _process_and_save(generator, label, csv_path, args, output_dir):
    """Embed one dataset ('bug' or 'feature'), write it to disk, and report timing."""
    print("\n" + "=" * 50)
    print(f"PROCESSING {label.upper()} DATASET")
    print("=" * 50)
    start_time = time.time()

    df = generator.process_dataset(csv_path, label, args.batch_size)

    output_path = _build_output_path(csv_path, args.output_suffix, output_dir)
    df.to_csv(output_path, index=False)
    print(f"{label.capitalize()} dataset saved to: {output_path}")

    elapsed = time.time() - start_time
    print(f"{label.capitalize()} dataset processing time: {elapsed:.2f} seconds")
    return df, output_path, elapsed


def main(args):
    """
    Run the embedding generation script end to end.

    Loads the four vocabularies and four SBERT models, processes the bug and
    feature datasets, and writes each result next to its input name with
    ``args.output_suffix`` inserted before the extension.

    Args:
        args: Parsed command-line namespace. Reads ``device``, ``batch_size``,
            ``output_suffix``, the two ``*_csv_path`` values, the four
            ``*_model`` paths and the four ``*_vocab`` paths. An optional
            ``output_dir`` attribute overrides the default output directory
            '/kaggle/working'.

    Any exception is reported with a traceback and the process exits with
    status 1.
    """

    print("=" * 80)
    print("DATA EMBEDDINGS GENERATION")
    print("=" * 80)
    print(f"Device: {args.device}")
    print(f"Batch size: {args.batch_size}")
    print()

    try:
        # Default stays the Kaggle writable directory; callers may override it.
        output_dir = getattr(args, 'output_dir', None) or '/kaggle/working'
        os.makedirs(output_dir, exist_ok=True)

        # Initialize generator
        generator = EmbeddingGenerator(device=args.device)

        # Load vocabularies
        print("Loading vocabularies...")
        generator.load_vocabulary(args.bug_with_filename_vocab, 'bug_with_filename_vocab')
        generator.load_vocabulary(args.bug_without_filename_vocab, 'bug_without_filename_vocab')
        generator.load_vocabulary(args.feature_with_filename_vocab, 'feature_with_filename_vocab')
        generator.load_vocabulary(args.feature_without_filename_vocab, 'feature_without_filename_vocab')

        # Load models
        print("\nLoading SBERT models...")
        generator.load_sbert_model(args.bug_with_filename_model, 'bug_with_filename')
        generator.load_sbert_model(args.bug_without_filename_model, 'bug_without_filename')
        generator.load_sbert_model(args.feature_with_filename_model, 'feature_with_filename')
        generator.load_sbert_model(args.feature_without_filename_model, 'feature_without_filename')

        # Process and save both datasets
        bug_df, bug_output_path, bug_time = _process_and_save(
            generator, 'bug', args.bug_csv_path, args, output_dir
        )
        feature_df, feature_output_path, feature_time = _process_and_save(
            generator, 'feature', args.feature_csv_path, args, output_dir
        )

        # Summary
        print("\n" + "=" * 50)
        print("PROCESSING COMPLETE")
        print("=" * 50)
        print(f"Bug dataset: {len(bug_df)} rows processed → {bug_output_path}")
        print(f"Feature dataset: {len(feature_df)} rows processed → {feature_output_path}")
        print(f"Total processing time: {bug_time + feature_time:.2f} seconds")
        print("\nEmbedding columns added:")
        print("- 'without_filename_embeddings': Embeddings using without-filename preprocessing")
        print("- 'with_filename_embeddings': Embeddings using with-filename preprocessing")

    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"\nError: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

In [13]:
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(
        description="Generate embeddings for bug and feature data using SBERT models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    # Path-valued options as (flag, default, help), registered in this order:
    # dataset CSVs, then model checkpoints, then vocabulary CSVs.
    path_options = [
        ('--bug_csv_path', '/kaggle/input/bug-and-feature-reference-data/bug reference data.csv', 'Path to bug reference data CSV'),
        ('--feature_csv_path', '/kaggle/input/bug-and-feature-reference-data/feature reference data.csv', 'Path to feature reference data CSV'),
        ('--bug_with_filename_model', '/kaggle/input/sbert-similarity/pytorch/default/1/sbert bug with filename.pt', 'Path to bug with filename SBERT model'),
        ('--bug_without_filename_model', '/kaggle/input/sbert-similarity/pytorch/default/1/sbert augmented bug without filename.pt', 'Path to bug without filename SBERT model'),
        ('--feature_with_filename_model', '/kaggle/input/sbert-similarity/pytorch/default/1/sbert augmented feature with filename.pt', 'Path to feature with filename SBERT model'),
        ('--feature_without_filename_model', '/kaggle/input/sbert-similarity/pytorch/default/1/sbert augmented features without filename.pt', 'Path to feature without filename SBERT model'),
        ('--bug_with_filename_vocab', '/kaggle/input/bug-and-features-word-filtering-vocabularies/bug with filename vocabulary.csv', 'Path to bug with filename vocabulary CSV'),
        ('--bug_without_filename_vocab', '/kaggle/input/bug-and-features-word-filtering-vocabularies/bug without filename vocabulary.csv', 'Path to bug without filename vocabulary CSV'),
        ('--feature_with_filename_vocab', '/kaggle/input/bug-and-features-word-filtering-vocabularies/feature with filename vocabulary.csv', 'Path to feature with filename vocabulary CSV'),
        ('--feature_without_filename_vocab', '/kaggle/input/bug-and-features-word-filtering-vocabularies/feature without filename vocabulary.csv', 'Path to feature without filename vocabulary CSV'),
    ]
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, default=default_path, help=help_text)

    # Optional parameters
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size for embedding generation (default: 32)')
    parser.add_argument('--device', choices=['auto', 'cuda', 'cpu'], default='auto', help='Device to use (default: auto)')
    parser.add_argument('--output_suffix', default='_with_embeddings', help='Suffix for output files (default: _with_embeddings)')

    # parse_known_args tolerates extra arguments instead of exiting on them;
    # the unrecognized ones are collected in `unknown` and otherwise ignored.
    args, unknown = parser.parse_known_args()
    main(args)


DATA EMBEDDINGS GENERATION
Device: auto
Batch size: 32

Using device: cuda
Loaded spaCy model 'en_core_web_sm'
EmbeddingGenerator initialized successfully
Loading vocabularies...
Loading vocabulary from /kaggle/input/bug-and-features-word-filtering-vocabularies/bug with filename vocabulary.csv
Loaded 28194 vocabulary words for bug_with_filename_vocab
Loading vocabulary from /kaggle/input/bug-and-features-word-filtering-vocabularies/bug without filename vocabulary.csv
Loaded 45595 vocabulary words for bug_without_filename_vocab
Loading vocabulary from /kaggle/input/bug-and-features-word-filtering-vocabularies/feature with filename vocabulary.csv
Loaded 10083 vocabulary words for feature_with_filename_vocab
Loading vocabulary from /kaggle/input/bug-and-features-word-filtering-vocabularies/feature without filename vocabulary.csv
Loaded 17498 vocabulary words for feature_without_filename_vocab

Loading SBERT models...
Loading SBERT model from /kaggle/input/sbert-similarity/pytorch/default/

Preprocessing (without filename): 100%|██████████| 976/976 [01:30<00:00, 10.73it/s]
Preprocessing (with filename): 100%|██████████| 976/976 [00:23<00:00, 41.07it/s]


Generating embeddings...


Generating embeddings (bug_without_filename): 100%|██████████| 31/31 [00:13<00:00,  2.28it/s]
Generating embeddings (bug_with_filename): 100%|██████████| 31/31 [00:13<00:00,  2.32it/s]


Adding embedding columns to dataframe...
Successfully processed bug dataset
Bug dataset saved to: /kaggle/working/bug reference data_with_embeddings.csv
Bug dataset processing time: 143.88 seconds

PROCESSING FEATURE DATASET

Processing feature dataset: /kaggle/input/bug-and-feature-reference-data/feature reference data.csv
Loaded 165 rows
Preprocessing texts...


Preprocessing (without filename): 100%|██████████| 165/165 [00:15<00:00, 10.59it/s]
Preprocessing (with filename): 100%|██████████| 165/165 [00:02<00:00, 58.17it/s]


Generating embeddings...


Generating embeddings (feature_without_filename): 100%|██████████| 6/6 [00:02<00:00,  2.58it/s]
Generating embeddings (feature_with_filename): 100%|██████████| 6/6 [00:02<00:00,  2.63it/s]


Adding embedding columns to dataframe...
Successfully processed feature dataset
Feature dataset saved to: /kaggle/working/feature reference data_with_embeddings.csv
Feature dataset processing time: 23.42 seconds

PROCESSING COMPLETE
Bug dataset: 976 rows processed → /kaggle/working/bug reference data_with_embeddings.csv
Feature dataset: 165 rows processed → /kaggle/working/feature reference data_with_embeddings.csv
Total processing time: 167.30 seconds

Embedding columns added:
- 'without_filename_embeddings': Embeddings using without-filename preprocessing
- 'with_filename_embeddings': Embeddings using with-filename preprocessing
