In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import logging
import warnings
from typing import Dict, List, Tuple, Union, Optional
from collections import Counter
from pathlib import Path
from torch.cuda.amp import autocast, GradScaler
import traceback
import re



# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)







class Config:
    def __init__(self):
        # Data paths
        self.train_path = 'cleaned_cybercrime_data.csv'
        self.test_path = 'cleaned_cybercrime_test_data.csv'
        
        # High-accuracy model parameters
        self.model_name = 'distilbert-base-uncased'
        self.max_length = 256     # Keep original length for accuracy
        self.batch_size = 8      # Keep smaller batch size for stability
        self.num_epochs = 3
        self.learning_rate = 1e-5
        self.warmup_ratio = 0.2
        self.weight_decay = 0.01
        
        # Speed optimizations that won't hurt accuracy
        self.gradient_accumulation_steps = 4
        self.eval_steps = 500    # Evaluate less frequently
        self.save_steps = 500
        
        # Device settings
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Add this line
        
        # Parallel processing settings
        self.num_workers = 4 if self.device == 'cuda' else 2  # Adjust based on device
        self.pin_memory = True if self.device == 'cuda' else False
        
        # Text preprocessing
        self.max_vocab_size = 50000
        self.text_column = 'crimeaditionalinfo'
        self.label_column = 'category'
        
        # Output directory
        self.output_dir = Path('models/cybercrime_classifier')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        

        
        
class TextPreprocessor:
    """Efficient text preprocessing for CPU"""
    def __init__(self):
        self.patterns = {
            'url': re.compile(r'https?://\S+|www\.\S+'),
            'email': re.compile(r'\S+@\S+'),
            'phone': re.compile(r'(\+\d{1,3}[-.]?)?\d{3}[-.]?\d{3}[-.]?\d{4}'),
            'amount': re.compile(r'(?:rs\.?|inr|₹|\$)\s*\d+(?:[,\.]\d+)*'),
            'special_chars': re.compile(r'[^\w\s]'),
            'extra_spaces': re.compile(r'\s+')
        }
        
        self.cache = {}
        
    def clean_text(self, text: str) -> str:
        """Clean text with caching for efficiency"""
        if text in self.cache:
            return self.cache[text]
            
        text = text.lower().strip()
        
        # Replace patterns with tokens
        text = self.patterns['url'].sub('[URL]', text)
        text = self.patterns['email'].sub('[EMAIL]', text)
        text = self.patterns['phone'].sub('[PHONE]', text)
        text = self.patterns['amount'].sub('[AMOUNT]', text)
        
        # Remove special characters and extra spaces
        text = self.patterns['special_chars'].sub(' ', text)
        text = self.patterns['extra_spaces'].sub(' ', text).strip()
        
        # Cache result
        if len(self.cache) > 10000:  # Limit cache size
            self.cache.clear()
        self.cache[text] = text
        
        return text

    def batch_process(self, texts: List[str], batch_size: int = 1000) -> List[str]:
        """Process texts in batches"""
        processed_texts = []
        
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            processed_batch = [self.clean_text(text) for text in batch]
            processed_texts.extend(processed_batch)
            
        return processed_texts
        
        
class SmartLabelEncoder:
    """Enhanced label encoder that handles unseen labels using semantic similarity"""
    def __init__(self):
        self.encoder = LabelEncoder()
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.label_embeddings = None
        self.original_labels = None
        self.mapping = {}
        self.label_vectors = None

    def fit(self, labels: List[str], texts: List[str]) -> 'SmartLabelEncoder':
        """Fit encoder and compute label embeddings"""
        self.original_labels = list(set(labels))
        self.encoder.fit(self.original_labels)
        
        # Create mapping of labels to their texts
        label_texts = {}
        for label, text in zip(labels, texts):
            if label not in label_texts:
                label_texts[label] = []
            label_texts[label].append(text)
        
        # Create embeddings for each label's texts
        self.label_vectors = {}
        for label, texts in label_texts.items():
            # Get text embeddings
            text_embeddings = self.sentence_model.encode(texts[:5])  # Use up to 5 examples
            # Calculate centroid
            self.label_vectors[label] = np.mean(text_embeddings, axis=0)
        
        return self
    
    def _create_label_descriptions(self, labels: List[str], texts: List[str]) -> Dict[str, str]:
        """Create rich descriptions for each label based on its examples"""
        descriptions = {}
        for label, text in zip(labels, texts):
            if label not in descriptions:
                descriptions[label] = []
            descriptions[label].append(text)
        
        # Combine examples for each label
        for label in descriptions:
            examples = descriptions[label][:5]  # Take up to 5 examples
            descriptions[label] = f"{label}: " + " ".join(examples)
            
        return descriptions
    
    def _create_label_vectors(self, descriptions: Dict[str, str]) -> Dict[str, Dict[str, np.ndarray]]:
        """Create enhanced vector representations of labels"""
        vectors = {}
        for label, texts in descriptions.items():
            # Create embeddings for all texts
            text_embeddings = self.sentence_model.encode(texts)
            text_centroid = np.mean(text_embeddings, axis=0)
            
            vectors[label] = {
                'text_embeddings': text_embeddings,
                'text_centroid': text_centroid,
            }
        return vectors

    
    def _find_most_similar_label(self, unknown_label: str, texts: List[str]) -> str:
        """Find most similar known label using semantic similarity"""
        # Create embedding for unknown label's texts
        unknown_embeddings = self.sentence_model.encode(texts[:5])  # Use up to 5 examples
        unknown_centroid = np.mean(unknown_embeddings, axis=0)
        
        # Calculate similarities
        similarities = {}
        for known_label, known_embedding in self.label_vectors.items():
            # Calculate cosine similarity
            similarity = cosine_similarity(
                unknown_centroid.reshape(1, -1),
                known_embedding.reshape(1, -1)
            )[0][0]
            
            similarities[known_label] = similarity
            logger.info(f"Similarity between '{unknown_label}' and '{known_label}': {similarity:.4f}")
        
        # Get most similar label
        most_similar = max(similarities.items(), key=lambda x: x[1])
        logger.info(
            f"Mapping unknown label '{unknown_label}' to '{most_similar[0]}' "
            f"with similarity {most_similar[1]:.4f}"
        )
        return most_similar[0]
    
    def transform(self, labels: List[str], texts: List[str] = None) -> np.ndarray:
        """Transform labels, mapping unseen ones to most similar known labels"""
        processed_labels = []
        for i, label in enumerate(labels):
            if label not in self.original_labels:
                if label not in self.mapping:
                    # Get relevant texts for this label
                    label_texts = [text for l, text in zip(labels, texts) if l == label]
                    if not label_texts:  # If no texts found, use empty list
                        label_texts = ['']
                    self.mapping[label] = self._find_most_similar_label(label, label_texts)
                processed_labels.append(self.mapping[label])
            else:
                processed_labels.append(label)
        
        return self.encoder.transform(processed_labels)
    
    def inverse_transform(self, indices: np.ndarray) -> np.ndarray:
        return self.encoder.inverse_transform(indices)
    
    @property
    def classes_(self) -> np.ndarray:
        return self.encoder.classes_

class CybercrimeDataset(Dataset):
    """Custom dataset for cybercrime text classification"""
    def __init__(
        self, 
        texts: List[str],
        labels: List[int] = None,
        tokenizer = None,
        max_length: int = 128
    ):
        # Convert texts to list and handle NaN values
        self.texts = [str(text) for text in texts]
        
        # Pre-tokenize all texts at once
        self.encodings = tokenizer(
            self.texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        # Convert labels to Long tensor if provided
        if labels is not None:
            self.labels = torch.tensor(labels, dtype=torch.long)
        else:
            self.labels = None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
        }
        
        if self.labels is not None:
            item['labels'] = self.labels[idx]
            
        return item

class CybercrimeClassifier:
    """Main classifier class with enhanced training and evaluation"""
    def __init__(self, config: Config):
        self.config = config
        self.device = config.device
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        self.model = None
        self.label_encoder = SmartLabelEncoder()
        
    def prepare_data(self) -> Tuple[DataLoader, DataLoader]:
        """Load and prepare train and test data"""
        logger.info("Loading datasets...")
        
        # Load data
        train_df = pd.read_csv(self.config.train_path)
        test_df = pd.read_csv(self.config.test_path)
        
        # Convert text columns to string and fill NaN values
        train_df[self.config.text_column] = train_df[self.config.text_column].fillna('').astype(str)
        test_df[self.config.text_column] = test_df[self.config.text_column].fillna('').astype(str)
        
        # Log dataset statistics
        logger.info(f"Training set size: {len(train_df)}")
        logger.info(f"Test set size: {len(test_df)}")
        logger.info("\nTraining set label distribution:")
        for label, count in train_df[self.config.label_column].value_counts().items():
            logger.info(f"{label}: {count}")
        
        # Check for unseen labels
        train_labels = set(train_df[self.config.label_column].unique())
        test_labels = set(test_df[self.config.label_column].unique())
        unseen_labels = test_labels - train_labels
        if unseen_labels:
            logger.warning(f"Found {len(unseen_labels)} unseen labels in test set: {unseen_labels}")
        
        # Encode labels using smart encoder
        self.label_encoder.fit(
            train_df[self.config.label_column].tolist(),
            train_df[self.config.text_column].tolist()
        )
        
        train_labels = self.label_encoder.transform(
            train_df[self.config.label_column].tolist(),
            train_df[self.config.text_column].tolist()
        ).astype(np.int64)
        
        test_labels = self.label_encoder.transform(
            test_df[self.config.label_column].tolist(),
            test_df[self.config.text_column].tolist()
        ).astype(np.int64)
        
        num_labels = len(self.label_encoder.classes_)
        logger.info(f"Number of unique labels: {num_labels}")
        
        # Initialize model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name,
            num_labels=num_labels,
            problem_type="single_label_classification"
        ).to(self.device)
        
        # Create datasets
        train_dataset = CybercrimeDataset(
            texts=train_df[self.config.text_column].tolist(),
            labels=train_labels,
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        test_dataset = CybercrimeDataset(
            texts=test_df[self.config.text_column].tolist(),
            labels=test_labels,
            tokenizer=self.tokenizer,
            max_length=self.config.max_length
        )
        
        # Create dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=0,
            pin_memory=False
        )
        
        test_loader = DataLoader(
            test_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=False
        )
        
        return train_loader, test_loader
        
    def train(self, train_loader: DataLoader) -> None:
        """Balanced training loop for speed and accuracy"""
        try:
            logger.info("Starting training...")

            optimizer = torch.optim.AdamW(
                self.model.parameters(),
                lr=self.config.learning_rate,
                weight_decay=self.config.weight_decay,
                betas=(0.9, 0.999)
            )

            total_steps = len(train_loader) * self.config.num_epochs

            scheduler = torch.optim.lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=self.config.learning_rate,
                total_steps=total_steps,
                pct_start=0.1
            )

            best_loss = float('inf')
            best_accuracy = 0.0

            # Track overall progress
            overall_progress = tqdm(total=self.config.num_epochs, desc="Overall Progress")

            for epoch in range(self.config.num_epochs):
                logger.info(f"\nStarting Epoch {epoch + 1}/{self.config.num_epochs}")
                self.model.train()
                running_loss = 0
                running_correct = 0
                running_total = 0

                # Create progress bar for this epoch
                progress_bar = tqdm(
                    enumerate(train_loader),
                    total=len(train_loader),
                    desc=f'Epoch {epoch + 1}/{self.config.num_epochs}',
                    ncols=100,
                    leave=False  # Don't leave the progress bar
                )

                try:
                    for batch_idx, batch in progress_bar:
                        # Zero gradients
                        optimizer.zero_grad()

                        # Move batch to device
                        batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}

                        # Forward pass
                        outputs = self.model(**batch)
                        loss = outputs.loss / self.config.gradient_accumulation_steps

                        # Backward pass
                        loss.backward()

                        # Gradient accumulation
                        if (batch_idx + 1) % self.config.gradient_accumulation_steps == 0:
                            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                            optimizer.step()
                            scheduler.step()
                            optimizer.zero_grad()

                        # Calculate accuracy
                        with torch.no_grad():
                            predictions = outputs.logits.argmax(-1)
                            correct = (predictions == batch['labels']).sum().item()
                            total = batch['labels'].size(0)
                            running_correct += correct
                            running_total += total

                        running_loss += loss.item() * self.config.gradient_accumulation_steps

                        # Update progress
                        if (batch_idx + 1) % 10 == 0:
                            avg_loss = running_loss / (batch_idx + 1)
                            accuracy = (running_correct / running_total) * 100

                            progress_bar.set_postfix({
                                'loss': f'{avg_loss:.4f}',
                                'acc': f'{accuracy:.2f}%'
                            }, refresh=True)

                        # Save best model
                        if batch_idx > 0 and batch_idx % self.config.eval_steps == 0:
                            current_accuracy = (running_correct / running_total) * 100
                            if current_accuracy > best_accuracy:
                                best_accuracy = current_accuracy
                                self.save_model('best_model.pt')
                                logger.info(f"New best model saved with accuracy: {best_accuracy:.2f}%")

                    # Epoch end validation
                    val_accuracy = self.quick_evaluate(train_loader)
                    logger.info(f"Epoch {epoch + 1} completed. Validation accuracy: {val_accuracy:.2f}%")

                    # Save checkpoint after each epoch
                    checkpoint_path = f'checkpoint_epoch_{epoch+1}.pt'
                    self.save_model(checkpoint_path)
                    logger.info(f"Checkpoint saved: {checkpoint_path}")

                    # Reset metrics
                    running_loss = 0
                    running_correct = 0
                    running_total = 0

                    # Update overall progress
                    overall_progress.update(1)

                except Exception as e:
                    logger.error(f"Error during epoch {epoch + 1}: {str(e)}")
                    logger.error(traceback.format_exc())
                    # Save emergency checkpoint
                    self.save_model(f'emergency_checkpoint_epoch_{epoch+1}.pt')
                    raise

            overall_progress.close()

        except Exception as e:
            logger.error(f"Training error: {str(e)}")
            logger.error(traceback.format_exc())
            raise
        finally:
            # Save final model even if there was an error
            try:
                self.save_model('final_model.pt')
                logger.info("Final model saved")
            except Exception as e:
                logger.error(f"Error saving final model: {str(e)}")

    def quick_evaluate(self, loader: DataLoader) -> float:
        """Quick evaluation during training"""
        correct = 0
        total = 0
        self.model.eval()

        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
                outputs = self.model(**batch)
                predictions = outputs.logits.argmax(-1)
                correct += (predictions == batch['labels']).sum().item()
                total += batch['labels'].size(0)

        return (correct / total) * 100


                
    def evaluate(self, test_loader: DataLoader) -> None:
        """Evaluate the model on test data with improved error handling"""
        logger.info("Evaluating model...")
        
        self.model.eval()
        all_preds = []
        all_labels = []
        
        # Collect predictions
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Testing"):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(batch['labels'].cpu().numpy())
        
        # Convert to numpy arrays
        all_preds = np.array(all_preds)
        all_labels = np.array(all_labels)
        
        # Get unique classes in predictions and true labels
        unique_preds = np.unique(all_preds)
        unique_labels = np.unique(all_labels)
        
        # Print diagnostics
        print("\nDiagnostics:")
        print(f"Number of unique classes in predictions: {len(unique_preds)}")
        print(f"Number of unique classes in true labels: {len(unique_labels)}")
        print(f"Number of classes in label encoder: {len(self.label_encoder.classes_)}")
        
        print("\nClass distribution in predictions:")
        pred_counts = Counter(all_preds)
        for class_idx, count in sorted(pred_counts.items()):
            class_name = self.label_encoder.inverse_transform([class_idx])[0]
            print(f"Class {class_idx} ({class_name}): {count} samples")
        
        print("\nClass distribution in true labels:")
        label_counts = Counter(all_labels)
        for class_idx, count in sorted(label_counts.items()):
            class_name = self.label_encoder.inverse_transform([class_idx])[0]
            print(f"Class {class_idx} ({class_name}): {count} samples")
        
        # Get the intersection of classes present in both predictions and true labels
        present_classes = sorted(set(unique_preds) & set(unique_labels))
        class_names = self.label_encoder.inverse_transform(present_classes)
        
        # Print classification report with only present classes
        print("\nClassification Report:")
        try:
            report = classification_report(
                all_labels,
                all_preds,
                labels=present_classes,
                target_names=class_names,
                digits=4,
                zero_division=0
            )
            print(report)
            
            # Print confusion matrix
            cm = confusion_matrix(all_labels, all_preds, labels=present_classes)
            print("\nConfusion Matrix:")
            print("Labels:", class_names.tolist())
            print(cm)
            
            # Calculate and print accuracy
            accuracy = (all_preds == all_labels).mean() * 100
            print(f"\nOverall Accuracy: {accuracy:.2f}%")
            
        except Exception as e:
            logger.error(f"Error in generating classification report: {str(e)}")
            logger.error(traceback.format_exc())
    
    # Update the CybercrimeClassifier class with the new evaluate method
    
        
    def predict(self, text: str) -> Tuple[str, float]:
        """Predict category for a single text"""
        self.model.eval()
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.config.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        encoding = {k: v.to(self.device) for k, v in encoding.items()}
        
        with torch.no_grad():
            outputs = self.model(**encoding)
            probs = F.softmax(outputs.logits, dim=1)
            predicted_class = torch.argmax(probs, dim=1).cpu().numpy()[0]
            confidence = probs[0][predicted_class].cpu().numpy()
        
        predicted_category = self.label_encoder.inverse_transform([predicted_class])[0]
        return predicted_category, confidence
    
    
    
    
    def save_model(self, path: str) -> None:
        """Save model and label encoder"""
        save_dict = {
            'model_state_dict': self.model.state_dict(),
            'label_encoder': self.label_encoder,
            'config': self.config,
        }
        torch.save(save_dict, path)
        logger.info(f"Model saved to {path}")

    def load_model(self, path: str) -> None:
        """Load model and label encoder"""
        save_dict = torch.load(path, map_location=self.device)
        self.model.load_state_dict(save_dict['model_state_dict'])
        self.label_encoder = save_dict['label_encoder']
        logger.info(f"Model loaded from {path}")

def main():
    """Main execution function"""
    try:
        # Initialize configuration
        config = Config()
        logger.info(f"Using device: {config.device}")
        
        # Initialize classifier
        classifier = CybercrimeClassifier(config)
        
        # Prepare data
        logger.info("Preparing data...")
        train_loader, test_loader = classifier.prepare_data()
        
        # Train model
        logger.info("Starting training process...")
        classifier.train(train_loader)
        
        # Save final model
        classifier.save_model(config.output_dir / 'final_model.pt')
        
        # Evaluate model
        logger.info("Evaluating model...")
        classifier.evaluate(test_loader)
        
        # Example prediction
        sample_text = """I received a call from someone claiming to be from SBI bank asking for my card details. 
        They said my card would be blocked. I gave them my details and lost Rs. 50,000."""
        
        logger.info("\nTesting model with sample prediction...")
        predicted_category, confidence = classifier.predict(sample_text)
        print(f"\nSample Prediction:")
        print(f"Text: {sample_text[:100]}...")
        print(f"Predicted Category: {predicted_category}")
        print(f"Confidence: {confidence:.2%}")
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        logger.error(f"Stack trace: {traceback.format_exc()}")
        raise

if __name__ == "__main__":
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    
    main()


2024-11-07 14:19:21,483 - INFO - Using device: cuda
2024-11-07 14:19:21,780 - INFO - Use pytorch device_name: cuda
2024-11-07 14:19:21,780 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-07 14:19:24,289 - INFO - Preparing data...
2024-11-07 14:19:24,289 - INFO - Loading datasets...
2024-11-07 14:19:24,983 - INFO - Training set size: 93686
2024-11-07 14:19:24,983 - INFO - Test set size: 31229
2024-11-07 14:19:24,983 - INFO - 
Training set label distribution:
2024-11-07 14:19:24,999 - INFO - Online Financial Fraud: 65459
2024-11-07 14:19:25,000 - INFO - Online and Social Media Related Crime: 12733
2024-11-07 14:19:25,000 - INFO - Cyber Attack/ Dependent Crimes: 3704
2024-11-07 14:19:25,000 - INFO - RapeGang Rape RGRSexually Abusive Content: 2822
2024-11-07 14:19:25,000 - INFO - Any Other Cyber Crime: 2154
2024-11-07 14:19:25,000 - INFO - Sexually Obscene material: 1838
2024-11-07 14:19:25,002 - INFO - Hacking  Damage to computercomputer system etc: 1710
2024-11-07 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-07 14:19:25,745 - INFO - Similarity between 'Crime Against Women & Children' and 'Online and Social Media Related Crime': 0.4543
2024-11-07 14:19:25,747 - INFO - Similarity between 'Crime Against Women & Children' and 'Online Financial Fraud': 0.3502
2024-11-07 14:19:25,748 - INFO - Similarity between 'Crime Against Women & Children' and 'Online Gambling  Betting': 0.3267
2024-11-07 14:19:25,750 - INFO - Similarity between 'Crime Against Women & Children' and 'RapeGang Rape RGRSexually Abusive Content': 0.2141
2024-11-07 14:19:25,751 - INFO - Similarity between 'Crime Against Women & Children' and 'Cyber Attack/ Dependent Crimes': 0.9750
2024-11-07 14:19:25,752 - INFO - Similarity between 'Crime Against Women & Children' and 'Cryptocurrency Crime': 0.2977
2024-11-07 14:19:25,753 - INFO - Similarity between 'Crime Against Women & Children' and 'Sexually Explicit Act': 0.3848
2024-11-07 14:19:25,755 - INFO - Similarity between 'Crime Against Women & Children' and 'Sexually Obscen

Overall Progress:   0%|          | 0/3 [00:00<?, ?it/s]

2024-11-07 14:19:44,213 - INFO - 
Starting Epoch 1/3


Epoch 1/3:   0%|                                                          | 0/11711 [00:00<?, ?it/s]

2024-11-07 14:21:48,880 - INFO - Model saved to best_model.pt
2024-11-07 14:21:48,880 - INFO - New best model saved with accuracy: 23.68%
2024-11-07 14:23:54,781 - INFO - Model saved to best_model.pt
2024-11-07 14:23:54,792 - INFO - New best model saved with accuracy: 43.53%
2024-11-07 14:26:00,895 - INFO - Model saved to best_model.pt
2024-11-07 14:26:00,895 - INFO - New best model saved with accuracy: 52.64%
2024-11-07 14:28:07,480 - INFO - Model saved to best_model.pt
2024-11-07 14:28:07,480 - INFO - New best model saved with accuracy: 56.79%
2024-11-07 14:30:14,326 - INFO - Model saved to best_model.pt
2024-11-07 14:30:14,326 - INFO - New best model saved with accuracy: 59.25%
2024-11-07 14:31:49,489 - INFO - Model saved to best_model.pt
2024-11-07 14:31:49,491 - INFO - New best model saved with accuracy: 60.93%
2024-11-07 14:33:12,795 - INFO - Model saved to best_model.pt
2024-11-07 14:33:12,796 - INFO - New best model saved with accuracy: 62.31%
2024-11-07 14:34:38,214 - INFO - M

Epoch 2/3:   0%|                                                          | 0/11711 [00:00<?, ?it/s]

2024-11-07 15:08:46,753 - INFO - Model saved to best_model.pt
2024-11-07 15:08:46,753 - INFO - New best model saved with accuracy: 82.93%
2024-11-07 15:10:13,681 - INFO - Model saved to best_model.pt
2024-11-07 15:10:13,683 - INFO - New best model saved with accuracy: 83.18%
2024-11-07 15:14:32,205 - INFO - Model saved to best_model.pt
2024-11-07 15:14:32,208 - INFO - New best model saved with accuracy: 83.21%
2024-11-07 15:28:51,429 - INFO - Model saved to best_model.pt
2024-11-07 15:28:51,432 - INFO - New best model saved with accuracy: 83.25%
2024-11-07 15:30:17,617 - INFO - Model saved to best_model.pt
2024-11-07 15:30:17,619 - INFO - New best model saved with accuracy: 83.26%
2024-11-07 15:36:01,149 - INFO - Model saved to best_model.pt
2024-11-07 15:36:01,150 - INFO - New best model saved with accuracy: 83.26%
2024-11-07 15:40:19,731 - INFO - Model saved to best_model.pt
2024-11-07 15:40:19,733 - INFO - New best model saved with accuracy: 83.26%
2024-11-07 15:50:55,913 - INFO - E

Epoch 3/3:   0%|                                                          | 0/11711 [00:00<?, ?it/s]

2024-11-07 15:52:21,850 - INFO - Model saved to best_model.pt
2024-11-07 15:52:21,850 - INFO - New best model saved with accuracy: 83.83%
2024-11-07 15:53:47,042 - INFO - Model saved to best_model.pt
2024-11-07 15:53:47,044 - INFO - New best model saved with accuracy: 84.13%
2024-11-07 15:55:12,171 - INFO - Model saved to best_model.pt
2024-11-07 15:55:12,172 - INFO - New best model saved with accuracy: 84.28%
2024-11-07 15:56:37,107 - INFO - Model saved to best_model.pt
2024-11-07 15:56:37,109 - INFO - New best model saved with accuracy: 84.39%
2024-11-07 16:35:13,387 - INFO - Epoch 3 completed. Validation accuracy: 84.65%
2024-11-07 16:35:13,786 - INFO - Model saved to checkpoint_epoch_3.pt
2024-11-07 16:35:13,788 - INFO - Checkpoint saved: checkpoint_epoch_3.pt
2024-11-07 16:35:14,182 - INFO - Model saved to final_model.pt
2024-11-07 16:35:14,184 - INFO - Final model saved
2024-11-07 16:35:14,569 - INFO - Model saved to models\cybercrime_classifier\final_model.pt
2024-11-07 16:35:14

Testing:   0%|          | 0/3904 [00:00<?, ?it/s]

2024-11-07 16:38:53,973 - INFO - 
Testing model with sample prediction...



Diagnostics:
Number of unique classes in predictions: 9
Number of unique classes in true labels: 14
Number of classes in label encoder: 15

Class distribution in predictions:
Class 0 (Any Other Cyber Crime): 556 samples
Class 2 (Cryptocurrency Crime): 67 samples
Class 3 (Cyber Attack/ Dependent Crimes): 1265 samples
Class 5 (Hacking  Damage to computercomputer system etc): 556 samples
Class 7 (Online Financial Fraud): 23252 samples
Class 9 (Online and Social Media Related Crime): 4555 samples
Class 11 (RapeGang Rape RGRSexually Abusive Content): 825 samples
Class 13 (Sexually Explicit Act): 4 samples
Class 14 (Sexually Obscene material): 149 samples

Class distribution in true labels:
Class 0 (Any Other Cyber Crime): 710 samples
Class 1 (Child Pornography CPChild Sexual Abuse Material CSAM): 123 samples
Class 2 (Cryptocurrency Crime): 167 samples
Class 3 (Cyber Attack/ Dependent Crimes): 1303 samples
Class 4 (Cyber Terrorism): 52 samples
Class 5 (Hacking  Damage to computercomputer sy

# Running the file


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import pandas as pd
from tqdm.auto import tqdm
import logging
import os
from pathlib import Path
from typing import Tuple, Dict, List, Union

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class Config:
    def __init__(self):
        self.model_name = 'distilbert-base-uncased'
        self.max_length = 256
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.test_path = 'cleaned_cybercrime_test_data.csv'  # Path to test data if available and change to differnt test

class CybercrimePredictor:
    def __init__(self, model_path: str):
        """Initialize the predictor with a saved model"""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info(f"Using device: {self.device}")
        
        # Load saved model data
        logger.info(f"Loading model from {model_path}")
        saved_data = torch.load(model_path, map_location=self.device)
        
        # Get configuration
        self.config = saved_data['config']
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        
        # Initialize model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name,
            num_labels=len(saved_data['label_encoder'].classes_)
        ).to(self.device)
        
        # Load model weights and label encoder
        self.model.load_state_dict(saved_data['model_state_dict'])
        self.label_encoder = saved_data['label_encoder']
        self.model.eval()
        
        logger.info("Model loaded successfully!")

    def predict(self, text: str) -> Tuple[str, float]:
        """Predict the category of a given text"""
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.config.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        encoding = {k: v.to(self.device) for k, v in encoding.items()}
        
        with torch.no_grad():
            outputs = self.model(**encoding)
            probs = F.softmax(outputs.logits, dim=1)
            predicted_class = torch.argmax(probs, dim=1).cpu().numpy()[0]
            confidence = probs[0][predicted_class].cpu().numpy()
        
        predicted_category = self.label_encoder.inverse_transform([predicted_class])[0]
        return predicted_category, confidence

def test_custom_input(predictor: CybercrimePredictor, text: str) -> None:
    """Test the model with custom user input"""
    predicted_category, confidence = predictor.predict(text)
    print("\nPrediction Results:")
    print("=" * 50)
    print(f"Input Text: {text}")
    print(f"Predicted Category: {predicted_category}")
    print(f"Confidence: {confidence:.2%}")
    print("=" * 50)

def test_from_csv(predictor: CybercrimePredictor, csv_path: str, num_samples: Union[int, None] = None) -> None:
    """Test the model with examples from a CSV file with detailed statistics"""
    test_df = pd.read_csv(csv_path)
    
    if num_samples:
        test_df = test_df.sample(n=min(num_samples, len(test_df)), random_state=42)
    
    print(f"\nTesting {len(test_df)} examples from {csv_path}")
    print("=" * 50)
    
    correct = 0
    results = []
    category_stats = {}
    confusion_matrix = {}
    
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing examples"):
        text = str(row['crimeaditionalinfo'])
        true_category = row['category']
        predicted_category, confidence = predictor.predict(text)
        
        # Initialize statistics
        if true_category not in category_stats:
            category_stats[true_category] = {'total': 0, 'correct': 0, 'wrong': 0, 'confidences': []}
        if true_category not in confusion_matrix:
            confusion_matrix[true_category] = {}
        if predicted_category not in confusion_matrix[true_category]:
            confusion_matrix[true_category][predicted_category] = 0
            
        # Update statistics
        confusion_matrix[true_category][predicted_category] += 1
        category_stats[true_category]['total'] += 1
        
        is_correct = predicted_category == true_category
        if is_correct:
            correct += 1
            category_stats[true_category]['correct'] += 1
        else:
            category_stats[true_category]['wrong'] += 1
            
        category_stats[true_category]['confidences'].append(float(confidence))
        
        results.append({
            'text': text[:100] + "..." if len(text) > 100 else text,
            'true_category': true_category,
            'predicted_category': predicted_category,
            'confidence': confidence,
            'is_correct': is_correct
        })
    
    # Print statistics
    print_statistics(results, category_stats, confusion_matrix)

def print_statistics(results: List[Dict], category_stats: Dict, confusion_matrix: Dict) -> None:
    """Print detailed statistics of the model's performance"""
    total = len(results)
    correct = sum(1 for r in results if r['is_correct'])
    accuracy = (correct / total) * 100
    
    print("\n=== Overall Statistics ===")
    print(f"Total Examples: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Wrong Predictions: {total - correct}")
    print(f"Overall Accuracy: {accuracy:.2f}%")
    
    # Category-wise statistics
    print("\n=== Category-wise Statistics ===")
    for category, stats in sorted(category_stats.items(), key=lambda x: x[1]['total'], reverse=True):
        cat_accuracy = (stats['correct'] / stats['total'] * 100) if stats['total'] > 0 else 0
        avg_confidence = sum(stats['confidences']) / len(stats['confidences'])
        
        print(f"\n{category} ({stats['total']} cases):")
        print(f"├── Correct: {stats['correct']}")
        print(f"├── Wrong: {stats['wrong']}")
        print(f"├── Accuracy: {cat_accuracy:.2f}%")
        print(f"└── Avg Confidence: {avg_confidence:.2f}%")
    
    # Confusion matrix
    print("\n=== Confusion Matrix ===")
    for true_cat in confusion_matrix:
        print(f"\nTrue Category: {true_cat}")
        for pred_cat, count in sorted(confusion_matrix[true_cat].items(), key=lambda x: x[1], reverse=True):
            if count > 0:
                percentage = (count / category_stats[true_cat]['total']) * 100
                print(f"├── Predicted as {pred_cat}: {count} ({percentage:.1f}%)")

def interactive_testing():
    """Interactive interface for testing the model"""
    print("Initializing Cybercrime Classification System...")
    
    try:
        model_path = 'final_model.pt'
        predictor = CybercrimePredictor(model_path)
        
        while True:
            print("\nCybercrime Classification Menu:")
            print("1. Test with custom input")
            print("2. Test with examples from test dataset")
            print("3. Exit")
            
            choice = input("\nEnter your choice (1-3): ")
            
            if choice == '1':
                text = input("\nEnter the text to classify (or 'q' to return to menu): ")
                if text.lower() == 'q':
                    continue
                test_custom_input(predictor, text)
                
            elif choice == '2':
                if not os.path.exists('cleaned_cybercrime_test_data.csv'):
                    print("\nTest dataset not found! Please place 'cleaned_cybercrime_test_data.csv' in the current directory.")
                    continue
                    
                try:
                    num_samples = input("\nHow many random examples to test? (press Enter for all): ")
                    num_samples = int(num_samples) if num_samples.strip() else None
                    test_from_csv(predictor, 'cleaned_cybercrime_test_data.csv', num_samples)
                except ValueError:
                    print("Please enter a valid number")
                
            elif choice == '3':
                print("\nThank you for using the Cybercrime Classification System! By Muhammad Mamoon jan")
                break
                
            else:
                print("Invalid choice! Please enter 1, 2, or 3.")
    
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        logger.error("Please make sure all required files are present and valid.")

if __name__ == "__main__":
    interactive_testing()

2024-11-07 17:17:20,335 - INFO - Using device: cuda
2024-11-07 17:17:20,336 - INFO - Loading model from final_model.pt


Initializing Cybercrime Classification System...


  saved_data = torch.load(model_path, map_location=self.device)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-11-07 17:17:22,325 - INFO - Model loaded successfully!



Cybercrime Classification Menu:
1. Test with custom input
2. Test with examples from test dataset
3. Exit



Enter your choice (1-3):  1

Enter the text to classify (or 'q' to return to menu):  q



Cybercrime Classification Menu:
1. Test with custom input
2. Test with examples from test dataset
3. Exit



Enter your choice (1-3):  1

Enter the text to classify (or 'q' to return to menu):  I received a call from someone claiming to be from SBI bank asking for my card details.          They said my card would be blocked. I gave them my details and lost Rs. 50,000.



Prediction Results:
Input Text: I received a call from someone claiming to be from SBI bank asking for my card details.          They said my card would be blocked. I gave them my details and lost Rs. 50,000.
Predicted Category: Online Financial Fraud
Confidence: 98.91%

Cybercrime Classification Menu:
1. Test with custom input
2. Test with examples from test dataset
3. Exit



Enter your choice (1-3):  2

How many random examples to test? (press Enter for all):  10



Testing 10 examples from cleaned_cybercrime_test_data.csv


Processing examples:   0%|          | 0/10 [00:00<?, ?it/s]


=== Overall Statistics ===
Total Examples: 10
Correct Predictions: 7
Wrong Predictions: 3
Overall Accuracy: 70.00%

=== Category-wise Statistics ===

Online Financial Fraud (7 cases):
├── Correct: 6
├── Wrong: 1
├── Accuracy: 85.71%
└── Avg Confidence: 0.88%

Online and Social Media Related Crime (1 cases):
├── Correct: 0
├── Wrong: 1
├── Accuracy: 0.00%
└── Avg Confidence: 0.75%

Cyber Attack/ Dependent Crimes (1 cases):
├── Correct: 1
├── Wrong: 0
├── Accuracy: 100.00%
└── Avg Confidence: 1.00%

Sexually Explicit Act (1 cases):
├── Correct: 0
├── Wrong: 1
├── Accuracy: 0.00%
└── Avg Confidence: 0.81%

=== Confusion Matrix ===

True Category: Online Financial Fraud
├── Predicted as Online Financial Fraud: 6 (85.7%)
├── Predicted as Hacking  Damage to computercomputer system etc: 1 (14.3%)

True Category: Online and Social Media Related Crime
├── Predicted as Online Financial Fraud: 1 (100.0%)

True Category: Cyber Attack/ Dependent Crimes
├── Predicted as Cyber Attack/ Dependent Crim


Enter your choice (1-3):  3



Thank you for using the Cybercrime Classification System! By Muhammad Mamoon jan
