In [5]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
import pandas as pd
from collections import defaultdict
import networkx as nx
from scipy.spatial.distance import cosine
import os
import json
import logging
from tqdm.auto import tqdm
from datetime import datetime
import sys

class CrimeClassifier:
    def __init__(self, min_df=2, max_features=10000, log_dir="logs"):
        self.setup_logging(log_dir)
        self.logger.info("Initializing CrimeClassifier")
        
        self.tfidf = TfidfVectorizer(
            min_df=min_df,
            max_features=max_features,
            ngram_range=(1, 3),
            analyzer='char_wb'
        )
        self.label_propagation_iterations = 10
        self.similarity_threshold = 0.3
        self.graph = None
        self.id_to_text = {}
        self.id_to_label = {}
        
        self.logger.info(f"Initialized with min_df={min_df}, max_features={max_features}")
        
    def setup_logging(self, log_dir):
        """Setup logging to both file and console"""
        os.makedirs(log_dir, exist_ok=True)
        
        # Create timestamp for log file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file = os.path.join(log_dir, f'crime_classifier_{timestamp}.log')
        
        # Setup logger
        self.logger = logging.getLogger('CrimeClassifier')
        self.logger.setLevel(logging.INFO)
        
        # File handler
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(file_formatter)
        
        # Console handler
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(logging.INFO)
        console_formatter = logging.Formatter('%(message)s')
        console_handler.setFormatter(console_formatter)
        
        # Add handlers
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)
        
        self.logger.info(f"Logging setup complete. Log file: {log_file}")
        
    def clean_text_data(self, texts):
        """Clean text data by removing NaN values and invalid entries"""
        self.logger.info(f"Starting text cleaning on {len(texts)} documents")
        cleaned_texts = []
        valid_indices = []
        
        for i, text in tqdm(enumerate(texts), total=len(texts), desc="Cleaning texts"):
            if pd.isna(text):
                self.logger.debug(f"Skipping NaN at index {i}")
                continue
                
            if not isinstance(text, str):
                try:
                    text = str(text)
                    self.logger.debug(f"Converted non-string to string at index {i}")
                except:
                    self.logger.warning(f"Failed to convert non-string at index {i}")
                    continue
                    
            if text.strip():
                cleaned_texts.append(text.strip())
                valid_indices.append(i)
            else:
                self.logger.debug(f"Skipping empty string at index {i}")
                
        self.logger.info(f"Text cleaning complete. Kept {len(cleaned_texts)}/{len(texts)} documents")
        return cleaned_texts, valid_indices
        
    def create_custom_embeddings(self, texts):
        """Create custom embeddings using TF-IDF and enhance with domain-specific features"""
        self.logger.info("Starting embedding creation")
        
        # Clean the text data
        cleaned_texts, valid_indices = self.clean_text_data(texts)
        self.valid_indices = valid_indices
        
        # Base TF-IDF embeddings
        self.logger.info("Creating TF-IDF embeddings")
        try:
            base_embeddings = self.tfidf.fit_transform(cleaned_texts)
            self.logger.info(f"TF-IDF vocabulary size: {len(self.tfidf.vocabulary_)}")
        except Exception as e:
            self.logger.error(f"Error in TF-IDF transformation: {str(e)}")
            raise
        
        # Create additional features
        self.logger.info("Creating enhanced features")
        enhanced_features = []
        
        for text in tqdm(cleaned_texts, desc="Creating enhanced features"):
            features = []
            features.append(len(text))
            features.append(len(text.split()))
            
            keywords = {
                'fraud': 1.5,
                'scam': 1.5,
                'cyber': 1.2,
                'online': 1.2,
                'hack': 1.3,
                'threat': 1.4,
                'attack': 1.3,
                'phish': 1.5
            }
            text_lower = text.lower()
            keyword_score = sum(weight for word, weight in keywords.items() 
                              if word in text_lower)
            features.append(keyword_score)
            
            enhanced_features.append(features)
            
        enhanced_features = normalize(np.array(enhanced_features))
        
        # Combine embeddings
        self.logger.info("Combining embeddings")
        final_embeddings = np.hstack([
            base_embeddings.toarray(),
            enhanced_features
        ])
        
        final_embeddings = normalize(final_embeddings)
        self.logger.info(f"Final embedding shape: {final_embeddings.shape}")
        
        return final_embeddings
    
    def build_similarity_graph(self, embeddings, labels=None):
        """Build a similarity graph from embeddings"""
        self.logger.info("Building similarity graph")
        n_samples = embeddings.shape[0]
        
        G = nx.Graph()
        
        # Add nodes
        self.logger.info("Adding nodes to graph")
        for i in tqdm(range(n_samples), desc="Adding nodes"):
            G.add_node(i, 
                      embedding=embeddings[i],
                      label=labels[i] if labels is not None and i < len(labels) else None)
        
        # Add edges
        self.logger.info("Adding edges to graph")
        edge_count = 0
        with tqdm(total=(n_samples * (n_samples - 1)) // 2, desc="Adding edges") as pbar:
            for i in range(n_samples):
                for j in range(i + 1, n_samples):
                    similarity = 1 - cosine(embeddings[i], embeddings[j])
                    if similarity > self.similarity_threshold:
                        G.add_edge(i, j, weight=similarity)
                        edge_count += 1
                    pbar.update(1)
        
        self.logger.info(f"Graph built with {n_samples} nodes and {edge_count} edges")
        self.graph = G
        return G
    
    def propagate_labels(self, unlabeled_indices):
        """Propagate labels through the graph"""
        self.logger.info("Starting label propagation")
        if not self.graph:
            raise ValueError("Graph not built yet!")
            
        # Initialize distributions
        self.logger.info("Initializing label distributions")
        label_distributions = {}
        for node in tqdm(self.graph.nodes(), desc="Initializing distributions"):
            if node not in unlabeled_indices:
                label = self.graph.nodes[node]['label']
                dist = defaultdict(float)
                dist[label] = 1.0
                label_distributions[node] = dist
            else:
                label_distributions[node] = defaultdict(float)
        
        # Propagate labels
        self.logger.info(f"Propagating labels for {len(unlabeled_indices)} unlabeled nodes")
        for iteration in range(self.label_propagation_iterations):
            self.logger.info(f"Iteration {iteration + 1}/{self.label_propagation_iterations}")
            new_distributions = {}
            
            for node in tqdm(unlabeled_indices, desc=f"Iteration {iteration + 1}"):
                new_dist = defaultdict(float)
                weight_sum = 0
                
                for neighbor in self.graph.neighbors(node):
                    weight = self.graph[node][neighbor]['weight']
                    neighbor_dist = label_distributions[neighbor]
                    
                    for label, prob in neighbor_dist.items():
                        new_dist[label] += weight * prob
                    weight_sum += weight
                
                if weight_sum > 0:
                    for label in new_dist:
                        new_dist[label] /= weight_sum
                        
                new_distributions[node] = new_dist
            
            label_distributions.update(new_distributions)
        
        # Assign final labels
        self.logger.info("Assigning final labels")
        final_labels = {}
        for node in tqdm(unlabeled_indices, desc="Assigning labels"):
            dist = label_distributions[node]
            if dist:
                final_labels[node] = max(dist.items(), key=lambda x: x[1])[0]
            else:
                final_labels[node] = None
                
        self.logger.info("Label propagation complete")
        return final_labels
    
    def fit_predict(self, labeled_texts, labeled_categories, unlabeled_texts):
        """Main method to fit the model and predict unlabeled data"""
        # Combine all texts
        all_texts = labeled_texts + unlabeled_texts
        
        # Create embeddings and get valid indices
        embeddings = self.create_custom_embeddings(all_texts)
        
        # Filter labels based on valid indices
        n_labeled_original = len(labeled_texts)
        valid_labeled_indices = [i for i in self.valid_indices if i < n_labeled_original]
        filtered_categories = [labeled_categories[i] for i in valid_labeled_indices]
        
        # Adjust indices for unlabeled data
        valid_unlabeled_indices = [i - n_labeled_original for i in self.valid_indices 
                                 if i >= n_labeled_original]
        
        # Build similarity graph
        n_labeled = len(valid_labeled_indices)
        all_labels = filtered_categories + [None] * (len(self.valid_indices) - n_labeled)
        self.build_similarity_graph(embeddings, all_labels)
        
        # Define unlabeled indices
        unlabeled_indices = list(range(n_labeled, len(self.valid_indices)))
        
        # Propagate labels
        predictions = self.propagate_labels(unlabeled_indices)
        
        # Create full predictions list with None for invalid entries
        full_predictions = [None] * len(unlabeled_texts)
        for idx, pred_idx in enumerate(valid_unlabeled_indices):
            if pred_idx < len(full_predictions):
                full_predictions[pred_idx] = predictions.get(idx + n_labeled)
        
        return full_predictions
    
    def save_results(self, predictions, output_dir="results"):
        """Save predictions and statistics to files"""
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Save predictions
        predictions_file = os.path.join(output_dir, f'predictions_{timestamp}.json')
        with open(predictions_file, 'w') as f:
            json.dump(predictions, f, indent=2)
        self.logger.info(f"Predictions saved to {predictions_file}")
        
        # Calculate and save statistics
        stats = {
            'total_predictions': len(predictions),
            'null_predictions': predictions.count(None),
            'unique_labels': len(set(filter(None, predictions))),
            'timestamp': timestamp
        }
        
        stats_file = os.path.join(output_dir, f'stats_{timestamp}.json')
        with open(stats_file, 'w') as f:
            json.dump(stats, f, indent=2)
        self.logger.info(f"Statistics saved to {stats_file}")
        
        return predictions_file, stats_file
    
    
def load_labeled_data(data_dir, category_mapping_file):
    """
    Load labeled data from CSV files in the specified directory
    
    Parameters:
    data_dir: str - Directory containing CSV files with labeled data
    category_mapping_file: str - Path to JSON file containing category mapping
    
    Returns:
    tuple: (texts, categories)
    """
    # Load category mapping
    with open(category_mapping_file, 'r') as f:
        category_mapping = json.load(f)
    
    # Flatten subcategories
    subcategories = {}
    for main_category, subs in category_mapping.items():
        subcategories.update(subs)
    
    texts = []
    categories = []
    
    # Process each CSV file in the directory
    for filename in os.listdir(data_dir):
        if filename.endswith('.csv'):
            file_path = os.path.join(data_dir, filename)
            try:
                df = pd.read_csv(file_path)
                if 'crimeinformation' in df.columns and 'subcategory' in df.columns:
                    # Filter out NaN values
                    df = df.dropna(subset=['crimeinformation', 'subcategory'])
                    texts.extend(df['crimeinformation'].tolist())
                    categories.extend(df['subcategory'].tolist())
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
                continue
    
    return texts, categories


In [6]:
# Initialize the classifier
classifier = CrimeClassifier(log_dir="logs")


try:
    # Load labeled data
    data_dir = "category_csvs/"  
    category_mapping_file = "new.json" 

    # Load labeled data
    labeled_texts, labeled_categories = load_labeled_data(data_dir, category_mapping_file)
    classifier.logger.info(f"Loaded {len(labeled_texts)} labeled examples")
    unlabeled_df = pd.read_csv("remaining_dataset.csv")
    unlabeled_df = unlabeled_df.dropna(subset=['crimeaditionalinfo'])
    unlabeled_texts = unlabeled_df['crimeaditionalinfo'].tolist()
    classifier.logger.info(f"Loaded {len(unlabeled_texts)} unlabeled examples")

    # Get predictions
    predictions = classifier.fit_predict(labeled_texts, labeled_categories, unlabeled_texts)

    pred_file, stats_file = classifier.save_results(predictions)
    classifier.logger.info("Processing complete!")
        
except Exception as e:
    classifier.logger.error(f"Error in processing: {str(e)}", exc_info=True)
    raise

Logging setup complete. Log file: logs/crime_classifier_20250218_130821.log
Logging setup complete. Log file: logs/crime_classifier_20250218_130821.log
Initializing CrimeClassifier
Initializing CrimeClassifier
Initialized with min_df=2, max_features=10000
Initialized with min_df=2, max_features=10000
Loaded 0 labeled examples
Loaded 0 labeled examples
Loaded 103325 unlabeled examples
Loaded 103325 unlabeled examples
Starting embedding creation
Starting embedding creation
Starting text cleaning on 103325 documents
Starting text cleaning on 103325 documents


Cleaning texts: 100%|██████████| 103325/103325 [00:00<00:00, 978232.27it/s]

Text cleaning complete. Kept 103240/103325 documents
Text cleaning complete. Kept 103240/103325 documents
Creating TF-IDF embeddings
Creating TF-IDF embeddings





TF-IDF vocabulary size: 10000
TF-IDF vocabulary size: 10000
Creating enhanced features
Creating enhanced features


Creating enhanced features: 100%|██████████| 103240/103240 [00:00<00:00, 109815.30it/s]


Combining embeddings
Combining embeddings
Final embedding shape: (103240, 10003)
Final embedding shape: (103240, 10003)
Building similarity graph
Building similarity graph
Adding nodes to graph
Adding nodes to graph


Adding nodes: 100%|██████████| 103240/103240 [00:00<00:00, 571798.99it/s]

Adding edges to graph
Adding edges to graph



Adding edges:   0%|          | 9711453/5329197180 [06:25<58:40:42, 25181.91it/s] 


KeyboardInterrupt: 