In [None]:
# Install necessary packages
!pip install datasets scikit-learn sentence-transformers matplotlib umap-learn pandas seaborn hdbscan

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import time
import gc
import os

# Memory management function
def memory_status():
    """Print and return the memory footprint of the current process in GB.

    ``psutil`` is preferred because it reports the *current* resident set size.
    It is not part of this notebook's install cell, so when it is missing the
    function falls back to the standard-library ``resource`` module (which
    reports the *peak* resident set size) instead of raising ImportError.

    Returns:
        float: memory usage in GB, or ``nan`` when no backend is available.
    """
    try:
        import psutil
    except ImportError:
        psutil = None

    if psutil is not None:
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        label = "Current memory usage"
    else:
        try:
            import resource
        except ImportError:
            # `resource` is Unix-only; without it there is nothing to measure with.
            print("Current memory usage: unavailable (install psutil)")
            return float("nan")
        import sys

        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
        rss_bytes = peak if sys.platform == "darwin" else peak * 1024
        label = "Peak memory usage (psutil not installed)"

    # Convert to GB for readability
    memory_gb = rss_bytes / (1024 ** 3)
    print(f"{label}: {memory_gb:.2f} GB")
    return memory_gb

# Force garbage collection
def force_gc():
    """Run a full garbage-collection pass, then report memory and the collector's count."""
    n_unreachable = gc.collect()
    memory_status()
    print(f"Garbage collected: {n_unreachable} objects")

# Data processing and ML libraries
from datasets import load_dataset
import hdbscan
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import umap

# Seed NumPy's global RNG; the np.random.choice / np.random.shuffle calls in
# later cells draw from this global state.
np.random.seed(42)

# Print the memory baseline before any data is loaded
memory_status()

In [None]:
def clean_text(text):
    """Normalise a tweet for embedding: strip noise, collapse whitespace, lower-case.

    Steps, in order:
      1. ``None`` becomes ``""``; anything else is passed through ``str``.
      2. HTML entities are decoded (``&amp;`` -> ``&``) so they are not mangled
         by the ampersand/semicolon rules below.
      3. CJK ideographs, URLs, the ``RT : `` prefix and a common mojibake
         apostrophe are removed or repaired.
      4. Wallet addresses are removed *before* lower-casing: the Bitcoin
         Base58 character class is case-sensitive, so lower-casing first would
         stop addresses containing e.g. ``L`` from matching.
      5. ``&`` becomes `` and ``; quotes, semicolons, ``.x`` / ``.X``, ``@`` and
         ``|`` are dropped; runs of dots collapse to ``...``.
      6. Whitespace is collapsed last, so earlier removals cannot leave double
         spaces behind, and the result is lower-cased.

    Args:
        text: Raw tweet text (any object; ``None`` is allowed).

    Returns:
        str: The cleaned, lower-cased text.
    """
    import html  # local import keeps this cell self-contained

    text = "" if text is None else str(text)
    text = html.unescape(text)
    text = re.sub(r'[\u4e00-\u9fff]+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'RT : ', '', text)
    text = re.sub(r'â€™', '\'', text)

    # Must run on the original casing (see docstring, step 4).
    wallet_pattern = r'0x[a-fA-F0-9]{40}|[13][a-km-zA-HJ-NP-Z1-9]{25,34}'
    text = re.sub(wallet_pattern, '', text)

    # Pad with spaces so "A&B" becomes "A and B"; extra spaces are collapsed below.
    text = re.sub(r'&', ' and ', text)
    text = re.sub(r'[";]', '', text)
    text = re.sub(r'\.[Xx]', '', text)
    text = re.sub(r'\.\.+', '...', text)
    text = re.sub(r'@|\|', '', text)

    return re.sub(r'\s+', ' ', text).strip().lower()

# Function to map sentiment labels (for later evaluation)
def sentiment_map(text):
    """Translate a sentiment string into its integer class id.

    The check is a case-sensitive substring test, tried in order:
    'Bullish' -> 0, 'Neutral' -> 1. Anything else is treated as Bearish (2).
    """
    for class_id, keyword in enumerate(('Bullish', 'Neutral')):
        if keyword in text:
            return class_id
    return 2  # Bearish

In [None]:
# Load the tweets, filter and clean them, and keep only the cleaned texts
# and their integer sentiment labels in memory.
print("Loading dataset...")
start_time = time.time()

data = load_dataset("StephanAkkerman/financial-tweets-crypto")
train_dataset = data['train'].filter(lambda row: row['sentiment'] is not None)
memory_status()

# Quote tweets are dropped when the dataset exposes the column for it
if 'tweet_type' not in train_dataset.features:
    print("Warning: 'tweet_type' column not found, skipping quote tweet filter.")
else:
    train_dataset = train_dataset.filter(lambda row: row['tweet_type'] != 'quote tweet')

# Keep only rows whose description has more than one space-separated token
train_dataset = train_dataset.filter(
    lambda row: row['description'] is not None and len(row['description'].split(' ')) > 1
)
memory_status()

train_df = train_dataset.to_pandas()
force_gc()

train_df['description_cleaned'] = train_df['description'].apply(clean_text)
force_gc()

# De-duplicate on the cleaned text and renumber the index
train_df = train_df.drop_duplicates(subset=['description_cleaned'], ignore_index=True)
force_gc()

texts = train_df['description_cleaned'].tolist()
true_labels = train_df['sentiment'].apply(sentiment_map).tolist()

# Drop the references to the large intermediates so they can be collected
train_df = train_dataset = data = None
force_gc()


print(f"Total samples: {len(texts)}")
print(f"Dataset loaded and processed in {time.time() - start_time:.2f} seconds")

# Report how the three sentiment classes are distributed
sentiment_counts = Counter(true_labels)
print("\nSentiment distribution in original dataset:")
for sentiment in sorted(sentiment_counts):
    count = sentiment_counts[sentiment]
    sentiment_name = ["Bullish", "Neutral", "Bearish"][sentiment]
    print(f"{sentiment_name}: {count} samples ({count/len(true_labels)*100:.2f}%)")

memory_status()

In [None]:
# Sample the data to fit memory constraints
def sample_data(texts, true_labels, sample_size=25000, random_state=None):
    """Draw a stratified sample of exactly ``sample_size`` examples.

    Each class receives a share proportional to its frequency. Shares are
    allocated with the largest-remainder method using integer arithmetic, so
    the per-class counts always add up to ``sample_size`` (flooring every
    class independently would silently return fewer examples).

    Args:
        texts: Sequence of texts.
        true_labels: Sequence of integer labels (0, 1 or 2), aligned with ``texts``.
        sample_size: Number of examples to keep.
        random_state: Optional seed for a private RNG. When ``None`` the global
            ``np.random`` state is used, so ``np.random.seed`` still applies.

    Returns:
        tuple: ``(sampled_texts, sampled_labels)`` as lists. When the input
        already has at most ``sample_size`` items it is returned unchanged.

    Raises:
        ValueError: If ``texts`` and ``true_labels`` differ in length.
    """
    if len(texts) != len(true_labels):
        raise ValueError(
            f"texts and true_labels must have the same length "
            f"({len(texts)} != {len(true_labels)})"
        )

    total = len(texts)
    if total <= sample_size:
        print(f"Dataset has {total} examples (<= {sample_size}); using all of them.")
        return texts, true_labels

    print(f"Sampling {sample_size} examples from {total} total...")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    labels = np.asarray(true_labels)
    unique_labels, counts = np.unique(labels, return_counts=True)
    counts = counts.astype(np.int64)

    # Largest-remainder allocation: floor every proportional share, then hand
    # the leftover slots to the classes with the biggest remainders.
    quotas = sample_size * counts // total
    remainders = sample_size * counts % total
    shortfall = sample_size - int(quotas.sum())
    if shortfall > 0:
        top_up = np.argsort(-remainders, kind="stable")[:shortfall]
        quotas[top_up] += 1

    chosen = [
        rng.choice(np.flatnonzero(labels == label), size=int(quota), replace=False)
        for label, quota in zip(unique_labels, quotas)
    ]
    indices = np.concatenate(chosen)
    rng.shuffle(indices)  # avoid leaving the sample grouped by class

    sampled_texts = [texts[i] for i in indices]
    sampled_labels = [true_labels[i] for i in indices]

    print(f"Sampled {len(sampled_texts)} examples")

    # Check sentiment distribution in sample
    sentiment_counts = Counter(sampled_labels)
    print("Sentiment distribution in sample:")
    for sentiment, count in sorted(sentiment_counts.items()):
        sentiment_name = ["Bullish", "Neutral", "Bearish"][sentiment]
        print(f"{sentiment_name}: {count} samples ({count/len(sampled_labels)*100:.2f}%)")

    return sampled_texts, sampled_labels

# Downsample using sample_data's default sample_size. This rebinds `texts` and
# `true_labels`, so re-running the cell operates on the already-sampled lists.
texts, true_labels = sample_data(texts, true_labels)
force_gc()  # Collect the full-size lists that were just replaced

In [None]:
# Extract features using Sentence Transformers with memory optimization
def extract_features(texts, model_name='all-MiniLM-L6-v2', batch_size=128):
    """Encode ``texts`` with a SentenceTransformer model, one slice at a time.

    Args:
        texts: Sequence of strings to embed.
        model_name: Name passed to ``SentenceTransformer``.
        batch_size: Number of texts handed to ``encode`` per call.

    Returns:
        The per-slice outputs of ``encode`` stacked row-wise with ``np.vstack``.
    """
    print(f"Extracting features using {model_name}...")
    t0 = time.time()

    encoder = SentenceTransformer(model_name)

    n_batches = (len(texts) + batch_size - 1) // batch_size
    chunks = []

    for batch_no, offset in enumerate(range(0, len(texts), batch_size)):
        # Progress and memory are reported on every 10th slice only
        checkpoint = batch_no % 10 == 0
        if checkpoint:
            print(f"Processing batch {batch_no + 1}/{n_batches}...")
            memory_status()

        chunks.append(
            encoder.encode(texts[offset:offset + batch_size], show_progress_bar=False)
        )

        if checkpoint:
            force_gc()

    # Stack the per-slice results into a single matrix
    print("Combining batches...")
    stacked = np.vstack(chunks)

    print(f"Feature extraction completed in {time.time() - t0:.2f} seconds")
    print(f"Embedding shape: {stacked.shape}")

    return stacked

# Embed the texts, then drop the raw strings since only the vectors are needed
embeddings = extract_features(texts)
del texts
force_gc()  # Free memory

# L2-normalise every row truly in place. `embeddings = embeddings / norms`
# would allocate a second full-size matrix before the first one is released.
print("Normalizing embeddings...")
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# Clamp zero norms so an all-zero row stays zero instead of becoming NaN,
# which would break the downstream UMAP / HDBSCAN steps.
np.maximum(norms, np.finfo(norms.dtype).eps, out=norms)
embeddings /= norms
del norms
force_gc()  # Free memory

memory_status()

In [None]:
# Reduce dimensionality with UMAP for better clustering
def reduce_dimensions(embeddings, n_components=20, random_state=42):
    """Project ``embeddings`` down to ``n_components`` dimensions with UMAP.

    Args:
        embeddings: Matrix passed to ``UMAP.fit_transform``.
        n_components: Target dimensionality.
        random_state: Seed forwarded to UMAP.

    Returns:
        The output of ``UMAP.fit_transform``.
    """
    print(f"Reducing dimensions to {n_components} components...")
    t0 = time.time()

    # Small neighbourhood plus low_memory mode, chosen to limit memory use
    umap_settings = dict(
        n_components=n_components,
        random_state=random_state,
        n_neighbors=10,
        min_dist=0.1,
        metric='euclidean',
        low_memory=True,
    )
    projected = umap.UMAP(**umap_settings).fit_transform(embeddings)

    print(f"Dimensionality reduction completed in {time.time() - t0:.2f} seconds")
    print(f"Reduced embedding shape: {projected.shape}")

    return projected

# Project the embeddings to 20 dimensions; the original matrix is deleted
# afterwards, so later cells work on `reduced_embeddings` only.
reduced_embeddings = reduce_dimensions(embeddings, n_components=20)
del embeddings
force_gc()  # Free memory (this already prints the memory status once)

memory_status()

In [None]:
# Implement HDBSCAN Clustering with memory optimization
def run_hdbscan(embeddings, min_cluster_size=15, min_samples=5):
    """Cluster ``embeddings`` with HDBSCAN and print a summary of the result.

    Args:
        embeddings: Matrix passed to ``HDBSCAN.fit_predict``.
        min_cluster_size: Forwarded to HDBSCAN.
        min_samples: Forwarded to HDBSCAN.

    Returns:
        The label array returned by ``fit_predict``; ``-1`` marks noise points.
    """
    print(f"Running HDBSCAN with min_cluster_size={min_cluster_size}, min_samples={min_samples}...")
    start_time = time.time()
    memory_status()

    # Initialize the model with memory-efficient settings
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',  # More conservative than 'leaf'
        alpha=1.0,  # Conservative cluster selection
        approx_min_span_tree=True,  # Memory optimization
        gen_min_span_tree=False,  # Memory optimization - don't store the full tree
        algorithm='best',  # Let it choose the best algorithm
        core_dist_n_jobs=2  # Limit parallelism to save memory
    )

    # Fit the model
    print("Fitting HDBSCAN model...")
    cluster_labels = clusterer.fit_predict(embeddings)

    # Count once and reuse the tally for both the summary and the breakdown
    counter = Counter(cluster_labels.tolist())
    n_noise = counter.get(-1, 0)
    n_clusters = len(counter) - (1 if -1 in counter else 0)

    print(f"Number of clusters: {n_clusters}")
    print(f"Number of noise points: {n_noise} ({n_noise/len(embeddings)*100:.2f}%)")

    print("\nCluster distribution:")
    for cluster, count in sorted(counter.items()):
        if cluster == -1:
            print(f"Noise points: {count} samples ({count/len(embeddings)*100:.2f}%)")
        else:
            print(f"Cluster {cluster}: {count} samples ({count/len(embeddings)*100:.2f}%)")

    print(f"Clustering completed in {time.time() - start_time:.2f} seconds")

    # Release the fitted model as a whole. `condensed_tree_` and
    # `single_linkage_tree_` are read-only properties on hdbscan.HDBSCAN, so
    # `del clusterer.condensed_tree_` raises AttributeError instead of freeing
    # anything.
    del clusterer
    force_gc()

    return cluster_labels

# Try different min_cluster_size values (lighter than full parameter search)
def find_best_hdbscan_params(embeddings, true_labels):
    """Grid-search HDBSCAN settings and keep the run with the highest ARI.

    Every (min_cluster_size, min_samples) pair from a small fixed grid is
    fitted. Runs with fewer than two clusters or more than 50% noise are
    skipped; the rest are scored against ``true_labels``.

    Returns:
        tuple: ``(min_cluster_size, min_samples, labels)`` of the best run, or
        ``(15, 5, None)`` when no run qualified.
    """
    print("Finding best HDBSCAN parameters...")

    # Candidate grid, iterated with min_cluster_size as the outer dimension
    grid = [(size, samples) for size in (10, 15, 20, 30) for samples in (5, 10)]

    top_score, top_params, top_labels = -1, None, None

    for min_cluster_size, min_samples in grid:
        print(f"\nTrying min_cluster_size={min_cluster_size}, min_samples={min_samples}")

        try:
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                metric='euclidean',
                cluster_selection_method='eom',
                gen_min_span_tree=False,
                approx_min_span_tree=True,
                core_dist_n_jobs=2
            )
            labels = clusterer.fit_predict(embeddings)

            # Summarise the structure of this run
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = list(labels).count(-1)
            noise_ratio = n_noise / len(embeddings)
            print(f"Found {n_clusters} clusters, {n_noise} noise points ({noise_ratio:.2%})")

            poor_structure = n_clusters < 2 or noise_ratio > 0.5
            if poor_structure:
                print("Skipping: Poor clustering structure")
            else:
                try:
                    ari = adjusted_rand_score(true_labels, labels)
                    nmi = normalized_mutual_info_score(true_labels, labels)
                    print(f"ARI: {ari:.4f}, NMI: {nmi:.4f}")

                    # ARI decides the winner; NMI is reported for information only
                    if ari > top_score:
                        top_score = ari
                        top_params = (min_cluster_size, min_samples)
                        top_labels = labels.copy()
                except Exception as e:
                    print(f"Error calculating metrics: {e}")

            # Release this run before starting the next one
            del clusterer, labels
            force_gc()

        except Exception as e:
            print(f"Error running HDBSCAN: {e}")
            force_gc()

    if top_params is None:
        print("No suitable parameters found. Using defaults.")
        return 15, 5, None

    print(f"\nBest parameters: min_cluster_size={top_params[0]}, min_samples={top_params[1]}")
    print(f"Best ARI score: {top_score:.4f}")
    return top_params[0], top_params[1], top_labels

# Search the parameter grid; `best_labels` is None when no run qualified
min_cluster_size, min_samples, best_labels = find_best_hdbscan_params(reduced_embeddings, true_labels)

# Fall back to a single run with the returned (default) parameters when the
# search produced no labels; otherwise reuse the labels from the best run.
if best_labels is None:
    cluster_labels = run_hdbscan(reduced_embeddings, min_cluster_size=min_cluster_size, min_samples=min_samples)
else:
    cluster_labels = best_labels

force_gc()  # Free memory

In [None]:
# Visualize clusters using PCA for 2D representation (memory efficient)
def visualize_clusters(embeddings, labels, max_points=3000):
    """Scatter-plot clusters on the first two principal components.

    Args:
        embeddings: 2-D array-like of points (one row per sample).
        labels: Cluster label per row (array or list); ``-1`` marks noise.
        max_points: If there are more rows than this, a random subset of this
            size is plotted (drawn from the global ``np.random`` state).
    """
    print("Visualizing clusters...")
    start_time = time.time()
    memory_status()

    # Accept plain lists as well: the fancy indexing and the boolean masks
    # below both require NumPy arrays.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)

    # Sample data for visualization
    if len(embeddings) > max_points:
        indices = np.random.choice(len(embeddings), max_points, replace=False)
        sample_embeddings = embeddings[indices]
        sample_labels = labels[indices]
    else:
        sample_embeddings = embeddings
        sample_labels = labels

    # Reduce to 2D for visualization using PCA
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(sample_embeddings)

    # Clear memory
    del sample_embeddings
    force_gc()

    fig, ax = plt.subplots(figsize=(10, 8))

    # Grey is reserved for noise. It is deliberately NOT part of the cluster
    # palette, otherwise every cluster whose colour index wraps around to it
    # (clusters 6, 13, ...) would be indistinguishable from noise.
    noise_color = '#7f7f7f'
    cluster_palette = ['#2077B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']

    # Plot noise points first (if any) so clusters are drawn on top
    noise_mask = sample_labels == -1
    if np.any(noise_mask):
        ax.scatter(
            reduced_data[noise_mask, 0],
            reduced_data[noise_mask, 1],
            s=5,  # Smaller points
            c=noise_color,
            alpha=0.3,
            label='Noise'
        )

    # Plot points for each cluster
    for label in sorted(set(sample_labels.tolist())):
        if label == -1:  # Skip noise points (already plotted)
            continue
        mask = sample_labels == label
        ax.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=10,  # Smaller points
            c=cluster_palette[label % len(cluster_palette)],
            alpha=0.6,
            label=f'Cluster {label}'
        )

    ax.set_title('HDBSCAN Clustering Visualization', fontsize=14)
    ax.set_xlabel('Principal Component 1', fontsize=12)
    ax.set_ylabel('Principal Component 2', fontsize=12)
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)

    print(f"Visualization completed in {time.time() - start_time:.2f} seconds")
    plt.show()

    # Clear memory
    del reduced_data
    force_gc()

# Visualize clusters alongside ground truth
def visualize_clusters_with_ground_truth(embeddings, cluster_labels, true_labels, max_points=3000):
    """Plot predicted clusters and true sentiment side by side on a shared PCA projection.

    Args:
        embeddings: 2-D array-like of points (one row per sample).
        cluster_labels: Cluster label per row; ``-1`` marks noise.
        true_labels: Sentiment id per row (0 Bullish, 1 Neutral, 2 Bearish).
        max_points: If there are more rows than this, a random subset of this
            size is plotted (drawn from the global ``np.random`` state).
    """
    print("Visualizing clusters and ground truth side-by-side...")
    start_time = time.time()
    memory_status()

    # Convert to numpy arrays if needed
    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)

    # Sample data for visualization
    if len(embeddings) > max_points:
        indices = np.random.choice(len(embeddings), max_points, replace=False)
        sample_embeddings = embeddings[indices]
        sample_cluster_labels = cluster_labels[indices]
        sample_true_labels = true_labels[indices]
    else:
        sample_embeddings = embeddings
        sample_cluster_labels = cluster_labels
        sample_true_labels = true_labels

    # Reduce to 2D for visualization using PCA
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(sample_embeddings)

    # Clear memory
    del sample_embeddings
    force_gc()

    # Create a figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    # Grey is reserved for noise and kept out of the cluster palette, so that
    # clusters whose colour index wraps around (6, 13, ...) are never drawn in
    # the noise colour.
    noise_color = '#7f7f7f'
    cluster_palette = ['#2077B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']

    # Colors for sentiment labels (Bullish, Neutral, Bearish)
    sentiment_colors = ['#00CC00', '#FFD700', '#FF4500']  # Green, Gold, Red-Orange
    sentiment_names = ["Bullish", "Neutral", "Bearish"]

    # Plot 1: Predicted Clusters (noise first, so clusters are drawn on top)
    noise_mask = sample_cluster_labels == -1
    if np.any(noise_mask):
        ax1.scatter(
            reduced_data[noise_mask, 0],
            reduced_data[noise_mask, 1],
            s=5,
            c=noise_color,
            alpha=0.3,
            label='Noise'
        )

    for label in sorted(set(sample_cluster_labels.tolist())):
        if label == -1:  # Skip noise points (already plotted)
            continue
        mask = sample_cluster_labels == label
        ax1.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=10,
            c=cluster_palette[label % len(cluster_palette)],
            alpha=0.6,
            label=f'Cluster {label}'
        )

    ax1.set_title('HDBSCAN Clustering Results', fontsize=14)
    ax1.set_xlabel('Principal Component 1', fontsize=10)
    ax1.set_ylabel('Principal Component 2', fontsize=10)
    ax1.legend(loc='best', fontsize=8)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Ground Truth Sentiment Labels
    for label in sorted(set(sample_true_labels.tolist())):
        mask = sample_true_labels == label
        ax2.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=10,
            c=sentiment_colors[label % len(sentiment_colors)],
            alpha=0.6,
            label=sentiment_names[label]
        )

    ax2.set_title('Ground Truth Sentiment Labels', fontsize=14)
    ax2.set_xlabel('Principal Component 1', fontsize=10)
    ax2.set_ylabel('Principal Component 2', fontsize=10)
    ax2.legend(loc='best', fontsize=8)
    ax2.grid(True, alpha=0.3)

    # Adjust layout
    plt.tight_layout()
    print(f"Visualization completed in {time.time() - start_time:.2f} seconds")
    plt.show()

    # Clear memory
    del reduced_data
    force_gc()

# Plot the cluster assignment on a 2-D PCA projection of the reduced embeddings
visualize_clusters(reduced_embeddings, cluster_labels)

# Same projection method, with the true sentiment labels shown next to the clusters
visualize_clusters_with_ground_truth(reduced_embeddings, cluster_labels, true_labels)

In [None]:
# Analyze clusters in relation to true sentiment labels (memory efficient)
def analyze_clusters(cluster_labels, true_labels):
    """Compare cluster assignments with the true sentiment labels.

    Prints the sentiment breakdown of every cluster, ARI / NMI with and
    without noise points, and draws a cluster-by-sentiment heatmap.

    The "dominant sentiment" printed per cluster and the returned mapping are
    both derived from the same count matrix row via ``argmax``, so they always
    agree (ties resolve to the lowest sentiment id).

    Args:
        cluster_labels: Cluster label per sample; ``-1`` marks noise.
        true_labels: Sentiment id per sample (0 Bullish, 1 Neutral, 2 Bearish).

    Returns:
        dict: ``{cluster_id: sentiment_id}`` for every non-noise cluster, with
        plain Python ints as keys and values.
    """
    print("Analyzing clusters in relation to true sentiment labels...")
    memory_status()

    # Convert to numpy arrays for easier manipulation
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)

    # Define sentiment names for readability
    sentiment_names = {0: "Bullish", 1: "Neutral", 2: "Bearish"}
    n_sentiments = len(sentiment_names)

    # Sorted unique cluster ids as plain ints (noise, if present, comes first)
    clusters = np.unique(cluster_labels).tolist()

    # Rows: clusters (in `clusters` order); columns: sentiment ids
    conf_matrix = np.zeros((len(clusters), n_sentiments), dtype=int)

    for i, cluster in enumerate(clusters):
        cluster_true_sentiments = true_labels[cluster_labels == cluster]
        total = len(cluster_true_sentiments)

        if cluster == -1:
            print(f"\nNoise points analysis ({total} samples):")
        else:
            print(f"\nCluster {cluster} analysis ({total} samples):")

        for sentiment in range(n_sentiments):
            count = int(np.sum(cluster_true_sentiments == sentiment))
            conf_matrix[i, sentiment] = count
            percentage = (count/total*100) if total > 0 else 0
            print(f"{sentiment_names[sentiment]}: {count} samples ({percentage:.2f}%)")

        # Dominant sentiment, taken from the matrix row so that it matches the
        # mapping built below and never refers to an id outside sentiment_names.
        if total > 0:
            dominant_sentiment = int(np.argmax(conf_matrix[i]))
            dominant_percent = conf_matrix[i, dominant_sentiment] / total * 100
            print(f"Dominant sentiment: {sentiment_names[dominant_sentiment]} ({dominant_percent:.2f}%)")

    # Calculate metrics (excluding noise points)
    if -1 in clusters:
        non_noise_mask = cluster_labels != -1
        if np.sum(non_noise_mask) > 0:  # Make sure there are non-noise points
            ari = adjusted_rand_score(true_labels[non_noise_mask], cluster_labels[non_noise_mask])
            nmi = normalized_mutual_info_score(true_labels[non_noise_mask], cluster_labels[non_noise_mask])
            print(f"\nMetrics (excluding noise points):")
            print(f"Adjusted Rand Index: {ari:.4f}")
            print(f"Normalized Mutual Information: {nmi:.4f}")

    # Calculate metrics for all points (noise is scored as its own cluster)
    ari_all = adjusted_rand_score(true_labels, cluster_labels)
    nmi_all = normalized_mutual_info_score(true_labels, cluster_labels)
    print(f"\nMetrics (including noise points):")
    print(f"Adjusted Rand Index: {ari_all:.4f}")
    print(f"Normalized Mutual Information: {nmi_all:.4f}")

    # Visualize the count matrix
    plt.figure(figsize=(10, 8))

    cluster_labels_str = [
        "Noise" if cluster == -1 else f"Cluster {cluster}" for cluster in clusters
    ]

    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=[sentiment_names[i] for i in range(n_sentiments)],
        yticklabels=cluster_labels_str
    )
    plt.xlabel('True Sentiment', fontsize=12)
    plt.ylabel('Cluster', fontsize=12)
    plt.title('Cluster vs. True Sentiment Distribution', fontsize=14)
    plt.tight_layout()
    plt.show()

    # Map every non-noise cluster to its most frequent sentiment
    cluster_to_sentiment = {
        cluster: int(np.argmax(conf_matrix[i]))
        for i, cluster in enumerate(clusters)
        if cluster != -1
    }

    print("\nProposed cluster-to-sentiment mapping:")
    for cluster, sentiment in cluster_to_sentiment.items():
        print(f"Cluster {cluster} → {sentiment_names[sentiment]}")

    return cluster_to_sentiment

# Build the cluster -> dominant-sentiment mapping used by the cells below
cluster_to_sentiment = analyze_clusters(cluster_labels, true_labels)
force_gc()  # Free memory

In [None]:
def visualize_sentiment_grouped_clusters(embeddings, cluster_labels, true_labels, cluster_to_sentiment, max_points=5000):
    """
    Visualize clusters grouped by their mapped sentiment, compared to ground truth.

    Args:
        embeddings: 2-D array-like of points (one row per sample).
        cluster_labels: Cluster label per row; ``-1`` marks noise.
        true_labels: Sentiment id per row (0 Bullish, 1 Neutral, 2 Bearish).
        cluster_to_sentiment: Mapping ``cluster_id -> sentiment_id``; clusters
            missing from it are treated as noise (``-1``).
        max_points: If there are more rows than this, a random subset of this
            size is plotted (drawn from the global ``np.random`` state).

    Side effects:
        Saves ``sentiment_comparison.png`` and, when at least one sampled point
        has a mapped sentiment, ``sentiment_confusion_matrix.png`` to the
        current working directory.
    """
    # Only name used here that the notebook's import cell does not provide
    from sklearn.metrics import confusion_matrix

    print("Visualizing clusters grouped by sentiment...")
    start_time = time.time()
    memory_status()

    # Convert to numpy arrays
    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)

    # Sample data for visualization
    if len(embeddings) > max_points:
        indices = np.random.choice(len(embeddings), max_points, replace=False)
        sample_embeddings = embeddings[indices]
        sample_cluster_labels = cluster_labels[indices]
        sample_true_labels = true_labels[indices]
    else:
        sample_embeddings = embeddings
        sample_cluster_labels = cluster_labels
        sample_true_labels = true_labels

    # Map clusters to their sentiment (noise and unmapped clusters -> -1)
    mapped_sentiment = np.zeros_like(sample_cluster_labels)
    for i, cluster in enumerate(sample_cluster_labels):
        if cluster == -1:
            mapped_sentiment[i] = -1  # Noise
        else:
            mapped_sentiment[i] = cluster_to_sentiment.get(cluster, -1)

    # Reduce to 2D for visualization using UMAP for better separation
    try:
        reducer = umap.UMAP(
            n_components=2,
            random_state=42,
            n_neighbors=15,
            min_dist=0.1
        )
        reduced_data = reducer.fit_transform(sample_embeddings)
    except Exception as exc:
        # Fall back to PCA if UMAP fails, and say so. A bare `except:` would
        # hide the reason and also swallow KeyboardInterrupt.
        print(f"UMAP failed ({exc}); falling back to PCA.")
        reducer = PCA(n_components=2, random_state=42)
        reduced_data = reducer.fit_transform(sample_embeddings)

    # Create a figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

    # Define colors for sentiment labels
    sentiment_colors = {
        -1: '#7f7f7f',  # Gray for noise
        0: '#00CC00',   # Green for Bullish
        1: '#FFD700',   # Gold for Neutral
        2: '#FF4500'    # Red-Orange for Bearish
    }
    sentiment_names = {-1: "Noise", 0: "Bullish", 1: "Neutral", 2: "Bearish"}

    # Plot 1: Grouped by mapped sentiment
    for sentiment in sorted(set(mapped_sentiment.tolist())):
        mask = mapped_sentiment == sentiment
        ax1.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=15,
            c=sentiment_colors[sentiment],
            alpha=0.7,
            label=sentiment_names[sentiment],
            edgecolors='none'  # Remove edges for cleaner look
        )

    ax1.set_title('Clusters Grouped by Mapped Sentiment', fontsize=14)
    ax1.set_xlabel('Dimension 1', fontsize=10)
    ax1.set_ylabel('Dimension 2', fontsize=10)
    ax1.legend(loc='best', fontsize=10)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Ground Truth Sentiment Labels
    for label in sorted(set(sample_true_labels.tolist())):
        mask = sample_true_labels == label
        ax2.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=15,
            c=sentiment_colors[label],
            alpha=0.7,
            label=sentiment_names[label],
            edgecolors='none'
        )

    ax2.set_title('Ground Truth Sentiment Labels', fontsize=14)
    ax2.set_xlabel('Dimension 1', fontsize=10)
    ax2.set_ylabel('Dimension 2', fontsize=10)
    ax2.legend(loc='best', fontsize=10)
    ax2.grid(True, alpha=0.3)

    # Adjust layout
    plt.tight_layout()
    plt.savefig('sentiment_comparison.png', dpi=300, bbox_inches='tight')
    print(f"Visualization completed in {time.time() - start_time:.2f} seconds")
    plt.show()

    # Count how many (non-noise) clusters were mapped to each sentiment.
    # This uses ALL cluster labels, not just the plotted sample.
    print("\nDistribution of clusters by mapped sentiment:")
    real_clusters = set(cluster_labels.tolist()) - {-1}
    sentiment_counts = {0: 0, 1: 0, 2: 0}  # Bullish, Neutral, Bearish

    for cluster in real_clusters:
        sentiment = cluster_to_sentiment.get(cluster, -1)
        if sentiment in sentiment_counts:
            sentiment_counts[sentiment] += 1

    # The denominator excludes the noise label so the shares of real clusters
    # add up to 100%; it is guarded for the case of no clusters at all.
    n_real_clusters = len(real_clusters)
    for sentiment, count in sentiment_counts.items():
        share = count / n_real_clusters * 100 if n_real_clusters else 0.0
        print(f"{sentiment_names[sentiment]}: {count} clusters ({share:.1f}%)")

    # Confusion matrix over the sampled points that received a sentiment
    non_noise_mask = mapped_sentiment != -1
    if np.any(non_noise_mask):
        conf_mat = confusion_matrix(
            sample_true_labels[non_noise_mask],
            mapped_sentiment[non_noise_mask],
            labels=[0, 1, 2]
        )

        # Plot confusion matrix
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            conf_mat,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=["Bullish", "Neutral", "Bearish"],
            yticklabels=["Bullish", "Neutral", "Bearish"]
        )
        plt.xlabel('Mapped Sentiment (Predicted)', fontsize=12)
        plt.ylabel('True Sentiment', fontsize=12)
        plt.title('Confusion Matrix: True vs. Mapped Sentiment', fontsize=14)
        plt.tight_layout()
        plt.savefig('sentiment_confusion_matrix.png', dpi=300, bbox_inches='tight')
        plt.show()

    # Clear memory
    del reduced_data
    force_gc()

# Plot mapped vs. true sentiment; this call also writes
# sentiment_comparison.png (and possibly sentiment_confusion_matrix.png) to disk.
visualize_sentiment_grouped_clusters(reduced_embeddings, cluster_labels, true_labels, cluster_to_sentiment)

In [None]:
def investigate_sentiment_mapping_issue(embeddings, cluster_labels, true_labels, cluster_to_sentiment, max_points=5000):
    """
    Draw a three-panel diagnostic of the cluster-to-sentiment mapping.

    Panels: true sentiment, sentiment mapped from clusters, and the points
    whose mapped sentiment differs from the true one. Also prints both label
    distributions, a confusion matrix with per-sentiment accuracy, and the
    overall accuracy. Points mapped to noise (-1) never equal a true label, so
    they count as errors in the third panel and in the overall accuracy.
    The figure is saved as 'sentiment_mapping_investigation.png'.
    """
    import umap
    from sklearn.metrics import confusion_matrix

    print("Investigating sentiment mapping issues...")

    cluster_labels = np.array(cluster_labels)
    true_labels = np.array(true_labels)

    # Work on a random subset when there are more points than max_points
    if len(embeddings) > max_points:
        picked = np.random.choice(len(embeddings), max_points, replace=False)
        points = embeddings[picked]
        sample_clusters = cluster_labels[picked]
        sample_truth = true_labels[picked]
    else:
        points, sample_clusters, sample_truth = embeddings, cluster_labels, true_labels

    # Translate each cluster id into a sentiment id; noise and unmapped -> -1
    mapped = np.fromiter(
        (-1 if c == -1 else cluster_to_sentiment.get(c, -1) for c in sample_clusters),
        dtype=sample_clusters.dtype,
        count=len(sample_clusters),
    )

    # 2-D projection used by all three panels
    coords = umap.UMAP(n_components=2, random_state=42, min_dist=0.1).fit_transform(points)

    fig, axes = plt.subplots(1, 3, figsize=(24, 8))

    sentiment_colors = {
        -1: '#7f7f7f',  # Gray for noise
        0: '#00CC00',   # Green for Bullish
        1: '#FFD700',   # Gold for Neutral
        2: '#FF4500'    # Red-Orange for Bearish
    }
    sentiment_names = {-1: "Noise", 0: "Bullish", 1: "Neutral", 2: "Bearish"}

    def _scatter_by_label(ax, values, title):
        """Draw one scatter series per distinct value, then title/legend/grid."""
        for value in sorted(set(values)):
            selected = values == value
            ax.scatter(
                coords[selected, 0],
                coords[selected, 1],
                s=15, c=sentiment_colors[value], alpha=0.7,
                label=f"{sentiment_names[value]} (n={np.sum(selected)})",
                edgecolors='none'
            )
        ax.set_title(title, fontsize=14)
        ax.legend(loc='best')
        ax.grid(True, alpha=0.3)

    # Panels 1 and 2: ground truth and mapped sentiment
    _scatter_by_label(axes[0], sample_truth, 'Ground Truth Sentiment')
    _scatter_by_label(axes[1], mapped, 'Mapped Sentiment from Clusters')

    # Panel 3: agreements in light gray, disagreements coloured by true label
    agrees = mapped == sample_truth
    disagrees = ~agrees

    axes[2].scatter(
        coords[agrees, 0],
        coords[agrees, 1],
        s=10, c='lightgray', alpha=0.3, label='Correct')

    for value in sorted(set(sample_truth)):
        if value == -1:  # Skip noise
            continue
        wrong_here = (sample_truth == value) & disagrees
        if np.sum(wrong_here) > 0:
            axes[2].scatter(
                coords[wrong_here, 0],
                coords[wrong_here, 1],
                s=20, c=sentiment_colors[value], alpha=0.8,
                label=f"True {sentiment_names[value]} (n={np.sum(wrong_here)})",
                edgecolors='black', linewidth=0.5
            )

    axes[2].set_title('Mapping Errors (colored by true sentiment)', fontsize=14)
    axes[2].legend(loc='best')
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('sentiment_mapping_investigation.png', dpi=300, bbox_inches='tight')
    plt.show()

    def _print_distribution(values):
        """Print count and share of every distinct value in ``values``."""
        for value in sorted(set(values)):
            n_value = np.sum(values == value)
            print(f"{sentiment_names[value]}: {n_value} samples ({n_value/len(values)*100:.1f}%)")

    print("\nSentiment distribution analysis:")

    print("\nGround truth sentiment distribution:")
    _print_distribution(sample_truth)

    print("\nMapped sentiment distribution:")
    _print_distribution(mapped)

    # Confusion matrix over points that have both a mapped and a true sentiment
    keep = (mapped != -1) & (sample_truth != -1)
    if keep.any():
        kept_truth = sample_truth[keep]
        conf_mat = confusion_matrix(kept_truth, mapped[keep], labels=[0, 1, 2])

        print("\nConfusion Matrix:")
        print(conf_mat)

        # Per-sentiment accuracy: diagonal entry over the number of true samples
        for sentiment in (0, 1, 2):
            true_count = np.sum(kept_truth == sentiment)
            correct_count = conf_mat[sentiment, sentiment]
            if true_count > 0:
                accuracy = correct_count / true_count * 100
                print(f"Accuracy for {sentiment_names[sentiment]}: {accuracy:.1f}% ({correct_count}/{true_count})")

    # Overall accuracy over every sampled point (noise included)
    overall_accuracy = np.mean(mapped == sample_truth) * 100
    print(f"\nOverall accuracy: {overall_accuracy:.1f}%")

# Run the three-panel mapping diagnostic; this call also writes
# sentiment_mapping_investigation.png to disk.
investigate_sentiment_mapping_issue(reduced_embeddings, cluster_labels, true_labels, cluster_to_sentiment)