In [None]:
# Install necessary packages
!pip install datasets scikit-learn sentence-transformers matplotlib umap-learn pandas seaborn

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import time

# Data processing and ML libraries
from datasets import load_dataset
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import umap

# Set random seed for reproducibility.
# This seeds NumPy's global RNG, which the plotting cell's np.random.choice
# subsampling draws from; UMAP and PCA are given their own explicit random_state.
np.random.seed(42)

In [None]:
# Text cleaning function
def clean_text(text):
    """Normalise a raw tweet into a lower-cased, whitespace-collapsed string.

    Steps, in order: drop CJK ideographs, URLs and the literal 'RT : ' marker;
    turn ampersands into 'and'; repair a mis-encoded apostrophe; strip quotes,
    semicolons, '@' and '|'; drop '.x'/'.X' fragments; squeeze runs of dots to
    '...'; remove crypto wallet addresses; finally collapse whitespace and
    lower-case.

    Args:
        text: Any object; ``None`` is treated as the empty string, everything
            else is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    text = str(text) if text is not None else ""
    text = re.sub(r'[\u4e00-\u9fff]+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'RT : ', '', text)
    # Match the HTML entity first so '&amp;' becomes 'and' rather than 'andamp'.
    text = re.sub(r'&amp;|&', 'and', text)
    text = re.sub(r'â€™', '\'', text)
    text = re.sub(r'["&;]', '', text)
    # NOTE: a zero-width-space removal step was left disabled in the original code.
    text = re.sub(r'\.[Xx]', '', text)
    text = re.sub(r'\.\.+', '...', text)
    text = re.sub(r'@|\|', '', text)
    # Wallet addresses must be removed BEFORE lower-casing: the Base58 class below
    # is case-sensitive (it excludes lowercase 'l' but allows uppercase 'L'), so a
    # lower-cased address containing 'L' would no longer match. '0[xX]' keeps
    # matching the upper-case '0X' prefix that lower-casing used to normalise.
    wallet_pattern = r'0[xX][a-fA-F0-9]{40}|[13][a-km-zA-HJ-NP-Z1-9]{25,34}'
    text = re.sub(wallet_pattern, '', text)
    # Collapse whitespace last so gaps left by the removals above are cleaned up.
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text


# Function to map sentiment labels (for later evaluation)
def sentiment_map(text):
    """Translate a sentiment label string into an integer code.

    Returns 0 when the label contains 'Bullish', 1 when it contains 'Neutral'
    (and not 'Bullish'), and 2 (Bearish) for anything else.
    """
    # Keywords are checked in priority order; the position is the code.
    for code, keyword in enumerate(('Bullish', 'Neutral')):
        if keyword in text:
            return code
    return 2  # Bearish

In [None]:
# Load dataset with memory management
import gc
import sys

# `memory_status` and `force_gc` are called throughout this cell but are not defined
# in any earlier cell, so a fresh "Restart & Run All" failed with NameError.
# Minimal standard-library implementations are provided here.


def force_gc():
    """Run a full garbage-collection pass; returns the number of unreachable objects found."""
    return gc.collect()


def memory_status():
    """Print this process's peak resident memory, when the platform exposes it."""
    try:
        import resource  # Unix-only standard-library module
    except ImportError:
        print("Memory status: not available on this platform")
        return
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
    peak_mb = peak / (1024 ** 2) if sys.platform == "darwin" else peak / 1024
    print(f"Peak memory usage: {peak_mb:.1f} MB")


print("Loading dataset...")
start_time = time.time()

data = load_dataset("StephanAkkerman/financial-tweets-crypto")
train_dataset = data['train']
# Keep only rows that carry a sentiment label (needed later for evaluation).
train_dataset = train_dataset.filter(lambda row: row['sentiment'] is not None)
memory_status()

if 'tweet_type' in train_dataset.features:
    train_dataset = train_dataset.filter(lambda x: x['tweet_type'] != 'quote tweet')
else:
    print("Warning: 'tweet_type' column not found, skipping quote tweet filter.")

# Drop rows with a missing description or a description of a single space-separated token.
train_dataset = train_dataset.filter(lambda x: x['description'] is not None and len(x['description'].split(' ')) > 1)
memory_status()

train_df = train_dataset.to_pandas()
force_gc()

train_df['description_cleaned'] = train_df['description'].apply(clean_text)
force_gc()

# De-duplicate on the cleaned text (first occurrence wins) and renumber the index.
train_df = train_df.drop_duplicates(subset=['description_cleaned'], ignore_index=True)
force_gc()

texts = train_df['description_cleaned'].tolist()

true_labels = train_df['sentiment'].apply(sentiment_map).tolist()
# Release the large intermediates; only `texts` and `true_labels` are used downstream.
train_df = None
train_dataset = None
data = None
force_gc()


print(f"Total samples: {len(texts)}")
print(f"Dataset loaded and processed in {time.time() - start_time:.2f} seconds")

# Check sentiment distribution
sentiment_counts = Counter(true_labels)
print("\nSentiment distribution in original dataset:")
for sentiment, count in sorted(sentiment_counts.items()):
    sentiment_name = ["Bullish", "Neutral", "Bearish"][sentiment]
    print(f"{sentiment_name}: {count} samples ({count/len(true_labels)*100:.2f}%)")

memory_status()

In [None]:
# Extract features using Sentence Transformers
def extract_features(texts, model_name='all-MiniLM-L6-v2', batch_size=256):
    """Encode texts into sentence embeddings, batch by batch.

    Args:
        texts: Sequence of strings to embed; must not be empty.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Number of texts passed to ``model.encode`` per call
            (default 256, the value previously hard-coded).

    Returns:
        A 2-D NumPy array with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty. This is raised before the model is
            loaded, instead of the obscure ``np.vstack`` failure on an empty list.
    """
    if len(texts) == 0:
        raise ValueError("extract_features() requires at least one text")

    print(f"Extracting features using {model_name}...")
    start_time = time.time()

    model = SentenceTransformer(model_name)

    # Process in batches to avoid memory issues
    embeddings = []

    total_batches = (len(texts) + batch_size - 1) // batch_size
    for i in range(0, len(texts), batch_size):
        # Report progress on every 10th batch only.
        if i % (10 * batch_size) == 0:
            print(f"Processing batch {i//batch_size + 1}/{total_batches}...")

        batch = texts[i:i+batch_size]
        batch_embeddings = model.encode(batch, show_progress_bar=False)
        embeddings.append(batch_embeddings)

    # Combine all batches
    embeddings = np.vstack(embeddings)

    print(f"Feature extraction completed in {time.time() - start_time:.2f} seconds")
    print(f"Embedding shape: {embeddings.shape}")

    return embeddings

# Get embeddings
embeddings = extract_features(texts)

# L2-normalize each row. A zero-norm row is divided by 1 (left as zeros)
# instead of by 0, which would fill the row with NaN.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norm_embeddings = embeddings / np.where(row_norms == 0, 1.0, row_norms)
print("Embeddings normalized")

In [None]:
# Reduce dimensionality with UMAP for better clustering
def reduce_dimensions(embeddings, n_components=50, random_state=42):
    """Project embeddings into ``n_components`` dimensions with UMAP.

    UMAP is fitted with n_neighbors=15 and min_dist=0.1, plus the supplied
    ``random_state``. Timing and the resulting shape are printed.

    Returns:
        The reduced embedding matrix produced by ``fit_transform``.
    """
    print(f"Reducing dimensions to {n_components} components...")
    t0 = time.time()

    # Gather the UMAP configuration in one place before building the reducer.
    umap_settings = {
        'n_components': n_components,
        'random_state': random_state,
        'n_neighbors': 15,
        'min_dist': 0.1,
    }
    projected = umap.UMAP(**umap_settings).fit_transform(embeddings)

    elapsed = time.time() - t0
    print(f"Dimensionality reduction completed in {elapsed:.2f} seconds")
    print(f"Reduced embedding shape: {projected.shape}")

    return projected

# Apply dimensionality reduction to the L2-normalized embeddings.
# The normalized matrix was computed in the previous cell but never used;
# the raw `embeddings` were passed here instead.
reduced_embeddings = reduce_dimensions(norm_embeddings)

In [None]:
# Implement BIRCH Clustering
def run_birch_clustering(embeddings, n_clusters=3, threshold=0.5, branching_factor=50):
    """Fit BIRCH on ``embeddings`` and report the cluster sizes and silhouette score.

    Args:
        embeddings: 2-D feature matrix, one row per sample.
        n_clusters: Number of final clusters requested from BIRCH.
        threshold: BIRCH sub-cluster radius threshold.
        branching_factor: Maximum number of sub-clusters per CF-tree node.

    Returns:
        ``(cluster_labels, cluster_centers, birch)``: the per-sample labels, the
        fitted model's ``subcluster_centers_`` (sub-cluster centroids, which may
        be far more numerous than ``n_clusters``), and the fitted model itself.
    """
    print(f"Running BIRCH Clustering with {n_clusters} clusters...")
    print(f"Parameters: threshold={threshold}, branching_factor={branching_factor}")
    start_time = time.time()

    # Initialize and fit the model
    birch = Birch(
        n_clusters=n_clusters,
        threshold=threshold,
        branching_factor=branching_factor
    )
    cluster_labels = birch.fit_predict(embeddings)

    # Get cluster centers (subcluster centers as approximation)
    cluster_centers = birch.subcluster_centers_

    # Count samples in each cluster
    counter = Counter(cluster_labels)
    print("\nCluster distribution:")
    for cluster, count in sorted(counter.items()):
        print(f"Cluster {cluster}: {count} samples ({count/len(embeddings)*100:.2f}%)")

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1. BIRCH can
    # return fewer clusters than requested (e.g. with a large threshold), so guard
    # the call instead of letting it raise and abort the cell.
    if 2 <= len(counter) < len(embeddings):
        silhouette = silhouette_score(embeddings, cluster_labels)
        print(f"Silhouette score: {silhouette:.4f}")
    else:
        print(f"Silhouette score: undefined ({len(counter)} cluster(s) found)")

    print(f"Clustering completed in {time.time() - start_time:.2f} seconds")

    return cluster_labels, cluster_centers, birch

# Run BIRCH clustering on the UMAP-reduced embeddings with the function's defaults
# (n_clusters=3, threshold=0.5, branching_factor=50).
# NOTE: the parameter-tuning cell further down reassigns these same three names.
cluster_labels, cluster_centers, birch_model = run_birch_clustering(reduced_embeddings)

In [None]:
# Try different BIRCH parameters (optional)
def tune_birch_parameters(embeddings, labels_true=None, thresholds=(0.3, 0.5, 0.7, 1.0),
                          branching_factors=(50, 100, 200), n_clusters=3):
    """Grid-search BIRCH ``threshold`` / ``branching_factor`` and pick the best by silhouette.

    For every combination a BIRCH model is fitted, then the silhouette score and
    the ARI / NMI against the ground-truth labels are computed. Combinations whose
    metrics cannot be computed are reported and skipped.

    Args:
        embeddings: 2-D feature matrix, one row per sample.
        labels_true: Ground-truth labels aligned with ``embeddings``. When omitted,
            the module-level ``true_labels`` is used (the original behaviour).
        thresholds: Threshold values to try.
        branching_factors: Branching factors to try.
        n_clusters: Number of final clusters (3 for the sentiment task).

    Returns:
        ``(best_threshold, best_branching_factor)`` as a Python ``float`` and
        ``int``; ``(0.5, 50)`` if no combination produced valid metrics.
    """
    print("Tuning BIRCH parameters...")

    if labels_true is None:
        # Backward-compatible fallback to the notebook-level ground truth.
        labels_true = true_labels

    # Store results
    results = []

    for threshold in thresholds:
        for branching_factor in branching_factors:
            print(f"\nTrying threshold={threshold}, branching_factor={branching_factor}")

            # Run BIRCH
            birch = Birch(
                n_clusters=n_clusters,
                threshold=threshold,
                branching_factor=branching_factor
            )
            labels = birch.fit_predict(embeddings)

            # Calculate metrics (best effort: e.g. silhouette_score raises when
            # fewer than two clusters were produced)
            try:
                silhouette = silhouette_score(embeddings, labels)
                ari = adjusted_rand_score(labels_true, labels)
                nmi = normalized_mutual_info_score(labels_true, labels)
            except Exception as e:
                print(f"Error calculating metrics: {e}")
                continue

            # Save results
            results.append({
                'threshold': threshold,
                'branching_factor': branching_factor,
                'silhouette': silhouette,
                'ari': ari,
                'nmi': nmi,
                'cluster_counts': dict(Counter(labels))
            })

            print(f"Silhouette: {silhouette:.4f}, ARI: {ari:.4f}, NMI: {nmi:.4f}")

    if not results:
        print("No valid results found")
        return 0.5, 50  # Default values

    # Convert to DataFrame for easy viewing
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('silhouette', ascending=False)
    print("\nTop 5 parameter combinations by silhouette score:")
    print(results_df[['threshold', 'branching_factor', 'silhouette', 'ari', 'nmi']].head(5))

    # Get best parameters. Cast explicitly: values read back from a pandas row may be
    # NumPy scalars (or floats after upcasting), while Birch requires an integer
    # branching_factor.
    best_row = results_df.iloc[0]
    best_threshold = float(best_row['threshold'])
    best_branching_factor = int(best_row['branching_factor'])

    print(f"\nBest parameters: threshold={best_threshold}, branching_factor={best_branching_factor}")
    return best_threshold, best_branching_factor

# Parameter tuning runs unconditionally here: 4 thresholds x 3 branching factors = 12
# BIRCH fits, each followed by a silhouette computation, so this will take time.
# Comment these lines out to keep the default-parameter clustering from the previous cell.
# The final run below overwrites cluster_labels, cluster_centers and birch_model.
best_threshold, best_branching_factor = tune_birch_parameters(reduced_embeddings)
cluster_labels, cluster_centers, birch_model = run_birch_clustering(
    reduced_embeddings, threshold=best_threshold, branching_factor=best_branching_factor
)

In [None]:
# Visualize clusters and ground truth side-by-side using PCA for 2D representation
def visualize_clusters_with_ground_truth(embeddings, cluster_labels, true_labels, centers=None, max_points=5000):
    """Plot predicted clusters and true sentiment labels in a shared 2-D PCA projection.

    Args:
        embeddings: 2-D feature matrix, one row per sample.
        cluster_labels: Predicted cluster label per sample (list or array).
        true_labels: Ground-truth sentiment code per sample (0/1/2; list or array).
        centers: Optional cluster centers in the same feature space; they are
            drawn only when there are at most 10 of them.
        max_points: If there are more samples than this, a random subset of this
            size is drawn (without replacement) using NumPy's global RNG.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    print("Visualizing clusters and ground truth side-by-side...")
    start_time = time.time()

    # Coerce everything to arrays up front: the boolean masks and fancy indexing
    # below silently misbehave (or raise) when labels are plain Python lists.
    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)

    # Sample data if too large
    if len(embeddings) > max_points:
        indices = np.random.choice(len(embeddings), max_points, replace=False)
        sample_embeddings = embeddings[indices]
        sample_cluster_labels = cluster_labels[indices]
        sample_true_labels = true_labels[indices]
    else:
        sample_embeddings = embeddings
        sample_cluster_labels = cluster_labels
        sample_true_labels = true_labels

    # Reduce to 2D for visualization using PCA
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(sample_embeddings)

    # Project the centers only when they will actually be drawn (few centers).
    show_centers = centers is not None and len(centers) <= 10
    centers_2d = pca.transform(centers) if show_centers else None

    # Create a figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

    # Define nice colors for the clusters
    cluster_colors = ['#2077B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']

    # Define colors for sentiment labels (Bullish, Neutral, Bearish)
    sentiment_colors = ['#00CC00', '#FFD700', '#FF4500']  # Green, Gold, Red-Orange
    sentiment_names = ["Bullish", "Neutral", "Bearish"]

    # Plot 1: Predicted Clusters
    for label in sorted(set(sample_cluster_labels)):
        mask = sample_cluster_labels == label
        ax1.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=20,
            c=cluster_colors[label % len(cluster_colors)],
            alpha=0.6,
            label=f'Cluster {label}'
        )

    # Plot cluster centers if provided
    if show_centers:
        ax1.scatter(
            centers_2d[:, 0],
            centers_2d[:, 1],
            s=150,
            c='black',
            alpha=1.0,
            marker='X',
            label='Cluster centers'
        )

    ax1.set_title('BIRCH Clustering Results', fontsize=16)
    ax1.set_xlabel('Principal Component 1', fontsize=12)
    ax1.set_ylabel('Principal Component 2', fontsize=12)
    ax1.legend(loc='best', fontsize=10)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Ground Truth Sentiment Labels
    for label in sorted(set(sample_true_labels)):
        mask = sample_true_labels == label
        ax2.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=20,
            c=sentiment_colors[label % len(sentiment_colors)],
            alpha=0.6,
            label=sentiment_names[label]
        )

    ax2.set_title('Ground Truth Sentiment Labels', fontsize=16)
    ax2.set_xlabel('Principal Component 1', fontsize=12)
    ax2.set_ylabel('Principal Component 2', fontsize=12)
    ax2.legend(loc='best', fontsize=10)
    ax2.grid(True, alpha=0.3)

    # Adjust layout
    plt.tight_layout()
    print(f"Visualization completed in {time.time() - start_time:.2f} seconds")
    plt.show()

# Visualize the clusters alongside ground truth.
# `centers` is not passed, so no cluster-center markers are drawn.
visualize_clusters_with_ground_truth(reduced_embeddings, cluster_labels, true_labels)

In [None]:
# Analyze clusters in relation to true sentiment labels
def analyze_clusters(cluster_labels, true_labels):
    """Compare cluster assignments with the true sentiment labels.

    Prints the sentiment breakdown and dominant sentiment of every cluster,
    reports ARI and NMI, draws a cluster-vs-sentiment heatmap, and proposes a
    cluster-to-sentiment mapping.

    Args:
        cluster_labels: Predicted cluster label per sample (list or array).
        true_labels: Ground-truth sentiment code per sample
            (0 = Bullish, 1 = Neutral, 2 = Bearish).

    Returns:
        dict mapping each cluster label to the code of its most frequent
        sentiment (ties resolve to the lowest sentiment code).
    """
    print("Analyzing clusters in relation to true sentiment labels...")

    # Arrays are required for the element-wise comparison and indexing below;
    # with plain lists `cluster_labels == cluster` is a single bool.
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)

    # Define sentiment names for readability
    sentiment_names = {0: "Bullish", 1: "Neutral", 2: "Bearish"}

    clusters = sorted(set(cluster_labels.tolist()))

    # Contingency table: one row per cluster (by position in `clusters`, so label
    # values need not be 0..k-1), one column per sentiment code.
    conf_matrix = np.zeros((len(clusters), len(sentiment_names)), dtype=int)
    cluster_to_sentiment = {}

    for row, cluster in enumerate(clusters):
        # True sentiments of the tweets assigned to this cluster
        cluster_true_sentiments = true_labels[cluster_labels == cluster]
        total = len(cluster_true_sentiments)

        print(f"\nCluster {cluster} analysis ({total} samples):")
        for sentiment, count in sorted(Counter(cluster_true_sentiments.tolist()).items()):
            conf_matrix[row, sentiment] = count
            print(f"{sentiment_names[sentiment]}: {count} samples ({count/total*100:.2f}%)")

        # Derive the dominant sentiment from the matrix row so the printed value
        # and the returned mapping always agree, including on ties.
        dominant_sentiment = int(np.argmax(conf_matrix[row]))
        dominant_percent = conf_matrix[row, dominant_sentiment] / total * 100
        cluster_to_sentiment[cluster] = dominant_sentiment

        print(f"Dominant sentiment: {sentiment_names[dominant_sentiment]} ({dominant_percent:.2f}%)")

    # Calculate metrics
    ari = adjusted_rand_score(true_labels, cluster_labels)
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)
    print(f"\nAdjusted Rand Index: {ari:.4f}")
    print(f"Normalized Mutual Information: {nmi:.4f}")

    # Visualize the confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=[sentiment_names[i] for i in range(len(sentiment_names))],
        yticklabels=[f'Cluster {i}' for i in clusters]
    )
    plt.xlabel('True Sentiment', fontsize=12)
    plt.ylabel('Cluster', fontsize=12)
    plt.title('Cluster vs. True Sentiment Distribution', fontsize=14)
    plt.tight_layout()
    plt.show()

    print("\nProposed cluster-to-sentiment mapping:")
    for cluster, sentiment in cluster_to_sentiment.items():
        print(f"Cluster {cluster} → {sentiment_names[sentiment]}")

    return cluster_to_sentiment

# Analyze clusters: prints per-cluster sentiment breakdowns plus ARI/NMI, shows the
# heatmap, and keeps the proposed cluster -> sentiment-code mapping.
cluster_to_sentiment = analyze_clusters(cluster_labels, true_labels)