In [None]:
!pip install datasets scikit-learn sentence-transformers matplotlib umap-learn pandas

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import time

# Data processing and ML libraries
from datasets import load_dataset
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import umap

# Seed NumPy's global RNG for reproducibility. This only affects code that draws
# from np.random.*; the UMAP, MiniBatchKMeans and PCA calls further down are
# given their own explicit random_state values.
np.random.seed(42)

In [None]:
def clean_text(text):
    """Normalize a raw tweet/description string before embedding.

    The cleaning steps run in this order: drop CJK ideographs, drop URLs,
    drop the literal ``RT : `` marker, spell out ``&`` as ``and``, repair a
    mojibake apostrophe, drop double quotes and semicolons, drop ``.x``/``.X``,
    collapse runs of dots to ``...``, drop ``@`` and ``|``, strip wallet
    addresses, and finally collapse whitespace and lowercase.

    Args:
        text: Raw text. ``None`` is treated as an empty string; any other
            value is converted with ``str``.

    Returns:
        str: The cleaned, lowercased text with single spaces and no leading
        or trailing whitespace.
    """
    text = str(text) if text is not None else ""
    text = re.sub(r'[\u4e00-\u9fff]+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'RT : ', '', text)
    text = re.sub(r'&', 'and', text)
    text = re.sub(r'â€™', '\'', text)
    # '&' can no longer occur here (replaced above); the class is kept as-is.
    text = re.sub(r'["&;]', '', text)
    text = re.sub(r'\.[Xx]', '', text)
    text = re.sub(r'\.\.+', '...', text)
    text = re.sub(r'@|\|', '', text)

    # Wallet addresses must be stripped BEFORE lowercasing: the Base58 class is
    # case-sensitive (it allows 'L' but not 'l'), so matching on lowercased
    # text left fragments of real addresses behind. Doing it before the
    # whitespace pass also prevents the removal from leaving double spaces.
    wallet_pattern = r'0[xX][a-fA-F0-9]{40}|[13][a-km-zA-HJ-NP-Z1-9]{25,34}'
    text = re.sub(wallet_pattern, '', text)

    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text


# Function to map sentiment labels (for later evaluation)
def sentiment_map(text):
    """Translate a sentiment label string into its integer class id.

    Substring checks are applied in order: a value containing 'Bullish' maps
    to 0, otherwise one containing 'Neutral' maps to 1; every other value
    falls through to 2, which the notebook treats as Bearish.
    """
    for class_id, keyword in enumerate(('Bullish', 'Neutral')):
        if keyword in text:
            return class_id
    return 2  # Bearish

In [None]:
# Load the tweet dataset, filter it, clean the text and derive integer labels.

# `force_gc` and `memory_status` are called throughout this cell but are not
# defined in any earlier cell, so a fresh "Restart & Run All" would stop with a
# NameError. Provide small stdlib fallbacks unless implementations already exist.
if "force_gc" not in globals():
    def force_gc():
        """Run a full garbage-collection pass to release unreferenced objects."""
        import gc
        gc.collect()

if "memory_status" not in globals():
    def memory_status():
        """Print the peak resident memory of this process, when available."""
        import sys
        try:
            import resource  # POSIX only
        except ImportError:
            print("Memory status: not available on this platform")
            return
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss units are platform-dependent: bytes on macOS, kilobytes on Linux.
        peak_mb = peak / (1024 ** 2) if sys.platform == "darwin" else peak / 1024
        print(f"Peak memory usage: {peak_mb:.1f} MB")

print("Loading dataset...")
start_time = time.time()

data = load_dataset("StephanAkkerman/financial-tweets-crypto")
train_dataset = data['train']
# Keep only rows that carry a sentiment label.
train_dataset = train_dataset.filter(lambda row: row['sentiment'] is not None)
memory_status()

# Drop quote tweets when the column is available.
if 'tweet_type' in train_dataset.features:
    train_dataset = train_dataset.filter(lambda x: x['tweet_type'] != 'quote tweet')
else:
    print("Warning: 'tweet_type' column not found, skipping quote tweet filter.")

# Require a description that splits into more than one space-separated piece.
train_dataset = train_dataset.filter(lambda x: x['description'] is not None and len(x['description'].split(' ')) > 1)
memory_status()

train_df = train_dataset.to_pandas()
force_gc()

train_df['description_cleaned'] = train_df['description'].apply(clean_text)
force_gc()

# De-duplicate on the cleaned text so near-identical tweets collapse to one row.
train_df = train_df.drop_duplicates(subset=['description_cleaned'], ignore_index=True)
force_gc()

texts = train_df['description_cleaned'].tolist()

true_labels = train_df['sentiment'].apply(sentiment_map).tolist()

# Release the large intermediates; only `texts` and `true_labels` are used later.
train_df = None
train_dataset = None
data = None
force_gc()


print(f"Total samples: {len(texts)}")
print(f"Dataset loaded and processed in {time.time() - start_time:.2f} seconds")

# Check sentiment distribution
sentiment_counts = Counter(true_labels)
print("\nSentiment distribution in original dataset:")
for sentiment, count in sorted(sentiment_counts.items()):
    sentiment_name = ["Bullish", "Neutral", "Bearish"][sentiment]
    print(f"{sentiment_name}: {count} samples ({count/len(true_labels)*100:.2f}%)")

memory_status()

In [None]:
# Extract features using Sentence Transformers
def extract_features(texts, model_name='all-MiniLM-L6-v2', batch_size=256):
    """Embed a list of texts with a SentenceTransformer model.

    Texts are encoded in consecutive slices of ``batch_size`` and the per-batch
    results are stacked row-wise, so row ``i`` of the result corresponds to
    ``texts[i]``.

    Args:
        texts: Sequence of strings to embed. Must not be empty.
        model_name: Model identifier passed to ``SentenceTransformer``.
        batch_size: Number of texts encoded per ``model.encode`` call
            (default 256, the value previously hard-coded).

    Returns:
        numpy.ndarray: 2-D array with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is less than 1.
    """
    # Fail fast, before the model is loaded; previously an empty input only
    # surfaced later as an opaque ValueError from np.vstack.
    if len(texts) == 0:
        raise ValueError("extract_features() received no texts to embed")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"Extracting features using {model_name}...")
    start_time = time.time()

    model = SentenceTransformer(model_name)

    total_batches = (len(texts) + batch_size - 1) // batch_size
    batch_embeddings = []
    for batch_idx, start in enumerate(range(0, len(texts), batch_size)):
        # Report progress on every 10th batch only, to keep the output short.
        if batch_idx % 10 == 0:
            print(f"Processing batch {batch_idx + 1}/{total_batches}...")

        batch = texts[start:start + batch_size]
        batch_embeddings.append(model.encode(batch, show_progress_bar=False))

    # Combine all batches into a single matrix.
    embeddings = np.vstack(batch_embeddings)

    print(f"Feature extraction completed in {time.time() - start_time:.2f} seconds")
    print(f"Embedding shape: {embeddings.shape}")

    return embeddings

# Embed every cleaned text with the default sentence-transformer model
embeddings = extract_features(texts)

# Scale each row to unit L2 norm (useful for cosine-based clustering)
norm_embeddings = np.divide(
    embeddings,
    np.linalg.norm(embeddings, axis=1, keepdims=True),
)
print("Embeddings normalized")

In [None]:
 # Reduce dimensionality with UMAP for better clustering
def reduce_dimensions(embeddings, n_components=50, random_state=42, n_neighbors=15, min_dist=0.1):
    """Project embeddings to a lower-dimensional space with UMAP.

    Args:
        embeddings: 2-D array-like with one row per sample.
        n_components: Number of output dimensions.
        random_state: Seed passed to UMAP.
        n_neighbors: UMAP ``n_neighbors`` setting (default 15, the value
            previously hard-coded).
        min_dist: UMAP ``min_dist`` setting (default 0.1, the value previously
            hard-coded).

    Returns:
        The array returned by ``UMAP.fit_transform`` for ``embeddings``.
    """
    print(f"Reducing dimensions to {n_components} components...")
    start_time = time.time()

    reducer = umap.UMAP(
        n_components=n_components,
        random_state=random_state,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
    )
    reduced_embeddings = reducer.fit_transform(embeddings)

    print(f"Dimensionality reduction completed in {time.time() - start_time:.2f} seconds")
    print(f"Reduced embedding shape: {reduced_embeddings.shape}")

    return reduced_embeddings

# Apply dimensionality reduction
# NOTE(review): this passes the raw `embeddings`; the L2-normalized
# `norm_embeddings` computed in the previous cell is not used here or anywhere
# else in the visible notebook — confirm which input is intended.
reduced_embeddings = reduce_dimensions(embeddings)

In [None]:
# Implement Mini-batch K-means
def run_minibatch_kmeans(embeddings, n_clusters=3, batch_size=1024, random_state=42, n_init=10,
                         silhouette_sample_size=None):
    """Cluster embeddings with Mini-batch K-means and report basic diagnostics.

    Prints the size of every cluster and the silhouette score.

    Args:
        embeddings: 2-D array-like with one row per sample.
        n_clusters: Number of clusters to fit.
        batch_size: Mini-batch size passed to ``MiniBatchKMeans``.
        random_state: Seed for the clustering (and for silhouette sampling).
        n_init: Number of initialisations passed to ``MiniBatchKMeans``.
        silhouette_sample_size: Optional number of samples used to estimate
            the silhouette score. ``None`` (default) scores every sample, as
            before; the full score compares all pairs of samples, so a
            subsample is much cheaper on large inputs.

    Returns:
        tuple: ``(cluster_labels, cluster_centers)`` from the fitted model.
    """
    print(f"Running Mini-batch K-means with {n_clusters} clusters...")
    start_time = time.time()

    mbk = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        random_state=random_state,
        n_init=n_init,
        max_iter=1000  # Increase for better convergence
    )
    cluster_labels = mbk.fit_predict(embeddings)

    n_samples = len(embeddings)
    counter = Counter(cluster_labels)
    print("\nCluster distribution:")
    for cluster, count in sorted(counter.items()):
        print(f"Cluster {cluster}: {count} samples ({count/n_samples*100:.2f}%)")

    # The silhouette score is only defined for 2 .. n_samples - 1 distinct
    # labels; silhouette_score raises otherwise, which used to abort the call
    # after the (expensive) fit, e.g. for n_clusters=1.
    if 2 <= len(counter) <= n_samples - 1:
        silhouette = silhouette_score(
            embeddings,
            cluster_labels,
            sample_size=silhouette_sample_size,
            random_state=random_state,
        )
        print(f"Silhouette score: {silhouette:.4f}")
    else:
        print("Silhouette score: undefined (needs between 2 and n_samples - 1 distinct clusters)")

    print(f"Clustering completed in {time.time() - start_time:.2f} seconds")

    return cluster_labels, mbk.cluster_centers_

# Run Mini-batch K-means on the UMAP-reduced embeddings, using the function's
# default number of clusters
cluster_labels, cluster_centers = run_minibatch_kmeans(reduced_embeddings)

In [None]:
# Visualize clusters using PCA for final 2D representation
def visualize_clusters(embeddings, labels, centers=None, max_points=5000):
    """Scatter-plot clustered embeddings in 2-D using a PCA projection.

    Args:
        embeddings: 2-D array-like with one row per sample.
        labels: Integer cluster label per sample (array or plain list).
        centers: Optional 2-D array of cluster centres living in the same
            space as ``embeddings``; projected with the same PCA and drawn
            as black crosses.
        max_points: If there are more samples than this, a random subset of
            this size (drawn without replacement from NumPy's global RNG) is
            plotted and used to fit the PCA.

    Raises:
        ValueError: If ``labels`` and ``embeddings`` differ in length.
    """
    print("Visualizing clusters...")
    start_time = time.time()

    # Accept list-like inputs: fancy indexing and the `== label` mask below
    # need ndarrays (a plain list compared to an int is just False, which
    # silently produced an empty plot).
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    if len(labels) != len(embeddings):
        raise ValueError(
            f"labels has {len(labels)} entries but embeddings has {len(embeddings)} rows"
        )

    # Sample data if too large
    if len(embeddings) > max_points:
        indices = np.random.choice(len(embeddings), max_points, replace=False)
        sample_embeddings = embeddings[indices]
        sample_labels = labels[indices]
    else:
        sample_embeddings = embeddings
        sample_labels = labels

    # Reduce to 2D for visualization using PCA
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(sample_embeddings)

    # Project the centres with the same fitted PCA so they share the axes
    if centers is not None:
        centers_2d = pca.transform(centers)

    plt.figure(figsize=(12, 10))

    # Palette is cycled when there are more clusters than colours
    colors = ['#2077B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']

    # One scatter call per cluster so each gets its own legend entry
    for label in np.unique(sample_labels):
        mask = sample_labels == label
        plt.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=20,
            c=colors[label % len(colors)],
            alpha=0.7,
            label=f'Cluster {label}'
        )

    # Plot cluster centers if provided
    if centers is not None:
        plt.scatter(
            centers_2d[:, 0],
            centers_2d[:, 1],
            s=200,
            c='black',
            alpha=1.0,
            marker='X',
            label='Cluster centers'
        )

    plt.title('Mini-batch K-means Clustering Visualization', fontsize=16)
    plt.xlabel('Principal Component 1', fontsize=12)
    plt.ylabel('Principal Component 2', fontsize=12)
    plt.legend(loc='best', fontsize=10)
    plt.grid(True, alpha=0.3)

    print(f"Visualization completed in {time.time() - start_time:.2f} seconds")
    plt.show()

# Visualize the clusters: plot the reduced embeddings coloured by cluster
# label, with the fitted cluster centres overlaid
visualize_clusters(reduced_embeddings, cluster_labels, cluster_centers)

In [None]:
# Analyze clusters in relation to true sentiment labels
def analyze_clusters(cluster_labels, true_labels):
    """Compare cluster assignments with the true sentiment labels.

    Prints the sentiment breakdown and dominant sentiment of every cluster,
    prints the Adjusted Rand Index and Normalized Mutual Information, shows a
    cluster-by-sentiment heatmap, and proposes a cluster-to-sentiment mapping.

    Args:
        cluster_labels: Cluster id per sample (array or plain list). Ids need
            not be contiguous.
        true_labels: Sentiment id per sample, using 0=Bullish, 1=Neutral,
            2=Bearish (array or plain list).

    Returns:
        dict: Maps each cluster id to the sentiment id with the highest count
        in that cluster (ties go to the lowest sentiment id).

    Raises:
        ValueError: If the two label sequences differ in length.
    """
    print("Analyzing clusters in relation to true sentiment labels...")

    # Convert both inputs so boolean masking works for plain lists as well
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)
    if len(cluster_labels) != len(true_labels):
        raise ValueError(
            f"cluster_labels has {len(cluster_labels)} entries but true_labels has {len(true_labels)}"
        )

    # Define sentiment names for readability
    sentiment_names = {0: "Bullish", 1: "Neutral", 2: "Bearish"}

    # Sorted distinct cluster ids actually present in the data
    clusters = np.unique(cluster_labels).tolist()

    # Rows follow the order of `clusters`, columns are sentiment ids. Rows are
    # addressed by position, not by cluster id: ids may be non-contiguous
    # (e.g. when a cluster ends up empty), which previously raised IndexError.
    conf_matrix = np.zeros((len(clusters), len(sentiment_names)), dtype=int)

    for row, cluster in enumerate(clusters):
        # True sentiments of the tweets assigned to this cluster
        cluster_true_sentiments = true_labels[cluster_labels == cluster]
        total = len(cluster_true_sentiments)

        sentiment_counts = Counter(cluster_true_sentiments.tolist())

        print(f"\nCluster {cluster} analysis ({total} samples):")
        for sentiment, count in sorted(sentiment_counts.items()):
            conf_matrix[row, sentiment] = count
            print(f"{sentiment_names[sentiment]}: {count} samples ({count/total*100:.2f}%)")

        # Use argmax here too, so the printed dominant sentiment always agrees
        # with the returned mapping (they could differ on ties before).
        dominant_sentiment = int(np.argmax(conf_matrix[row]))
        dominant_percent = conf_matrix[row, dominant_sentiment] / total * 100

        print(f"Dominant sentiment: {sentiment_names[dominant_sentiment]} ({dominant_percent:.2f}%)")

    # Calculate metrics
    ari = adjusted_rand_score(true_labels, cluster_labels)
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)
    print(f"\nAdjusted Rand Index: {ari:.4f}")
    print(f"Normalized Mutual Information: {nmi:.4f}")

    # Visualize the confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=[sentiment_names[i] for i in range(len(sentiment_names))],
        yticklabels=[f'Cluster {i}' for i in clusters]
    )
    plt.xlabel('True Sentiment', fontsize=12)
    plt.ylabel('Cluster', fontsize=12)
    plt.title('Cluster vs. True Sentiment Distribution', fontsize=14)
    plt.tight_layout()
    plt.show()

    # Map every cluster to its most frequent sentiment
    cluster_to_sentiment = {
        cluster: int(np.argmax(conf_matrix[row]))
        for row, cluster in enumerate(clusters)
    }

    print("\nProposed cluster-to-sentiment mapping:")
    for cluster, sentiment in cluster_to_sentiment.items():
        print(f"Cluster {cluster} → {sentiment_names[sentiment]}")

    return cluster_to_sentiment

# Analyze clusters: compare the cluster assignments with the true sentiment
# labels and keep the proposed cluster-to-sentiment mapping
cluster_to_sentiment = analyze_clusters(cluster_labels, true_labels)

In [None]:
# Implement Mini-batch K-means
def run_minibatch_kmeans(embeddings, n_clusters=6, batch_size=1024, random_state=42, n_init=10,
                         silhouette_sample_size=None):
    """Cluster embeddings with Mini-batch K-means and report basic diagnostics.

    Prints the size of every cluster and the silhouette score.

    NOTE(review): a function with this same name is also defined in an earlier
    cell of the notebook (there with ``n_clusters=3`` as the default); this
    later definition shadows it. Consider keeping a single definition and
    passing ``n_clusters`` explicitly.

    Args:
        embeddings: 2-D array-like with one row per sample.
        n_clusters: Number of clusters to fit.
        batch_size: Mini-batch size passed to ``MiniBatchKMeans``.
        random_state: Seed for the clustering (and for silhouette sampling).
        n_init: Number of initialisations passed to ``MiniBatchKMeans``.
        silhouette_sample_size: Optional number of samples used to estimate
            the silhouette score. ``None`` (default) scores every sample, as
            before; the full score compares all pairs of samples, so a
            subsample is much cheaper on large inputs.

    Returns:
        tuple: ``(cluster_labels, cluster_centers)`` from the fitted model.
    """
    print(f"Running Mini-batch K-means with {n_clusters} clusters...")
    start_time = time.time()

    mbk = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        random_state=random_state,
        n_init=n_init,
        max_iter=1000  # Increase for better convergence
    )
    cluster_labels = mbk.fit_predict(embeddings)

    n_samples = len(embeddings)
    counter = Counter(cluster_labels)
    print("\nCluster distribution:")
    for cluster, count in sorted(counter.items()):
        print(f"Cluster {cluster}: {count} samples ({count/n_samples*100:.2f}%)")

    # The silhouette score is only defined for 2 .. n_samples - 1 distinct
    # labels; silhouette_score raises otherwise, which used to abort the call
    # after the (expensive) fit, e.g. for n_clusters=1.
    if 2 <= len(counter) <= n_samples - 1:
        silhouette = silhouette_score(
            embeddings,
            cluster_labels,
            sample_size=silhouette_sample_size,
            random_state=random_state,
        )
        print(f"Silhouette score: {silhouette:.4f}")
    else:
        print("Silhouette score: undefined (needs between 2 and n_samples - 1 distinct clusters)")

    print(f"Clustering completed in {time.time() - start_time:.2f} seconds")

    return cluster_labels, mbk.cluster_centers_

# Re-run Mini-batch K-means with six clusters. The cluster count is passed
# explicitly so the result does not depend on which of the notebook's two
# same-named definitions (default 3 vs. default 6) was executed last.
cluster_labels, cluster_centers = run_minibatch_kmeans(reduced_embeddings, n_clusters=6)

In [None]:
# Visualize clusters using PCA for final 2D representation
def visualize_clusters(embeddings, labels, centers=None, max_points=5000):
    """Scatter-plot clustered embeddings in 2-D using a PCA projection.

    NOTE(review): a function with this same name and signature is also defined
    in an earlier cell of the notebook; this later definition shadows it.
    Consider keeping a single definition.

    Args:
        embeddings: 2-D array-like with one row per sample.
        labels: Integer cluster label per sample (array or plain list).
        centers: Optional 2-D array of cluster centres living in the same
            space as ``embeddings``; projected with the same PCA and drawn
            as black crosses.
        max_points: If there are more samples than this, a random subset of
            this size (drawn without replacement from NumPy's global RNG) is
            plotted and used to fit the PCA.

    Raises:
        ValueError: If ``labels`` and ``embeddings`` differ in length.
    """
    print("Visualizing clusters...")
    start_time = time.time()

    # Accept list-like inputs: fancy indexing and the `== label` mask below
    # need ndarrays (a plain list compared to an int is just False, which
    # silently produced an empty plot).
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    if len(labels) != len(embeddings):
        raise ValueError(
            f"labels has {len(labels)} entries but embeddings has {len(embeddings)} rows"
        )

    # Sample data if too large
    if len(embeddings) > max_points:
        indices = np.random.choice(len(embeddings), max_points, replace=False)
        sample_embeddings = embeddings[indices]
        sample_labels = labels[indices]
    else:
        sample_embeddings = embeddings
        sample_labels = labels

    # Reduce to 2D for visualization using PCA
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(sample_embeddings)

    # Project the centres with the same fitted PCA so they share the axes
    if centers is not None:
        centers_2d = pca.transform(centers)

    plt.figure(figsize=(12, 10))

    # Palette is cycled when there are more clusters than colours
    colors = ['#2077B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']

    # One scatter call per cluster so each gets its own legend entry
    for label in np.unique(sample_labels):
        mask = sample_labels == label
        plt.scatter(
            reduced_data[mask, 0],
            reduced_data[mask, 1],
            s=20,
            c=colors[label % len(colors)],
            alpha=0.7,
            label=f'Cluster {label}'
        )

    # Plot cluster centers if provided
    if centers is not None:
        plt.scatter(
            centers_2d[:, 0],
            centers_2d[:, 1],
            s=200,
            c='black',
            alpha=1.0,
            marker='X',
            label='Cluster centers'
        )

    plt.title('Mini-batch K-means Clustering Visualization', fontsize=16)
    plt.xlabel('Principal Component 1', fontsize=12)
    plt.ylabel('Principal Component 2', fontsize=12)
    plt.legend(loc='best', fontsize=10)
    plt.grid(True, alpha=0.3)

    print(f"Visualization completed in {time.time() - start_time:.2f} seconds")
    plt.show()

# Visualize the clusters from the most recent K-means run: plot the reduced
# embeddings coloured by cluster label, with the cluster centres overlaid
visualize_clusters(reduced_embeddings, cluster_labels, cluster_centers)

In [None]:
# Analyze clusters in relation to true sentiment labels
def analyze_clusters(cluster_labels, true_labels):
    """Compare cluster assignments with the true sentiment labels.

    Prints the sentiment breakdown and dominant sentiment of every cluster,
    prints the Adjusted Rand Index and Normalized Mutual Information, shows a
    cluster-by-sentiment heatmap, and proposes a cluster-to-sentiment mapping.

    NOTE(review): a function with this same name and signature is also defined
    in an earlier cell of the notebook; this later definition shadows it.
    Consider keeping a single definition.

    Args:
        cluster_labels: Cluster id per sample (array or plain list). Ids need
            not be contiguous.
        true_labels: Sentiment id per sample, using 0=Bullish, 1=Neutral,
            2=Bearish (array or plain list).

    Returns:
        dict: Maps each cluster id to the sentiment id with the highest count
        in that cluster (ties go to the lowest sentiment id).

    Raises:
        ValueError: If the two label sequences differ in length.
    """
    print("Analyzing clusters in relation to true sentiment labels...")

    # Convert both inputs so boolean masking works for plain lists as well
    cluster_labels = np.asarray(cluster_labels)
    true_labels = np.asarray(true_labels)
    if len(cluster_labels) != len(true_labels):
        raise ValueError(
            f"cluster_labels has {len(cluster_labels)} entries but true_labels has {len(true_labels)}"
        )

    # Define sentiment names for readability
    sentiment_names = {0: "Bullish", 1: "Neutral", 2: "Bearish"}

    # Sorted distinct cluster ids actually present in the data
    clusters = np.unique(cluster_labels).tolist()

    # Rows follow the order of `clusters`, columns are sentiment ids. Rows are
    # addressed by position, not by cluster id: ids may be non-contiguous
    # (e.g. when a cluster ends up empty), which previously raised IndexError.
    conf_matrix = np.zeros((len(clusters), len(sentiment_names)), dtype=int)

    for row, cluster in enumerate(clusters):
        # True sentiments of the tweets assigned to this cluster
        cluster_true_sentiments = true_labels[cluster_labels == cluster]
        total = len(cluster_true_sentiments)

        sentiment_counts = Counter(cluster_true_sentiments.tolist())

        print(f"\nCluster {cluster} analysis ({total} samples):")
        for sentiment, count in sorted(sentiment_counts.items()):
            conf_matrix[row, sentiment] = count
            print(f"{sentiment_names[sentiment]}: {count} samples ({count/total*100:.2f}%)")

        # Use argmax here too, so the printed dominant sentiment always agrees
        # with the returned mapping (they could differ on ties before).
        dominant_sentiment = int(np.argmax(conf_matrix[row]))
        dominant_percent = conf_matrix[row, dominant_sentiment] / total * 100

        print(f"Dominant sentiment: {sentiment_names[dominant_sentiment]} ({dominant_percent:.2f}%)")

    # Calculate metrics
    ari = adjusted_rand_score(true_labels, cluster_labels)
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)
    print(f"\nAdjusted Rand Index: {ari:.4f}")
    print(f"Normalized Mutual Information: {nmi:.4f}")

    # Visualize the confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=[sentiment_names[i] for i in range(len(sentiment_names))],
        yticklabels=[f'Cluster {i}' for i in clusters]
    )
    plt.xlabel('True Sentiment', fontsize=12)
    plt.ylabel('Cluster', fontsize=12)
    plt.title('Cluster vs. True Sentiment Distribution', fontsize=14)
    plt.tight_layout()
    plt.show()

    # Map every cluster to its most frequent sentiment
    cluster_to_sentiment = {
        cluster: int(np.argmax(conf_matrix[row]))
        for row, cluster in enumerate(clusters)
    }

    print("\nProposed cluster-to-sentiment mapping:")
    for cluster, sentiment in cluster_to_sentiment.items():
        print(f"Cluster {cluster} → {sentiment_names[sentiment]}")

    return cluster_to_sentiment

# Analyze clusters from the most recent K-means run against the true sentiment
# labels and keep the proposed cluster-to-sentiment mapping
cluster_to_sentiment = analyze_clusters(cluster_labels, true_labels)