In [6]:
# Import required libraries
import json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from tqdm import tqdm

print("Libraries imported successfully!")

Libraries imported successfully!


In [7]:
# Load the data with embeddings
def load_data_with_embeddings(file_path):
    """Load a JSONL file (one JSON object per line) into a list.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSONL file to read.

    Returns
    -------
    list
        The parsed JSON value of every line that could be decoded, in file
        order. Blank lines are ignored; lines that are not valid JSON are
        skipped and their count is reported on stdout.
    """
    data = []
    skipped = 0
    print(f"Loading data from {file_path}...")

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                # Blank separator lines carry no record.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Keep loading (best effort), but remember that data was lost.
                skipped += 1

    print(f"Loaded {len(data)} posts")
    if skipped:
        print(f"Warning: skipped {skipped} malformed line(s)")
    return data

# Read every post record from the JSONL export into memory
embeddings_path = 'data/data_with_embeddings.jsonl'
data = load_data_with_embeddings(embeddings_path)

Loading data from data/data_with_embeddings.jsonl...


8799it [00:01, 8636.61it/s] 

Loaded 8799 posts





In [8]:
# Collect the embedding vectors used for clustering and t-SNE
print("Extracting embeddings...")

# Pair each usable vector with the position of its post in `data`, so that
# results computed on the array can be mapped back to the original records.
indexed_vectors = [
    (position, post['embedding'])
    for position, post in enumerate(tqdm(data))
    if 'embedding' in post and post['embedding'] is not None
]
valid_indices = [position for position, _ in indexed_vectors]
embeddings = [vector for _, vector in indexed_vectors]

embeddings_array = np.array(embeddings)
print(f"Shape of embeddings array: {embeddings_array.shape}")
print(f"Number of posts with valid embeddings: {len(valid_indices)}")

Extracting embeddings...


100%|██████████| 8799/8799 [00:00<00:00, 1553204.03it/s]


Shape of embeddings array: (8799, 1024)
Number of posts with valid embeddings: 8799


In [9]:
# K-means over the embedding vectors; k=7 is taken from embedding-cluster.ipynb
best_k = 7
print(f"Performing K-means clustering with k={best_k}...")

# Fixed random_state keeps the cluster assignment reproducible between runs
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(embeddings_array)

print("Clustering completed!")
print("Cluster distribution:")
# One line per cluster label with the number of posts assigned to it
unique, counts = np.unique(cluster_labels, return_counts=True)
print("\n".join(
    f"  Cluster {label}: {size} posts" for label, size in zip(unique, counts)
))

Performing K-means clustering with k=7...
Clustering completed!
Cluster distribution:
  Cluster 0: 1447 posts
  Cluster 1: 1072 posts
  Cluster 2: 1713 posts
  Cluster 3: 1710 posts
  Cluster 4: 101 posts
  Cluster 5: 1107 posts
  Cluster 6: 1649 posts


In [10]:
# Project the embeddings down to two dimensions with t-SNE
print("Performing t-SNE dimensionality reduction (this may take a few minutes)...")

# All tunables gathered in one place; random_state pins the layout.
# NOTE(review): `max_iter` is the newer name of TSNE's iteration limit
# (older scikit-learn releases call it `n_iter`) - confirm the installed version.
tsne_settings = dict(
    n_components=2,
    random_state=42,
    perplexity=30,
    max_iter=10000,
    verbose=1,
)
tsne = TSNE(**tsne_settings)
tsne_coordinates = tsne.fit_transform(embeddings_array)
print(f"t-SNE completed! Shape: {tsne_coordinates.shape}")

Performing t-SNE dimensionality reduction (this may take a few minutes)...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 8799 samples in 0.001s...
[t-SNE] Computed neighbors for 8799 samples in 0.368s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8799
[t-SNE] Computed conditional probabilities for sample 2000 / 8799
[t-SNE] Computed conditional probabilities for sample 3000 / 8799
[t-SNE] Computed conditional probabilities for sample 4000 / 8799
[t-SNE] Computed conditional probabilities for sample 5000 / 8799
[t-SNE] Computed conditional probabilities for sample 6000 / 8799
[t-SNE] Computed conditional probabilities for sample 7000 / 8799
[t-SNE] Computed conditional probabilities for sample 8000 / 8799
[t-SNE] Computed conditional probabilities for sample 8799 / 8799
[t-SNE] Mean sigma: 0.254013
[t-SNE] KL divergence after 250 iterations with early exaggeration: 92.960083
[t-SNE] KL divergence after 10000 iterations: 2.332460
t-SNE completed! Shape: (879

In [11]:
# Create complete data entries: each post plus cluster id, readable date,
# interaction amount and t-SNE coordinates.
print("Creating complete data entries...")


def _is_missing(value):
    """Return True for None and for float NaN.

    NaN needs an explicit check because it is truthy, so `value or 0`
    and `if value:` do not catch it.
    """
    return value is None or (isinstance(value, float) and value != value)


def _format_utc(timestamp):
    """Format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS' in UTC.

    Returns None when the timestamp is missing (None/NaN) or zero.
    """
    if _is_missing(timestamp) or not timestamp:
        return None
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
    # the format string has no offset field, so the rendered text is the same.
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')


def _count_or_zero(value):
    """Return `value` for use in a sum, treating None/NaN/falsy values as 0."""
    return 0 if _is_missing(value) else (value or 0)


complete_data = []

# `idx` addresses rows of cluster_labels / tsne_coordinates, while
# `original_idx` addresses the matching post in `data`.
for idx, original_idx in enumerate(tqdm(valid_indices)):
    # Shallow copy so the loaded record itself is not modified.
    item = data[original_idx].copy()

    # Add cluster ID (plain int so it serializes to JSON)
    item['cluster_id'] = int(cluster_labels[idx])

    # Convert created_utc to readable date
    item['date'] = _format_utc(item.get('created_utc'))

    # Add interaction amount (score + num_comments)
    score = _count_or_zero(item.get('score'))
    num_comments = _count_or_zero(item.get('num_comments'))
    item['interaction_amount'] = score + num_comments

    # Add t-SNE coordinates (plain floats so they serialize to JSON)
    item['tsne_x'] = float(tsne_coordinates[idx, 0])
    item['tsne_y'] = float(tsne_coordinates[idx, 1])

    complete_data.append(item)

print(f"Created {len(complete_data)} complete data entries")

Creating complete data entries...


100%|██████████| 8799/8799 [00:00<00:00, 245297.08it/s]

Created 8799 complete data entries





In [12]:
# Show the first enriched record as a quick sanity check
print("Sample complete entry:")
sample = dict(complete_data[0])

# The raw vector is far too long to print; show only its length instead
if 'embedding' in sample:
    vector_length = len(sample['embedding'])
    sample['embedding'] = f"[{vector_length} dimensional vector]"

print(json.dumps(sample, indent=2, default=str))

Sample complete entry:
{
  "approved_at_utc": null,
  "subreddit": "Anarchism",
  "selftext": " What you are reading, watching, or listening to? Or how far have you gotten in your chosen selection since last week?",
  "author_fullname": "t2_6l4z3",
  "saved": false,
  "mod_reason_title": null,
  "gilded": 0,
  "clicked": false,
  "title": "What Are You Reading/Book Club Tuesday",
  "link_flair_richtext": [],
  "subreddit_name_prefixed": "r/Anarchism",
  "hidden": false,
  "pwls": NaN,
  "link_flair_css_class": NaN,
  "downs": 0,
  "thumbnail_height": NaN,
  "top_awarded_type": null,
  "hide_score": false,
  "name": "t3_1is5wgo",
  "quarantine": false,
  "link_flair_text_color": "dark",
  "upvote_ratio": 1.0,
  "author_flair_background_color": NaN,
  "subreddit_type": "public",
  "ups": 2,
  "total_awards_received": 0,
  "media_embed": {},
  "thumbnail_width": NaN,
  "author_flair_template_id": NaN,
  "is_original_content": false,
  "user_reports": [],
  "secure_media": null,
  "is_redd

In [13]:
# Write the enriched records to data_complete.jsonl, one JSON object per line
output_file = 'data/data_complete.jsonl'
print(f"Saving complete data to {output_file}...")

# default=str stringifies any value json cannot encode natively.
# NOTE(review): float NaN values are written as bare `NaN` tokens, which
# strict JSON parsers reject - confirm downstream readers accept them.
with open(output_file, 'w') as f:
    f.writelines(
        json.dumps(record, default=str) + '\n' for record in tqdm(complete_data)
    )

print(f"✓ Complete data saved to {output_file}")
print(f"  Total records: {len(complete_data)}")

Saving complete data to data/data_complete.jsonl...


100%|██████████| 8799/8799 [00:02<00:00, 3874.11it/s]

✓ Complete data saved to data/data_complete.jsonl
  Total records: 8799





In [14]:
# Print an overview of the enriched dataset
banner = "=" * 60
print("\n" + banner)
print("DATA COMPLETE SUMMARY")
print(banner)

# Number of posts per cluster, listed in ascending cluster id
print("\nCluster Distribution:")
assigned_clusters = [entry['cluster_id'] for entry in complete_data]
cluster_counts = {
    cid: assigned_clusters.count(cid) for cid in sorted(set(assigned_clusters))
}
for cid, total in cluster_counts.items():
    print(f"  Cluster {cid}: {total} posts")

# Earliest and latest post date; the 'YYYY-MM-DD HH:MM:SS' strings sort
# chronologically, so plain string min/max is enough
dates = [entry['date'] for entry in complete_data if entry['date']]
if dates:
    earliest, latest = min(dates), max(dates)
    print(f"\nDate Range: {earliest} to {latest}")

# Spread of the interaction amount (score + num_comments)
interactions = [entry['interaction_amount'] for entry in complete_data]
print(f"\nInteraction Amount Stats:")
print(f"  Min: {min(interactions)}")
print(f"  Max: {max(interactions)}")
print(f"  Mean: {np.mean(interactions):.2f}")
print(f"  Median: {np.median(interactions):.2f}")

# Extent of the 2D projection along each axis
tsne_x_vals = [entry['tsne_x'] for entry in complete_data]
tsne_y_vals = [entry['tsne_y'] for entry in complete_data]
print(f"\nt-SNE Coordinate Ranges:")
for axis_name, axis_values in (("X", tsne_x_vals), ("Y", tsne_y_vals)):
    print(f"  {axis_name}: [{min(axis_values):.2f}, {max(axis_values):.2f}]")

print("\n" + banner)


DATA COMPLETE SUMMARY

Cluster Distribution:
  Cluster 0: 1447 posts
  Cluster 1: 1072 posts
  Cluster 2: 1713 posts
  Cluster 3: 1710 posts
  Cluster 4: 101 posts
  Cluster 5: 1107 posts
  Cluster 6: 1649 posts

Date Range: 2024-07-23 20:39:04 to 2025-02-18 11:45:33

Interaction Amount Stats:
  Min: 0
  Max: 54052
  Mean: 457.12
  Median: 82.00

t-SNE Coordinate Ranges:
  X: [-143.14, 162.55]
  Y: [-128.68, 174.55]



In [None]:
# Interactive visualization with Plotly - with a working subreddit filter.
# Uses `complete_data` built by the earlier cells; plotly (px, go), pandas and
# numpy come from the notebook's import cell.

# The dropdown offers "All" plus this many subreddits, chosen by post count.
MAX_SUBREDDIT_BUTTONS = 20


def _clean_text(value, default='', limit=None):
    """Return `value` as a string, optionally truncated to `limit` characters.

    None and float NaN are replaced by `default`, so missing fields neither
    crash the slicing nor show up as 'None'/'nan' in the hover text.
    """
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return default
    text = str(value)
    return text if limit is None else text[:limit]


def _per_trace_arrays(frame, cluster_ids):
    """Build Plotly `restyle` arguments with one data array per cluster trace.

    `frame` is the (possibly filtered) plotting DataFrame and `cluster_ids`
    the cluster order in which the traces were added to the figure. Each value
    in the returned dict is a list with one entry per trace, so applying it
    replaces the points of every trace at once.
    """
    parts = [frame[frame['cluster_id'] == cid] for cid in cluster_ids]
    return {
        'x': [part['tsne_x'].tolist() for part in parts],
        'y': [part['tsne_y'].tolist() for part in parts],
        'text': [part['hover_text'].tolist() for part in parts],
        'marker.size': [part['size'].tolist() for part in parts],
        'customdata': [
            part[['subreddit', 'year_month']].values.tolist() for part in parts
        ],
    }


# Create DataFrame for Plotly
df = pd.DataFrame({
    'tsne_x': [item['tsne_x'] for item in complete_data],
    'tsne_y': [item['tsne_y'] for item in complete_data],
    'cluster_id': [item['cluster_id'] for item in complete_data],
    'interaction_amount': [item['interaction_amount'] for item in complete_data],
    'title': [_clean_text(item.get('title'), 'No title', 80) for item in complete_data],
    'selftext': [_clean_text(item.get('selftext'), '', 200) for item in complete_data],
    'score': [item.get('score', 0) for item in complete_data],
    'num_comments': [item.get('num_comments', 0) for item in complete_data],
    # 'date' can be present but None; a None here would turn the whole
    # concatenated hover text of that row into NaN.
    'date': [item.get('date') or 'N/A' for item in complete_data],
    'subreddit': [_clean_text(item.get('subreddit'), 'unknown') for item in complete_data],
    'created_utc': [item.get('created_utc', 0) for item in complete_data]
})

# Convert to datetime for filtering
df['datetime'] = pd.to_datetime(df['created_utc'], unit='s', errors='coerce')
df['year_month'] = df['datetime'].dt.to_period('M').astype(str)

# Log-scale marker sizes into the range [3, 28]. Missing/negative amounts are
# treated as 0, and an all-zero dataset falls back to the minimum size instead
# of dividing by zero.
interaction = df['interaction_amount'].fillna(0).clip(lower=0)
log_max = np.log1p(interaction.max())
df['size'] = (3 + 25 * np.log1p(interaction) / log_max) if log_max > 0 else 3.0

# Create custom hover text
df['hover_text'] = (
    '<b>Cluster ' + df['cluster_id'].astype(str) + '</b><br>' +
    '<b>Subreddit:</b> r/' + df['subreddit'] + '<br>' +
    '<b>Title:</b> ' + df['title'] + '<br>' +
    '<b>Content:</b> ' + df['selftext'] + '...<br>' +
    '<b>Score:</b> ' + df['score'].astype(str) + '<br>' +
    '<b>Comments:</b> ' + df['num_comments'].astype(str) + '<br>' +
    '<b>Total Interactions:</b> ' + df['interaction_amount'].astype(str) + '<br>' +
    '<b>Date:</b> ' + df['date']
)

# Dropdown options: "All" followed by the most frequent subreddits (A-Z)
top_subreddits = (
    df['subreddit'].value_counts().head(MAX_SUBREDDIT_BUTTONS).index.tolist()
)
subreddits = ['All'] + sorted(top_subreddits)

# Define color palette
colors = px.colors.qualitative.Set1

# Trace order == ascending cluster id; the restyle arrays rely on this order
cluster_ids = sorted(df['cluster_id'].unique().tolist())

# Create figure with all data initially
fig = go.Figure()

# Add one scatter trace per cluster
for cluster_id in cluster_ids:
    cluster_df = df[df['cluster_id'] == cluster_id]

    fig.add_trace(go.Scatter(
        x=cluster_df['tsne_x'],
        y=cluster_df['tsne_y'],
        mode='markers',
        name=f'Cluster {cluster_id}',
        marker=dict(
            size=cluster_df['size'],
            color=colors[cluster_id % len(colors)],
            opacity=0.7,
            line=dict(width=0.5, color='white')
        ),
        text=cluster_df['hover_text'],
        hovertemplate='%{text}<extra></extra>',
        legendgroup=f'cluster_{cluster_id}',
        customdata=cluster_df[['subreddit', 'year_month']].values.tolist()
    ))

# Create dropdown for subreddit filter. Each button swaps the points of every
# cluster trace for the subset belonging to the chosen subreddit, so cluster
# colors and legend toggling keep working while filtered.
subreddit_buttons = []
for subreddit in subreddits:
    subset = df if subreddit == 'All' else df[df['subreddit'] == subreddit]
    subreddit_buttons.append(dict(
        label=f'r/{subreddit}' if subreddit != 'All' else 'All Subreddits',
        method='restyle',
        args=[_per_trace_arrays(subset, cluster_ids)]
    ))

# Update layout with controls
fig.update_layout(
    title=dict(
        text='<b>Reddit Posts Clustered by Topic</b><br><sub>Point size = interaction amount | Use dropdown to filter by subreddit</sub>',
        font=dict(size=20)
    ),
    template='plotly_white',
    width=1200,
    height=900,
    legend=dict(
        title='Clusters<br><sub>(click to toggle)</sub>',
        yanchor='top',
        y=0.99,
        xanchor='left',
        x=1.02,
        bgcolor='rgba(255,255,255,0.8)'
    ),
    hoverlabel=dict(
        bgcolor='white',
        font_size=12,
        font_family='Arial'
    ),
    updatemenus=[
        dict(
            buttons=subreddit_buttons,
            direction='down',
            showactive=True,
            x=0.0,
            xanchor='left',
            y=1.15,
            yanchor='top',
            bgcolor='white',
            bordercolor='lightgray',
            font=dict(size=11)
        )
    ],
    annotations=[
        dict(
            text='Filter by Subreddit:',
            x=0.0,
            xref='paper',
            y=1.18,
            yref='paper',
            showarrow=False,
            font=dict(size=12)
        )
    ],
    # t-SNE axes have no interpretable units, so ticks and grid are hidden
    xaxis=dict(showticklabels=False, showgrid=False, zeroline=False, title=''),
    yaxis=dict(showticklabels=False, showgrid=False, zeroline=False, title='')
)

# Save as interactive HTML
fig.write_html('data/cluster_tsne_interactive.html')

# Show the figure
fig.show()

print("Interactive visualization saved to data/cluster_tsne_interactive.html")

Interactive visualization saved to data/cluster_tsne_interactive.html
