In [29]:
# Import required libraries
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

## 1. Load the Data

In [30]:
# Load JSONL data
data = []
with open('data.jsonl', 'r') as f:
    for line in f:
        entry = json.loads(line)
        if 'data' in entry:
            data.append(entry['data'])
        else:
            data.append(entry)

df = pd.DataFrame(data)
print(f"Loaded {len(df)} posts")

Loaded 8799 posts


## 2. Prepare Text for Embedding

In [31]:
# Combine title and selftext into a single text field
def prepare_text(row):
    title = str(row.get('title', '')) if row.get('title') else ''
    selftext = str(row.get('selftext', '')) if row.get('selftext') else ''
    # Combine title and selftext
    combined = f"{title}. {selftext}".strip()
    # Limit length to avoid memory issues (model has 256 token limit anyway)
    return combined[:2000] if combined else title

df['text_for_embedding'] = df.apply(prepare_text, axis=1)

# Preview
print("Sample texts:")
for i, text in enumerate(df['text_for_embedding'].head(3)):
    print(f"\n{i+1}. {text[:200]}...")

Sample texts:

1. What Are You Reading/Book Club Tuesday.  What you are reading, watching, or listening to? Or how far have you gotten in your chosen selection since last week?...

2. "WTF is Social Ecology?" by Usufruct Collective....

3. Who do you think is the most powerful/popular anarch-nihilist ever?. I am an anarcho-nihilist and i am reading similar type of books for a long time. Many of the authors i read, had really close conne...


## 3. Load Embedding Model

In [32]:
# Load Qwen3-Embedding-0.6B model
# ~600MB, 1024 dimensions, high quality multilingual embeddings
print("Loading Qwen3-Embedding-0.6B model...")
model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B', trust_remote_code=True)
print(f"Model loaded! Embedding dimension: {model.get_sentence_embedding_dimension()}")

Loading Qwen3-Embedding-0.6B model...


Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

Model loaded! Embedding dimension: 1024


## 4. Compute Embeddings (Parallelized)

In [33]:
# Compute embeddings in batches (faster than one-by-one)
# The model already uses GPU if available and handles batching internally

texts = df['text_for_embedding'].tolist()
batch_size = 32  # Process in batches for efficiency

print(f"Computing embeddings for {len(texts)} posts...")
print(f"Batch size: {batch_size}")

# Use model's built-in parallel processing
embeddings = model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    normalize_embeddings=True,  # L2 normalize for cosine similarity
    convert_to_numpy=True
)

print(f"\nEmbeddings computed! Shape: {embeddings.shape}")

Computing embeddings for 8799 posts...
Batch size: 32


Batches:   0%|          | 0/275 [00:00<?, ?it/s]


Embeddings computed! Shape: (8799, 1024)


## 5. Append Embeddings to DataFrame

In [34]:
# Add embeddings as a new column (as list for JSON serialization)
df['embedding'] = [emb.tolist() for emb in embeddings]

# Verify
print(f"Added 'embedding' column to dataframe")
print(f"Embedding dimension: {len(df['embedding'].iloc[0])}")
print(f"\nSample embedding (first 10 values):")
print(df['embedding'].iloc[0][:10])

Added 'embedding' column to dataframe
Embedding dimension: 1024

Sample embedding (first 10 values):
[-0.0159912109375, -0.040771484375, -0.003997802734375, -0.09130859375, 0.059814453125, -0.0283203125, 0.05126953125, 0.014404296875, -0.031494140625, -0.0260009765625]


## 6. Save Results

In [35]:
# Save to new JSONL file with embeddings
output_file = 'data_with_embeddings.jsonl'

with open(output_file, 'w') as f:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Saving"):
        # Convert row to dict and write as JSON line
        record = row.to_dict()
        f.write(json.dumps(record) + '\n')

print(f"\nSaved {len(df)} posts with embeddings to '{output_file}'")

Saving: 100%|██████████| 8799/8799 [00:02<00:00, 2949.44it/s]


Saved 8799 posts with embeddings to 'data_with_embeddings.jsonl'





## 7. Quick Validation - Similarity Search

In [36]:
# Quick test: Find similar posts to the first one
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix for first 100 posts
sample_embeddings = np.array(df['embedding'].head(100).tolist())
similarity_matrix = cosine_similarity(sample_embeddings)

# Find most similar posts to the first one
first_post_similarities = similarity_matrix[0]
most_similar_indices = np.argsort(first_post_similarities)[::-1][1:6]  # Top 5 (excluding itself)

print("First post:")
print(f"  Title: {df.iloc[0]['title'][:80]}...")
print(f"\nMost similar posts:")
for i, idx in enumerate(most_similar_indices, 1):
    print(f"\n{i}. (similarity: {first_post_similarities[idx]:.3f})")
    print(f"   Title: {df.iloc[idx]['title'][:80]}...")

First post:
  Title: What Are You Reading/Book Club Tuesday...

Most similar posts:

1. (similarity: 1.000)
   Title: What Are You Reading/Book Club Tuesday...

2. (similarity: 0.482)
   Title: Friday Free Talk...

3. (similarity: 0.412)
   Title: "WTF is Social Ecology?" by Usufruct Collective...

4. (similarity: 0.371)
   Title: HOW TO ORGANIZE YOUR COMMUNITY...

5. (similarity: 0.365)
   Title: Reading "1922: The Hong Kong strike"...


In [37]:
# Summary
print("="*50)
print("           EMBEDDING SUMMARY")
print("="*50)
print(f"Total posts processed: {len(df)}")
print(f"Model used: Qwen3-Embedding-0.6B")
print(f"Embedding dimension: {len(df['embedding'].iloc[0])}")
print(f"Output file: {output_file}")
print("="*50)

           EMBEDDING SUMMARY
Total posts processed: 8799
Model used: Qwen3-Embedding-0.6B
Embedding dimension: 1024
Output file: data_with_embeddings.jsonl
