In [6]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer checkpoint, referenced by its model name.
model = SentenceTransformer(
    model_name_or_path="paraphrase-multilingual-MiniLM-L12-v2",
)

In [7]:
import pandas as pd

# Read the posts table from a CSV file located relative to the working directory.
df = pd.read_csv(filepath_or_buffer='posts.csv')

# Preview the first five rows (the notebook renders the last expression of a cell).
df.head(5)

Unnamed: 0,cid,text
0,bafyreihtrx5r4noevvlvv3ux67zxxx7z6q7njrnhcis2n...,I'm confused by the idea that any writer could...
1,bafyreibms5e7lahso3aixbm5swy2z64g3caqxn4dgmjs4...,"Oh wow, this is about the worst idea ever from..."
2,bafyreidbwiciurrrkzeblonewxnyl2y6hfstvy2wbaky5...,トライって太刀もハンマーも強かった気がするしスラアクもなんか凄かった気がするから何がいまいち...
3,bafyreiepb4osplv5oxitdkslvgrbxtb5dbp5ajy6b4duh...,evergreen
4,bafyreigmt7cvknjnfaq7xfakmqg7y5mnch2jnf63cy4w4...,starr you’re cooking!! i’m on board with this


In [9]:
# Encode the posts

# Replace missing values with empty strings so every entry handed to the encoder is text.
df['text'] = df['text'].fillna('')

# Hand the encoder a plain Python list instead of the Series: integer lookups on a
# Series are label-based, so a filtered or re-indexed frame would otherwise be
# encoded incorrectly or fail with a KeyError.
embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 61/61 [00:02<00:00, 27.12it/s]


In [23]:
# Dimensions of the array returned by model.encode for the posts column.
embeddings.shape

(1946, 384)

In [20]:
# Search for similar posts

from sklearn.metrics.pairwise import cosine_similarity

# Rank every post by cosine similarity to a free-text query and print the best matches.

query = "go vote"

# The single query vector is wrapped in a list so cosine_similarity receives a
# 2-D (1 x dim) input to compare against the (n_posts x dim) embedding matrix.
query_embedding = model.encode(query)

cosine_scores = cosine_similarity([query_embedding], embeddings)

# Get the most similar posts

n = 10

# Positions (not index labels) of the n highest scores, best match first.
top_positions = cosine_scores[0].argsort()[-n:][::-1]

most_similar_posts = df.iloc[top_positions]

# Look scores up by position: iterrows() yields index *labels*, which only coincide
# with positions in the score array when df carries the default 0..n-1 index.
for position, (_, row) in zip(top_positions, most_similar_posts.iterrows()):
    print(f"Post: {row['text']}")
    print(f"Similarity: {cosine_scores[0][position]}")
    print()

Post: Vote damn it vote... from a perpetually red state where mu vote doesn't matter... I voted... get the fuck out there
Similarity: 0.6547659635543823

Post: Elections huh?
Similarity: 0.6344351768493652

Post: Alright, I hope it helps and I hope the election goes right 🫂🫂🫂
Similarity: 0.6287328004837036

Post: Drove several people to the polls and convinced some who weren't going to vote to go anyway. I was feeling great earlier about that when I took these pics. Now it's time to wait for the voting results and I'm a little nervous.

#nsfw #vote
Similarity: 0.6257947683334351

Post: Oh, I know. But I've never counted votes before and saw how they don't know how to fill out a ballot without invalidating their goddamn choice.
Similarity: 0.6141208410263062

Post: Voting is so cool and based and hot and sexy so you should do it

Unless you’re a trump supporter in which case you should die-
Similarity: 0.6119887828826904

Post: Imagine not voting for Avel
Similarity: 0.6048896312713623


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class DenoisingAutoencoder(nn.Module):
    """Fully connected denoising autoencoder for fixed-size embedding vectors.

    The encoder maps ``input_dim`` features through ``hidden_dims`` down to
    ``latent_dim``; the decoder mirrors it back up to ``input_dim``. With the
    default arguments the architecture is 384 -> 256 -> 128 -> 64 -> 128 ->
    256 -> 384, identical (including ``state_dict`` keys) to the previously
    hard-coded network.

    Args:
        input_dim: Size of each input vector. Must match the dimensionality of
            the embeddings fed to the model (384 by default).
        hidden_dims: Widths of the intermediate encoder layers; the decoder uses
            them in reverse order.
        latent_dim: Size of the bottleneck representation.
        noise_std: Standard deviation of the Gaussian noise added to the input
            while the module is in training mode. ``0`` disables the noise.
    """

    def __init__(self, input_dim=384, hidden_dims=(256, 128), latent_dim=64, noise_std=0.1):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.noise_std = noise_std

        # Layer widths from input to bottleneck, e.g. [384, 256, 128, 64].
        widths = [input_dim, *hidden_dims, latent_dim]

        encoder_layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            encoder_layers += [nn.Linear(in_features, out_features), nn.ReLU(True)]
        self.encoder = nn.Sequential(*encoder_layers)

        mirrored = widths[::-1]
        decoder_layers = []
        for in_features, out_features in zip(mirrored[:-1], mirrored[1:]):
            decoder_layers += [nn.Linear(in_features, out_features), nn.ReLU(True)]
        # The final activation is Tanh instead of ReLU, bounding the reconstruction
        # to (-1, 1). NOTE(review): input components outside that range cannot be
        # reconstructed exactly - confirm this matches the embedding value range.
        decoder_layers[-1] = nn.Tanh()
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for a batch of vectors.

        In training mode Gaussian noise with standard deviation ``noise_std`` is
        added to a copy of the input before encoding; the caller's tensor is not
        modified, so it can still serve as the clean reconstruction target. In
        eval mode the input is encoded as-is.
        """
        # Add noise during training only
        if self.training and self.noise_std > 0:
            x = x + torch.randn_like(x) * self.noise_std

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

class PostEmbeddingDataset(Dataset):
    """Dataset whose items are precomputed sentence embeddings of the given posts.

    All posts are encoded once, in the constructor, by calling
    ``sbert_model.encode(..., convert_to_tensor=True)``; indexing the dataset
    then returns the matching row of that result.

    Args:
        posts: A single string or any iterable of strings (list, tuple,
            pandas Series, ...).
        sbert_model: Object exposing ``encode(sentences, convert_to_tensor=True)``.
    """

    def __init__(self, posts, sbert_model):
        self.posts = posts
        self.sbert_model = sbert_model

        # Normalise to a plain list before encoding: containers such as a pandas
        # Series resolve integer lookups by label, which misbehaves when the index
        # is not 0..n-1. A lone string is wrapped so it counts as one post.
        texts = [posts] if isinstance(posts, str) else list(posts)
        self.embeddings = self.sbert_model.encode(texts, convert_to_tensor=True)

    def __len__(self):
        # Count the encoded rows so the length always agrees with what
        # __getitem__ can actually return.
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx]

def train_autoencoder(posts, sbert_model, num_epochs=10, batch_size=32, learning_rate=1e-4):
    """Train a DenoisingAutoencoder to reconstruct the sentence embeddings of ``posts``.

    Args:
        posts: Texts to embed and train on (forwarded to PostEmbeddingDataset).
        sbert_model: Sentence encoder used to embed the posts.
        num_epochs: Number of passes over the embedded posts.
        batch_size: Mini-batch size for the DataLoader.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The trained autoencoder, placed on CUDA when available (otherwise CPU)
        and switched to eval mode so later forward passes do not add noise.

    Raises:
        ValueError: If ``posts`` yields no embeddings to train on.
    """
    # Initialize models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    autoencoder = DenoisingAutoencoder().to(device)

    # Create dataset and dataloader
    dataset = PostEmbeddingDataset(posts, sbert_model)
    if len(dataset) == 0:
        # Fail early with a clear message instead of a later sampler or
        # division-by-zero error.
        raise ValueError("Cannot train the autoencoder on an empty collection of posts.")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        # Be explicit about the mode: the noise injection in forward() depends on it.
        autoencoder.train()
        total_loss = 0.0
        for batch in dataloader:
            batch = batch.to(device)

            # Forward pass: the model corrupts its own copy of the input, so the
            # clean batch serves as the reconstruction target.
            reconstructed, _ = autoencoder(batch)
            loss = criterion(reconstructed, batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")

    # Hand back an inference-ready model; leaving it in training mode would make
    # every later encoding noisy and non-deterministic.
    autoencoder.eval()
    return autoencoder

class EnhancedEmbedder:
    """Sentence encoder followed by the bottleneck of a trained autoencoder.

    Args:
        sbert_model: Object exposing ``encode(texts, convert_to_tensor=True)``.
        autoencoder: ``torch.nn.Module`` whose forward pass returns
            ``(reconstruction, latent_code)``. Its parameters determine the
            device on which the embeddings are processed.
    """

    def __init__(self, sbert_model, autoencoder):
        self.sbert = sbert_model
        self.autoencoder = autoencoder
        self.device = next(autoencoder.parameters()).device

    def encode(self, texts):
        """Return the autoencoder's latent codes for ``texts`` as a tensor.

        The autoencoder is run in eval mode (and without gradient tracking) so
        that training-only behaviour such as noise injection cannot leak into
        the embeddings; its previous mode is restored afterwards.

        Args:
            texts: A single string or an iterable of strings.

        Returns:
            Tensor of latent codes located on ``self.device``.
        """
        # Give the sentence encoder a plain list; label-indexed containers such as
        # a pandas Series are not safe to index positionally.
        if not isinstance(texts, str):
            texts = list(texts)

        was_training = self.autoencoder.training
        self.autoencoder.eval()
        try:
            with torch.no_grad():
                # Get SBERT embeddings
                embeddings = self.sbert.encode(texts, convert_to_tensor=True).to(self.device)
                # Keep only the encoder (bottleneck) output
                _, enhanced = self.autoencoder(embeddings)
        finally:
            # Leave the model in whatever mode the caller had it in.
            self.autoencoder.train(was_training)
        return enhanced

# Example usage:
def create_enhanced_embedder(posts, sbert_model=None, **train_kwargs):
    """Train an autoencoder on ``posts`` and wrap it in an EnhancedEmbedder.

    Args:
        posts: Texts used to train the autoencoder.
        sbert_model: Optional, already loaded sentence encoder to reuse. When
            omitted, "paraphrase-multilingual-MiniLM-L12-v2" is loaded.
        **train_kwargs: Extra keyword arguments forwarded to train_autoencoder
            (``num_epochs``, ``batch_size``, ``learning_rate``).

    Returns:
        EnhancedEmbedder combining the sentence encoder and the trained autoencoder.
    """
    # Reuse the caller's encoder when given; loading a second copy of the same
    # checkpoint costs time and memory for no benefit.
    if sbert_model is None:
        sbert_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    # Train autoencoder
    autoencoder = train_autoencoder(posts, sbert_model, **train_kwargs)

    # Create enhanced embedder
    return EnhancedEmbedder(sbert_model, autoencoder)

            # Create enhanced embedder
enhanced_embedder = create_enhanced_embedder(df['text'])

# Encode the posts
enhanced_embeddings = enhanced_embedder.encode(df['text'])

# Calculate the cosine similarity between the query post and all posts in the dataset
query = "go vote"

query_embedding = enhanced_embedder.encode([query])

cosine_scores = cosine_similarity(query_embedding, enhanced_embeddings)

# Get the most similar posts

n = 10

most_similar_posts = df.iloc[cosine_scores[0].argsort()[-n:][::-1]]

for i, row in most_similar_posts.iterrows():
    print(f"Post: {row['text']}")
    print(f"Similarity: {cosine_scores[0][i]}")
    print()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x384 and 768x512)