In [75]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sentence_transformers import SentenceTransformer
import os
from sklearn.neighbors import NearestNeighbors
os.environ['TOKENIZERS_PARALLELISM'] = 'true'


In [76]:
# Notebook-wide settings: dataset identifier, embedding model name, and data locations.
datasetName = "debate2024_Jun_bluesky"
bertModelName = 'paraphrase-MiniLM-L6-v2'

dataPath = Path("Data")
networksPath = dataPath.joinpath("Networks")
# Ensure the Networks folder (and any missing parents) exists; no error if it already does.
networksPath.mkdir(exist_ok=True, parents=True)

In [77]:
# Read the dataset: a Feather file stored gzip-compressed under dataPath.
# The gzip stream is opened in binary mode and handed directly to pandas.
with gzip.open(dataPath / f"{datasetName}.feather.gz", 'rb') as f:
    df = pd.read_feather(f)

In [78]:
def get_unique_sentences_and_indices(sentences):
    """Deduplicate ``sentences`` while remembering where each item belongs.

    Args:
        sentences: Iterable of hashable items (e.g. strings).

    Returns:
        A ``(unique_sentences, sentences_indices)`` tuple where
        ``unique_sentences`` lists the distinct items in first-seen order and
        ``sentences_indices`` gives, for every input item, its position in
        ``unique_sentences``.
    """
    # Dicts keep insertion order, so the keys double as the unique list and
    # the stored value is the position each key received when first seen.
    first_seen = {}
    positions = [first_seen.setdefault(item, len(first_seen)) for item in sentences]
    return list(first_seen), positions


In [79]:
# Deduplicate the 'text' column: uniqueSentences holds each distinct text once
# (first-seen order) and sentenceIndices maps every row to its entry in that list.
uniqueSentences, sentenceIndices = get_unique_sentences_and_indices(df['text'])

In [80]:
# Number of distinct texts, i.e. how many sentences will be embedded.
len(uniqueSentences)

125324

In [81]:
# Instantiate the sentence-transformers model named by bertModelName.
model = SentenceTransformer(bertModelName)

In [82]:
# Embed every distinct sentence once, then persist the embeddings together with
# the sentences, the per-row indices and the model name in one compressed .npz.
sentence_embeddings = model.encode(uniqueSentences, show_progress_bar=True)
np.savez_compressed(
    dataPath.joinpath(f"{datasetName}_embeddings.npz"),
    **{
        "sentence_embeddings": sentence_embeddings,
        "uniqueSentences": uniqueSentences,
        "sentenceIndices": sentenceIndices,
        "modelName": bertModelName,
    },
)


Batches:   0%|          | 0/3917 [00:00<?, ?it/s]

In [83]:
# Reload the saved arrays from the .npz archive; each entry is read while the
# archive is still open.
# NOTE(review): values come back as NumPy arrays (modelName presumably as a
# 0-d string array) — confirm before using them as plain Python objects.
with np.load(dataPath / f"{datasetName}_embeddings.npz") as data:
    sentence_embeddings, uniqueSentences, sentenceIndices, modelName = (
        data[key]
        for key in ("sentence_embeddings", "uniqueSentences", "sentenceIndices", "modelName")
    )

In [84]:
# Fit a cosine-distance nearest-neighbour model on the sentence embeddings;
# numberNeighbors is the number of neighbours returned per query by default.
numberNeighbors = 5
nnModel = NearestNeighbors(
    metric='cosine',
    n_neighbors=numberNeighbors,
).fit(sentence_embeddings)

In [102]:
# Embed a free-text query and look up its closest sentences.
query = "Biden is a bad president"
query_embedding = model.encode(query)
# The embedding is wrapped in a list so it is passed as a batch of one query.
distances, indices = nnModel.kneighbors([query_embedding])

# Row 0 holds the results for the single query. Each hit is printed as the
# distance (two decimals), the matched sentence, and a blank separator line.
for index, distance in zip(indices[0], distances[0]):
    print(f"{distance:.2f}", uniqueSentences[index], "", sep="\n")

0.06
How is President Biden a bad candidate?

0.13
How is President Biden a shitty person?

0.14
Breaking News: Trump thinks Biden is a bad president and Biden thinks Trump was a bad president.

0.16
Biden is also bad but Trump is democracy annihilating bad

0.17
Biden is one of the most unpopular presidents in modern history and democrats just rolled over and accepted a deranged 81 year old as their guy despite the well documented fact that almost nobody wants Biden to run again.



In [86]:
# Invert sentenceIndices: entry k is the set of positions in sentenceIndices
# whose value is k, i.e. every occurrence of unique sentence k.
sentenceIndex2DataIndices = [set() for _ in range(len(uniqueSentences))]
for i, sentenceIndex in enumerate(sentenceIndices):
    sentenceIndex2DataIndices[sentenceIndex].add(i)
