# 4 - Network embeddings
In this notebook you will learn how to use network embeddings to represent posts according to their textual content. We will use Sentence-BERT (SBERT) to generate an embedding for each post and then use these embeddings to train a classifier to predict the target variable.

In [None]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sentence_transformers import SentenceTransformer
import os
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import emlens
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import umap
# Set the TOKENIZERS_PARALLELISM environment variable before any model is created.
# NOTE(review): presumably read by the tokenizer backend used by
# sentence-transformers to enable its internal parallelism — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'


Choose a model. Precomputed embeddings for `all-MiniLM-L12-v2` are already included in the dataset. You can also use any other model from the sentence-transformers library. You can find a list of models [here](https://www.sbert.net/docs/pretrained_models.html).

In [None]:
# --- Notebook configuration ---
# Sentence-transformers model to use; uncomment one of the alternatives to switch.
bertModelName = "all-MiniLM-L12-v2"
# bertModelName = "all-mpnet-base-v2"
# bertModelName = "dmlls/all-mpnet-base-v2-negation"

# Dataset identifier: file-name prefix of the dataset and embeddings files.
datasetName = "debate2024_Jun_mastodon"

# Folder layout: inputs live in Data/, Data/Networks/ is created if missing.
dataPath = Path("Data")
networksPath = dataPath.joinpath("Networks")
networksPath.mkdir(parents=True, exist_ok=True)


In [None]:
# Read the gzip-compressed Feather file of the dataset into a DataFrame.
datasetFile = dataPath / f"{datasetName}.feather.gz"
with gzip.open(datasetFile, mode='rb') as compressedFile:
    df = pd.read_feather(compressedFile)

We first extract the unique sentences, so that we don't generate multiple embeddings for the same sentence.

In [None]:
def get_unique_sentences_and_indices(sentences):
    """Deduplicate `sentences` while remembering where each one came from.

    Parameters
    ----------
    sentences : iterable of hashable
        The sentences, in their original order (duplicates allowed).

    Returns
    -------
    unique_sentences : list
        Distinct sentences in order of first appearance.
    sentences_indices : list of int
        One entry per input sentence: its position in `unique_sentences`.
    """
    # A dict keeps insertion order, so its keys double as the unique list and
    # its size at insertion time is the index assigned to a new sentence.
    first_position = {}
    positions = []
    for text in sentences:
        positions.append(first_position.setdefault(text, len(first_position)))

    return list(first_position), positions


This cell will generate and save the embeddings for all the posts in the dataset. You can skip this step if you don't have a GPU. The next cell will load the precomputed embeddings.

In [None]:
# Compute the sentence embeddings and cache them on disk.
# MAY TAKE SEVERAL MINUTES WITHOUT A GPU, so the computation is skipped
# automatically when the embeddings file for this dataset/model already exists.
# The loading cell that follows reads everything back from that file either way.
forceRecompute = False  # set to True to regenerate the file even if it exists
embeddingsPath = dataPath / f"{datasetName}_{bertModelName.replace('/','_')}_embeddings.npz"

if forceRecompute or not embeddingsPath.exists():
    model = SentenceTransformer(bertModelName)
    # Encode every distinct text only once; sentenceIndices maps each post
    # back to its row in sentence_embeddings.
    uniqueSentences, sentenceIndices = get_unique_sentences_and_indices(df['text'])
    sentence_embeddings = model.encode(uniqueSentences, show_progress_bar=True)
    # save as compressed numpy
    np.savez_compressed(embeddingsPath,
                        sentence_embeddings=sentence_embeddings,
                        uniqueSentences=uniqueSentences,
                        sentenceIndices=sentenceIndices,
                        modelName=bertModelName)
else:
    print(f"Found {embeddingsPath}; skipping the embedding computation.")


Load the embeddings from the existing file.

In [None]:
# Restore the cached arrays and the name of the model that produced them.
embeddingsFile = dataPath / f"{datasetName}_{bertModelName.replace('/','_')}_embeddings.npz"
with np.load(embeddingsFile) as data:
    sentence_embeddings, uniqueSentences, sentenceIndices = (
        data[key] for key in ("sentence_embeddings", "uniqueSentences", "sentenceIndices")
    )
    modelName = str(data["modelName"])

# Instantiate the model named in the file; emb() encodes new sentences with it.
model = SentenceTransformer(modelName)

Just a helper function in case we need to encode any more sentences with the model.

In [None]:
def emb(sentence):
    """Encode `sentence` with the notebook-level `model` and return the result.

    The argument is handed to ``model.encode`` unchanged; in this notebook it
    is called both with a single string and with a list of strings.
    """
    encoded = model.encode(sentence)
    return encoded

One advantage of embeddings is that we can calculate the similarity between sentences.

In [None]:
# Reference sentence (0) and three sentences to compare against it.
sentence0 = "Education is fundamental for the development of a country"
sentence1 = "A nations growth relies heavily on a strong educational foundation"
sentence2 = "My kids are not well-educated in this nation history"
sentence3 = "I'm cooking a delicious meal today"

# Embed the four sentences, in order.
embedding0, embedding1, embedding2, embedding3 = (
    emb(text) for text in (sentence0, sentence1, sentence2, sentence3)
)

# Cosine similarity of sentence 0 against each of the others.
# cosine_similarity works on 2-D inputs, hence the one-element lists.
for otherId, otherEmbedding in enumerate((embedding1, embedding2, embedding3), start=1):
    similarity = cosine_similarity([embedding0], [otherEmbedding])[0][0]
    print(f"Cosine-Similarity 0-{otherId}: {similarity:.4f}")

We can then find the most similar sentences to a given sentence.

In [None]:
# Index the sentence embeddings for nearest-neighbour look-ups using the cosine metric.
numberNeighbors = 5
nnModel = NearestNeighbors(metric='cosine', n_neighbors=numberNeighbors)
nnModel = nnModel.fit(sentence_embeddings)  # fit() hands back the fitted estimator

In [None]:
query = "Economy is the most important issue for the country"
# kneighbors takes a 2-D batch of queries, hence the one-element list.
distances, indices = nnModel.kneighbors([emb(query)])

# nnModel was built with metric='cosine', so kneighbors returns cosine
# *distances* (1 - cosine similarity). Convert back to a similarity so the
# number printed under the "Sim." label really is one (higher = closer).
for index, distance in zip(indices[0], distances[0]):
    print(f"Sim. {1 - distance:.2f}")
    print(uniqueSentences[index])
    print("")

Semantic axes can be created in this embedding by just giving a few examples. For instance, an axis for positive vs. negative sentiment.

In [None]:
# Positive vs. Negative Sentiment
# Example sentences defining the two poles of the semantic axis.
# group0: positive / optimistic statements (given label 0 when the axis is fitted).
group0_sentences = [
    "I am hopeful about the future",
    "The new policies are promising",
    "The debate was enlightening",
    "I support the new healthcare plan",
    "The economy is showing signs of recovery",
    "Education reforms are beneficial",
    "The climate initiative is a step in the right direction",
    "I am optimistic about the upcoming election",
    "The government is handling the crisis well",
    "Community efforts are making a difference",
]

# group1: negative / pessimistic statements (given label 1 when the axis is fitted).
group1_sentences = [
    "I am worried about the economy",
    "The policies are disappointing",
    "The debate was a disaster",
    "I oppose the new healthcare plan",
    "The government is failing us",
    "Education reforms are inadequate",
    "The climate initiative is insufficient",
    "I am pessimistic about the upcoming election",
    "The crisis management is poor",
    "Community efforts are not enough",
]

# # Policy vs. Personalities
# group0_sentences = [
#     "The economic plan will reduce unemployment",
#     "Healthcare reforms are essential for the country",
#     "Education funding needs to be increased",
#     "Climate change policies must be prioritized",
#     "Tax reforms will benefit the middle class",
#     "Infrastructure investment is necessary",
#     "Immigration policies should be humane",
#     "Defense spending needs to be rationalized",
#     "Social security reforms are overdue",
#     "Technology regulations should be updated",
# ]

# group1_sentences = [
#     "Biden is a compassionate leader",
#     "Trump is a strong and decisive president",
#     "Harris brings a fresh perspective",
#     "Pence is a steady and reliable vice president",
#     "Biden's empathy is his strength",
#     "Trump's charisma is unmatched",
#     "Harris is an inspiring figure",
#     "Pence's loyalty is commendable",
#     "Biden's experience is invaluable",
#     "Trump's resilience is noteworthy",
# ]

# # Before vs. After the Debate
# group0_sentences = [
#     "I am looking forward to the debate",
#     "The debate will be crucial for the election",
#     "I expect strong arguments from both sides",
#     "The debate will highlight key policies",
#     "The debate could sway undecided voters",
#     "It's important to hear both candidates",
#     "The debate will provide clarity on their positions",
#     "I hope the debate is civil and informative",
#     "The debate will be a significant event",
#     "The candidates need to perform well in the debate",
# ]

# group1_sentences = [
#     "The debate was informative",
#     "Biden performed well in the debate",
#     "Trump's debate strategy was effective",
#     "The debate did not change my opinion",
#     "The candidates addressed key issues",
#     "The debate highlighted their differences",
#     "It was a heated debate",
#     "The debate provided a clear comparison",
#     "I was impressed by the candidates' knowledge",
#     "The debate was a turning point in the election",
# ]


# Embed the example sentences of both groups.
group0_embeddings = emb(group0_sentences)
group1_embeddings = emb(group1_sentences)

# Stack the two groups row-wise and build the matching 0/1 label vector.
trainData = np.concatenate((group0_embeddings, group1_embeddings))
labels = np.repeat([0, 1], [len(group0_embeddings), len(group1_embeddings)])

# Fit the semantic axis on the labelled examples, then project every sentence
# embedding of the dataset with the fitted model.
semAxisModel = emlens.SemAxis()
semAxisModel.fit(trainData, labels)
projectedCoordinates = semAxisModel.transform(sentence_embeddings)

In [None]:
# Histogram of the sentences along the semantic axis, annotated with a handful
# of numbered example sentences spread across the axis range.
fig, ax = plt.subplots(figsize=(8, 3))
bins = 30

# Work on a flat array so a (n,) and a (n, 1) projection behave the same.
# NOTE(review): assumes one coordinate per sentence — confirm SemAxis output shape.
coords = np.asarray(projectedCoordinates).ravel()

p = ax.hist(coords, bins=bins, density=True, alpha=0.70)

# Only the shape of the distribution matters: hide the y axis and side spines.
ax.set_yticks([])
fig.patch.set_visible(False)
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(False)
ax.get_yaxis().set_visible(False)
ax.set_xlabel("SemAxis")

# Pick example sentences spread along the axis: walk the sentences in ascending
# order and keep the first one at or beyond each successive threshold
# (min, min + step, min + 2*step, ...).
breaks = 15
lowest = coords.min()  # hoisted: previously recomputed for every sentence
step = (coords.max() - lowest) / breaks
addedIndices = []
sortedIndices = np.argsort(coords, kind="stable")
for i in sortedIndices:
    if coords[i] >= lowest + len(addedIndices) * step:
        addedIndices.append(int(i))

# get_xaxis_transform(): x in data coordinates, y in axes fraction, so y=1.0
# places the markers on the top edge of the plot.
topEdge = ax.get_xaxis_transform()
for index, sentenceIndex in enumerate(addedIndices):
    xPos = coords[sentenceIndex]
    ax.scatter([xPos], [1.0], s=10, c="k",
               clip_on=False,
               transform=topEdge)
    textActor = ax.text(xPos, 1.0, str(index), fontsize=8,
                        rotation=0, rotation_mode='anchor',
                        transform=topEdge)
    # White outline keeps the number readable where it overlaps the marker.
    textActor.set_path_effects([pe.Stroke(linewidth=2, foreground='white'),
                                pe.Normal()])
    print(f"{index}: {uniqueSentences[sentenceIndex]}")

plt.show()