# 5 - Visualization
In this part of the tutorial, you will learn how to visualize the networks and embeddings produced so far. We will use projection techniques and Helios-web.


In [None]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from pathlib import Path
import gzip
from collections import Counter
from sentence_transformers import SentenceTransformer
import os
from sklearn.neighbors import NearestNeighbors
import emlens
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import umap
from sklearn.decomposition import PCA
import xnetwork as xn
import igraph as ig


First, let's load an embedding

In [None]:
# Names used to build the input/output file names in the cells below.
datasetName = "debate2024_Jun_bluesky"

# Embedding model name. Alternatives that can be swapped in:
#   "all-mpnet-base-v2"
#   "dmlls/all-mpnet-base-v2-negation"
bertModelName = "all-MiniLM-L12-v2"

# Data/ holds the inputs; Data/Networks/ is created on demand for the outputs.
dataPath = Path("Data")
networksPath = dataPath.joinpath("Networks")
networksPath.mkdir(parents=True, exist_ok=True)


In [None]:
# The dataset is a gzip-compressed Feather file; it is decompressed while reading.
datasetFile = dataPath / f"{datasetName}.feather.gz"
with gzip.open(datasetFile, mode='rb') as compressedFeather:
    df = pd.read_feather(compressedFeather)

In [None]:
# Load the saved embeddings for the selected dataset/model pair.
# Any "/" in the model name is replaced by "_" to form the file name.
embeddingsFileName = f"{datasetName}_{bertModelName.replace('/','_')}_embeddings.npz"
with np.load(dataPath / embeddingsFileName) as data:
    modelName = str(data["modelName"])
    sentenceIndices = data["sentenceIndices"]
    uniqueSentences = data["uniqueSentences"]
    sentence_embeddings = data["sentence_embeddings"]

# Instantiate the model whose name was stored alongside the embeddings.
model = SentenceTransformer(modelName)


def emb(sentence):
    """Encode `sentence` with the loaded model and return the result."""
    return model.encode(sentence)

We can generate a network based on the nearest neighbors of each post.

In [None]:
# OPTIONAL
# Build a nearest-neighbor network over the sentence embeddings (cosine distance).
# `edges` holds (source, neighbor) index pairs and `weights` the matching
# similarities (1 - cosine distance); only pairs closer than 0.5 are kept.
numberNeighbors = 3
nnModel = NearestNeighbors(n_neighbors=numberNeighbors, metric='cosine').fit(sentence_embeddings)
edges = []
weights = []
chunkSize = 1000
numberSentences = len(sentence_embeddings)
# The range stops at numberSentences (not numberSentences+1): otherwise, when the
# number of embeddings is a multiple of chunkSize, an empty trailing chunk is
# passed to kneighbors, which rejects arrays with zero samples.
for chunkStart in tqdm(range(0, numberSentences, chunkSize)):
    chunk = slice(chunkStart, min(chunkStart + chunkSize, numberSentences))
    chunkDistances, chunkIndices = nnModel.kneighbors(sentence_embeddings[chunk])
    # Distinct names per nesting level; the arrays being iterated are not shadowed.
    for offset, (rowDistances, rowIndices) in enumerate(zip(chunkDistances, chunkIndices)):
        sourceIndex = chunk.start + offset
        for distance, index in zip(rowDistances, rowIndices):
            # The query points are the fitted points, so each sentence is
            # returned as one of its own neighbors; skip it to avoid self-loops.
            if index == sourceIndex:
                continue
            if distance < 0.5:
                edges.append((sourceIndex, index))
                weights.append(1-distance)


Helper lookup (a list of sets, one per unique sentence) in case we need to convert from sentence indices back to the original indices in the dataset.

In [None]:
# One set per unique sentence, filled with the positions in `sentenceIndices`
# that refer to that sentence.
sentenceIndex2DataIndices = [set() for _ in range(len(uniqueSentences))]
for dataIndex, uniqueIndex in enumerate(sentenceIndices):
    sentenceIndex2DataIndices[uniqueIndex].add(dataIndex)

Applying UMAP for projection

In [None]:
# Project the embeddings to `dimension` coordinates with UMAP (cosine metric).
UMAPNeighbors = 15
dimension = 2
umapModel = umap.UMAP(
    n_components=dimension,
    n_neighbors=UMAPNeighbors,
    metric='cosine',
    n_epochs=200,
    verbose=True,
)
umapCoordinates = umapModel.fit_transform(sentence_embeddings)

Converting everything to a network format

In [None]:
# Assemble the graph: one vertex per embedded sentence, plus the edges
# (and weights, when there are any) collected earlier.
g = ig.Graph(sentence_embeddings.shape[0], edges=edges, directed=False)
if weights:
    g.es["weight"] = weights
g.vs["Label"] = uniqueSentences

# Scaled UMAP layout. `positions` (stored on the graph) and `allPositions`
# (used to fit the centering/rotation) start out as equal, independent arrays.
allPositions = umapCoordinates * 100
positions = allPositions.copy()

# Remove all nodes that are too far from the median position: keep those whose
# distance is below 1.1 times the 99th percentile of all distances.
medianPosition = np.median(allPositions, axis=0)
distances = np.linalg.norm(allPositions - medianPosition, axis=1)
maxDistance = np.percentile(distances, 99)
allMainIndices = [vIndex for vIndex, distance in enumerate(distances) if distance < maxDistance*1.1]
# The graph has one vertex per layout row, so the same indices select its vertices.
mainIndices = list(allMainIndices)
g = g.subgraph(mainIndices)
positions = positions[mainIndices, :]
allPositions = allPositions[allMainIndices, :]

# Center both layouts on the mean of the retained points.
meanPosition = np.mean(allPositions, axis=0)
allPositions -= meanPosition
positions -= meanPosition

# Align the layout with its principal axes using sklearn's PCA.
pca = PCA(n_components=dimension).fit(allPositions)
allPositions = pca.transform(allPositions)
positions = pca.transform(positions)

# Recenter on the midpoint between the extremes along each PCA axis.
minPositions = allPositions.min(axis=0)
maxPositions = allPositions.max(axis=0)
positions = positions - (minPositions + maxPositions) / 2

# To swap the x and y axes, uncomment:
# positions = positions[:,[1,0]]
g.vs["Position"] = positions

# Write the network in xnet format next to the other generated networks.
networkFileName = f"{datasetName}_{bertModelName.replace('/','_')}_umap.xnet"
xn.save(g, networksPath / networkFileName)

You can now visualize the network with Helios-web: download the generated `.xnet` file and drag it into the browser window at this address:

http://heliosweb.io/docs/example/?advanced&layout=0&use2d&size=1&density