<a href="https://colab.research.google.com/github/orikopel/textnet/blob/main/text_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q sentence-transformers
!pip install -q networkx
!pip install -q pyvis

In [67]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx as nx
from collections import Counter
import matplotlib.pyplot as plt
from pyvis.network import Network
from nltk.corpus import stopwords
from itertools import combinations
from nltk.tokenize import word_tokenize
from concurrent.futures import ThreadPoolExecutor, as_completed
from sentence_transformers import SentenceTransformer
from networkx.algorithms.community import girvan_newman

In [3]:
# Enable tqdm's pandas integration (registers `progress_apply` and related
# methods on pandas objects).
# NOTE(review): no `progress_apply` call appears in the cells visible here -
# confirm this registration is still needed.
tqdm.pandas()

In [41]:
# NLTK resources needed by extract_common_keywords: tokenizer data for
# word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from the separate 'punkt_tab'
# package; without it word_tokenize raises LookupError. On older releases that
# do not know this id, download() only reports the problem and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

---

In [68]:
def validate_data(df, id_col, title_col, text_col):
    """
    Validates and cleans the input data.

    Steps, in order:
      1. keep only rows whose text is a non-empty string and whose id is present;
      2. strip every character that is not an ASCII letter or whitespace from the text;
      3. convert the ids to strings;
      4. drop rows whose id repeats an earlier row (the first occurrence is kept).

    Args:
        df(DataFrame): raw input frame. It is not modified.
        id_col(String): name of the id column.
        title_col(String): name of the title column (accepted for a uniform
            signature; this function does not read it).
        text_col(String): name of the text column.

    Returns:
        df(DataFrame): a new, cleaned frame. Original index labels are kept.
    """

    # filter out rows without texts
    has_text = df[text_col].apply(lambda x: isinstance(x, str) and len(x) > 0)

    # Work on an explicit copy: assigning columns on a filtered slice triggers
    # SettingWithCopyWarning and may write to a temporary object.
    cleaned = df[has_text].dropna(subset=[id_col]).copy()

    # get rid of non-letter chars
    # NOTE: the pattern is ASCII-only, so accented and non-Latin letters are removed too.
    cleaned[text_col] = cleaned[text_col].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

    # convert ids to string if needed
    cleaned[id_col] = cleaned[id_col].astype(str)

    # drop duplicates by id (compared as strings, because that is what the
    # graph uses as node keys); copy so callers can add columns safely
    return cleaned.drop_duplicates(subset=[id_col], keep="first").copy()


In [69]:
def batch_encode(texts, model, batch_size=32):
    """
    Encode a sequence of texts chunk by chunk and flatten the results.

    Args:
        texts (list): Texts to encode; consumed in consecutive slices.
        model (SentenceTransformer): Object exposing ``encode(batch, show_progress_bar=...)``.
        batch_size (int): Number of texts handed to ``model.encode`` per call.

    Returns:
        List with one embedding per input text, in input order.
    """
    # One encode() call per slice; the inner loop flattens each batch result.
    return [
        vector
        for start in range(0, len(texts), batch_size)
        for vector in model.encode(texts[start:start + batch_size], show_progress_bar=False)
    ]

In [70]:
def generate_embeddings(df, text_col, model, batch_size=32, num_threads=4):
    """
    Gets a df and a column name and returns a df with an embedding column.

    The texts are split into ``num_threads`` contiguous chunks that are encoded
    concurrently with ``batch_encode``. Results are reassembled in the original
    row order regardless of which chunk finishes first.

    Args:
        df(DataFrame): df with text column, should also have an id and title column.
            The 'embedding' column is added to this frame in place.
        text_col(String): name of the text column.
        model(SentenceTransformer): SentenceTransformer model.
        batch_size(int): Number of texts to encode in one batch for efficiency. Default is 32.
        num_threads(int): Number of chunks / worker threads. Default is 4.

    Returns:
        df(DataFrame): the same df, now with an 'embedding' column (one list per row).
    """

    # Prepare text data as an array to avoid pandas row overhead
    texts = df[text_col].values

    # Split data for multithreading (contiguous chunks, in row order)
    splits = np.array_split(texts, num_threads)

    # Each future remembers the position of its chunk. as_completed() yields
    # futures in completion order, so results must be slotted back by position
    # or embeddings would be attached to the wrong rows.
    ordered_parts = [None] * len(splits)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        position_of = {
            executor.submit(batch_encode, split, model, batch_size): position
            for position, split in enumerate(splits)
        }

        for future in tqdm(as_completed(position_of), total=len(position_of), desc="Generating embeddings in parallel"):
            ordered_parts[position_of[future]] = future.result()

    # Flatten the per-chunk results back into one row-aligned list
    embeddings = [vector for part in ordered_parts for vector in part]

    # Convert list of embeddings to a NumPy array and add to dataframe
    df['embedding'] = np.array(embeddings).tolist()

    return df

In [None]:
def add_edges(G, data, i, similarity_matrix, threshold, id_col):
    """
    Collects the edges between row ``i`` and every later row whose similarity
    exceeds ``threshold``.

    Despite the name, nothing is added to ``G`` here; the caller adds the
    returned edges to the graph.

    Args:
        G(Graph): unused; kept so existing call sites keep working.
        data(DataFrame): frame holding the id column; rows are addressed by position.
        i(int): position of the source row.
        similarity_matrix(ndarray): pairwise scores, indexed by row position.
        threshold(Float): an edge is created only when the score is strictly greater.
        id_col(String): name of the id column.

    Returns:
        list of ``(source_id, target_id, {"score": float})`` tuples. The third
        element is an attribute dict, which is the form ``Graph.add_edges_from``
        requires for 3-tuples and the key ``G_to_net`` reads.
    """
    # Fetch the id column once instead of materialising a row per pair.
    ids = data[id_col].to_numpy()
    return [
        (ids[i], ids[j], {"score": float(similarity_matrix[i, j])})
        for j in range(i + 1, len(data))
        if similarity_matrix[i, j] > threshold
    ]

In [74]:
def create_similarity_nx(data, id_col, model, threshold):
    """
    Creates a graph data structure with text similarity as edge score.

    Cosine similarity is computed for every pair of rows from the 'embedding'
    column; pairs scoring above ``threshold`` become edges carrying a 'score'
    attribute. Rows without any such pair do not appear in the graph.

    Args:
        data(DataFrame): df with an embedding column.
        id_col(String): name of the id column.
        model(SentenceTransformer): unused here (embeddings are read from
            ``data``); kept so existing call sites keep working.
        threshold(Float): minimum (exclusive) similarity for an edge.

    Returns:
        G(Graph): graph data structure with text similarity as edge score.
    """

    G = nx.Graph()

    # Nothing to compare: an empty frame yields a 1-D array below, for which
    # norm(axis=1) raises, so return the empty graph right away.
    if len(data) == 0:
        return G

    # Convert embeddings to a NumPy array
    embeddings = np.array(data["embedding"].tolist())

    # Calculate pairwise similarities using a dot product and normalize
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    similarity_matrix = np.dot(embeddings, embeddings.T) / (norms * norms.T)

    # Use ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor() as executor:
        futures = []
        for i in tqdm(range(len(data)), desc="Finding edges"):
            futures.append(executor.submit(add_edges, G, data, i, similarity_matrix, threshold, id_col))

    # Collect results and add edges to the graph. The score may arrive as a
    # bare number or wrapped in an attribute dict; either way it is stored
    # under 'score'. Passing a bare number as the third tuple element to
    # add_edges_from would raise TypeError.
    for future in tqdm(futures, desc="Adding edges to graph"):
        for source, target, payload in future.result():
            score = payload["score"] if isinstance(payload, dict) else payload
            G.add_edge(source, target, score=float(score))

    return G

In [42]:
def extract_common_keywords(titles):
    """
    Summarise a group of titles by their three most frequent meaningful words.

    Args:
        titles (List[str]): Titles to scan.
    Returns:
        common_keywords (str): Up to three lower-cased words joined by ', ',
        most frequent first; empty string when nothing qualifies.
    """
    english_stopwords = set(stopwords.words('english'))

    # Count lower-cased alphanumeric tokens that are not stopwords
    tally = Counter()
    for title in titles:
        for token in word_tokenize(title):
            if not token.isalnum():
                continue
            lowered = token.lower()
            if lowered in english_stopwords:
                continue
            tally[lowered] += 1

    # Top 3 words, rendered as a single string
    return ', '.join(keyword for keyword, _ in tally.most_common(3))

In [61]:
def G_to_net(G, id_col, title_col, text_col, data):
    """
    Creates a Pyvis network from a graph data structure.

    Communities are taken from the first Girvan-Newman split. Each community
    gets a large translucent hub node, labelled with its most common keywords,
    and invisible edges from the hub to its members. The similarity edges of
    ``G`` are then drawn on top.

    Args:
        G(Graph): graph data structure with text similarity as edge score
            (every edge carries a 'score' attribute).
        id_col(String): name of the id column.
        title_col(String): name of the title column (node labels).
        text_col(String): name of the text column (source of community keywords).
        data(DataFrame): df holding the id, title and text columns.

    Returns:
        net(Network): Pyvis network.
    """

    # find communities
    comp = girvan_newman(G)
    communities = next(comp)

    # plt.cm.get_cmap is deprecated and removed in newer Matplotlib releases;
    # plt.get_cmap takes the same (name, lut) arguments. max(..., 1) keeps the
    # lookup-table size valid when there are no communities at all.
    colors = plt.get_cmap("tab10", max(len(communities), 1))

    def _rgba(idx, alpha):
        """CSS rgba() string for community `idx` at the given opacity."""
        red, green, blue = (channel * 255 for channel in colors(idx)[:3])
        return f'rgba({red}, {green}, {blue}, {alpha})'

    # Build id -> title / text lookups once instead of scanning the whole
    # frame for every node. setdefault keeps the first row for a repeated id.
    title_of, text_of = {}, {}
    for node_id, title, text in zip(data[id_col], data[title_col], data[text_col]):
        title_of.setdefault(node_id, title)
        text_of.setdefault(node_id, text)

    # Create a Pyvis network
    net = Network(notebook=True)

    # add nodes to pyvis
    for idx, community in tqdm(enumerate(communities), total=len(communities),
                               desc="Building pyvis graph by communities - nodes"):

        # Texts of this community, read from text_col (the original looked up
        # title_col twice and never used text_col)
        texts = [text_of[node] for node in community]

        # Create a community label from the most common keywords
        community_node_id = f'community_{idx}'
        community_title = community_node_id + " - " + extract_common_keywords(texts)

        # Add a community node (outer circle)
        net.add_node(community_node_id, label=community_title, title=community_title,
                     color=_rgba(idx, 0.2),
                     size=40)  # Larger size for visibility

        # Connect community node to its members
        for node in community:
            title_value = str(title_of[node])

            # deal with long node titles
            if len(title_value) > 35:
                title_value = title_value[:35] + "..."

            # Pass the colour directly rather than patching net.nodes[-1],
            # which would hit the wrong node if add_node skipped an existing id.
            net.add_node(node, title=title_value, label=title_value, color=_rgba(idx, 0.7))

            net.add_edge(community_node_id, node, color='rgba(0, 0, 0, 0)', value=0)  # Invisible edges for connection

    # add edges to pyvis
    for u, v, val in tqdm(G.edges(data=True), desc="Building pyvis graph by communities - edges"):
        score = val['score']
        # Scale scores for edge visibility; the label is sent as a short string
        # (vis.js labels are expected to be strings - a raw float may not render).
        net.add_edge(u, v, value=score * 10, label=f"{score:.2f}")

    net.set_options("""
        var options = {
        "nodes": {
            "font": {
            "size": 14
            }
        },
        "edges": {
            "smooth": {
            "type": "continuous"
            }
        },
        "physics": {
            "enabled": true
        }
        }
        """)

    # Return the network; the caller decides how to render or save it
    return net

In [71]:
def create_with_header(save_path, net, html_header):
    """
    Creates an html page with a header and a network visualization.

    The file receives, in order: ``html_header``, the markup returned by
    ``net.generate_html()``, and a closing ``</body></html>``.

    Args:
        save_path(String): path to save the html page (overwritten if it exists).
        net(Network): Pyvis network.
        html_header(String): html header.
    """

    # Write as UTF-8 explicitly: the platform default encoding can fail on
    # non-ASCII text, and the page is read back as UTF-8 later in the notebook.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(html_header)
        f.write(net.generate_html())  # Include the network visualization
        f.write("</body></html>")  # Close the HTML tags


---

# Run Code

In [72]:
def df2net(data_path, save_path, id_col, title_col, text_col, html_header, threshold, lang,
           max_rows=500, encoding='latin1'):
    """
    Creates a network visualization from a dataframe. Combines all above functions.

    Pipeline: load model -> read CSV -> validate -> embed -> similarity graph
    -> Pyvis network -> html page. A progress line is printed after each stage.

    Args:
        data_path(String): path to the csv file.
        save_path(String): path to save the html page.
        id_col(String): column name of the unique identifier.
        title_col(String): column name of the title.
        text_col(String): column name of the text.
        html_header(String): header of the html page.
        threshold(Float): threshold for the edge score.
        lang(String): language of the text, "eng" or "heb". Used for choosing the right model.
        max_rows(int): number of leading rows to read; ``None`` reads the whole
            file. Default is 500 (the previously hard-coded limit).
        encoding(String): csv file encoding. Default is 'latin1' (the previously
            hard-coded value); pass e.g. 'utf-8' for non-Latin text.

    Raises:
        KeyError: if ``lang`` is not one of the supported languages.
    """

    # dict for choosing a model for each optional language
    language_models = {"eng": "all-MiniLM-L6-v2", "heb": "imvladikon/sentence-transformers-alephbert"}

    # Fail early, before any model download, with a message listing the options
    if lang not in language_models:
        raise KeyError(f"Unsupported lang {lang!r}; expected one of {sorted(language_models)}")

    # choose the right model for user input language
    model = SentenceTransformer(language_models[lang])
    print(f"1 - Selected model: {language_models[lang]}")

    # read the dataset (only the first max_rows rows) and create embedding column
    data = pd.read_csv(data_path, encoding=encoding, nrows=max_rows)
    data = validate_data(data, id_col, title_col, text_col)
    data = generate_embeddings(data, text_col, model)
    print("2 - Created embeddings for the data")

    # create a graph data structure with text similarity as edge score
    G = create_similarity_nx(data, id_col, model, threshold)
    print("3 - Created graph data structure with text similarity as edge score")

    # convert the nx graph to a net graph
    net = G_to_net(G, id_col, title_col, text_col, data)
    print("4 - Created pyvis network")

    # save the graph as an html page
    create_with_header(save_path, net, html_header)
    print("5 - Saved the graph as an html page")

In [73]:
# Build the similarity network for the McDonald's reviews and save it as
# mcdonalds.html. The "review" column serves as both title and text column;
# 0.7 is the edge-score threshold and "eng" selects the English model.
# NOTE(review): `html_header` is assigned in the "GUI" section further down
# this notebook, so on a fresh kernel (Restart & Run All) this call raises
# NameError unless that cell has been run first.
df2net("McDonald_s_Reviews.csv", "mcdonalds.html", "reviewer_id", "review", "review", html_header, 0.7, "eng")



1 - Selected model: all-MiniLM-L6-v2


Generating embeddings in parallel: 100%|██████████| 4/4 [00:21<00:00,  5.28s/it]


2 - Created embeddings for the data


Creating nx graph: 100%|██████████| 500/500 [00:00<00:00, 3286.05it/s]
  colors = plt.cm.get_cmap("tab10", len(communities))


3 - Created graph data structure with text similarity as edge score


Building pyvis graph by communities - nodes: 16it [00:00, 51.08it/s]
Building pyvis graph by communities - edges: 100%|██████████| 165/165 [00:00<00:00, 15906.87it/s]

4 - Created pyvis network
5 - Saved the graph as an html page





---

# GUI

In [17]:
# Opening part of the output page: document head, styles and intro text.
# <html> and <body> are intentionally left open; create_with_header appends
# the network markup and the closing tags.
html_header = """
<!DOCTYPE html>
<html>
<head>
    <title>Similarity Relations in Mcdonalds Reviews</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
        }
        .container {
            display: flex;
            flex-direction: row; /* Arrange children in a row */
        }
        .item {
            margin-right: 20px; /* Spacing between items */
        }
    </style>
    <style>
        h1 {
            text-align: center;
            font-size: 36px;
            margin: 20px 0;
        }
        p {
            text-align: center;
            font-size: 18px;
            margin: 0 0 20px 0;
        }
    </style>
</head>
<body>
    <h1>The Data</h1>
    <p>Over 33K McDonalds Reviews</p>
    <p></p>
    <h1>Similarity</h1>
    <p>The similarity was calculated using SBERT embeddings and cosine similarity</p>
"""

In [80]:
!pip install streamlit

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [81]:
import streamlit as st
import streamlit.components.v1 as components

In [78]:
# Minimal Streamlit page that embeds the saved Pyvis HTML.
# NOTE(review): the output recorded for this cell shows Streamlit's
# "streamlit run ..." hint and a bare DeltaGenerator(), i.e. nothing is served
# when these calls run inside the notebook kernel. Presumably this code is
# meant to live in its own script started with `streamlit run` - confirm.
st.title("Pyvis Graph with Streamlit")

# Path of the Pyvis HTML page. It is not generated here; the name matches the
# save_path passed to df2net earlier in the notebook.
path = "mcdonalds.html"

# Open the HTML file and read its content (decoded as UTF-8)
with open(path, 'r', encoding='utf-8') as f:
    html_content = f.read()

# Display the HTML content in the Streamlit app inside a 600px-high frame
components.html(html_content, height=600)


2024-10-25 00:32:20.006 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [84]:
!streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.215.12:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
