In [1]:
import os
import pandas as pd
import networkx as nx
from pyvis.network import Network
from IPython.display import display, HTML
import ast
import json



def _topic_counts_by_parent(hierarchical_topics):
    """Map each ``Parent_ID`` to the number of entries in its ``Topics`` literal.

    The ``Topics`` column is parsed with ``ast.literal_eval`` and measured with
    ``len``, so each cell must hold the textual form of a sized Python literal
    (presumably a list of topic ids -- confirm against the code that writes the CSV).
    """
    counts = hierarchical_topics["Topics"].apply(ast.literal_eval).apply(len)
    # .tolist() converts NumPy scalars to native Python ones; pyvis asserts that
    # node ids are plain ``int``/``str`` and serialises node options with ``json``.
    return dict(zip(hierarchical_topics["Parent_ID"].tolist(), counts.tolist()))


def create_network(path_folder, output_file="graphs.html"):
    """Build an interactive pyvis graph of the topic hierarchy and save it as HTML.

    Reads ``database_hierarchical_topics.csv`` from ``path_folder`` and adds, for
    every row, one edge from the parent to its left child and one edge from the
    parent to its right child. The edge ``value`` is the row's ``Distance``.

    Node attributes:
        * ``mass``  -- the node's own topic count when it appears as a parent in
          the CSV; 1 for nodes that never appear as a parent (assumed to be
          single topics -- TODO confirm). Previously the parent's count was
          also applied to its children, and because pyvis ignores ``add_node``
          for an id it already knows, the first (wrong) value stuck.
        * ``value`` -- number of neighbours of the node.
        * ``title`` -- the node id followed by the list of its neighbours.

    Arguments:
        path_folder: Folder containing ``database_hierarchical_topics.csv``.
        output_file: Path of the HTML file to write. Defaults to
                     ``"graphs.html"``, the name that used to be hard-coded.

    Returns:
        The populated ``Network`` instance, so callers can tweak or display it.

    Raises:
        Whatever ``pd.read_csv`` raises when the CSV is missing or unreadable,
        and ``KeyError`` if one of the expected columns is absent.
    """
    hierarchical_topics = pd.read_csv(
        os.path.join(path_folder, "database_hierarchical_topics.csv")
    )

    news_net = Network(
        height="750px",
        width="100%",
        bgcolor="#222222",
        font_color="white",
        select_menu=True,
        layout=True,
    )
    news_net.barnes_hut()

    topic_counts = _topic_counts_by_parent(hierarchical_topics)

    def add_topic_node(node_id, label):
        # Nodes that never show up as a parent have no Topics entry of their own.
        news_net.add_node(
            node_id,
            label,
            title=str(node_id),
            mass=topic_counts.get(node_id, 1),
        )

    parent_ids = hierarchical_topics["Parent_ID"].tolist()
    parent_names = hierarchical_topics["Parent_Name"].tolist()
    distances = hierarchical_topics["Distance"].tolist()

    # All parent -> left-child edges first, then all parent -> right-child edges
    # (same insertion order as concatenating the two column sets).
    for side in ("Left", "Right"):
        child_ids = hierarchical_topics[f"Child_{side}_ID"].tolist()
        child_names = hierarchical_topics[f"Child_{side}_Name"].tolist()
        for parent_id, parent_name, child_id, child_name, distance in zip(
            parent_ids, parent_names, child_ids, child_names, distances
        ):
            add_topic_node(parent_id, parent_name)
            add_topic_node(child_id, child_name)
            news_net.add_edge(parent_id, child_id, value=distance)

    neighbor_map = news_net.get_adj_list()

    for node in news_net.nodes:
        neighbors = neighbor_map[node["id"]]
        neighbors_str = ", ".join(str(neighbor) for neighbor in neighbors)
        # The title must be a string before it can be concatenated.
        node["title"] = str(node["title"]) + "\n Neighbors : [" + neighbors_str + "]"
        # Displayed node size follows the number of neighbours.
        node["value"] = len(neighbors)

    news_net.toggle_physics(True)
    news_net.show_buttons(True)
    news_net.inherit_edge_colors(True)
    news_net.save_graph(output_file)

    return news_net
    

if __name__ == "__main__":
    # Folder that holds the CSV exports consumed by create_network.
    dossier = "archive/test/result"
    create_network(path_folder=dossier)

    

Warning: When  cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.

In [2]:

def visualize_hierarchy(topic_model,
                        orientation: str = "left",
                        topics: List[int] = None,
                        top_n_topics: int = None,
                        custom_labels: Union[bool, str] = False,
                        title: str = "<b>Hierarchical Clustering</b>",
                        width: int = 1000,
                        height: int = 600,
                        hierarchical_topics: pd.DataFrame = None,
                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,
                        distance_function: Callable[[csr_matrix], csr_matrix] = None,
                        color_threshold: int = 1) -> go.Figure:
    """ Visualize a hierarchical structure of the topics

    A ward linkage function is used to perform the
    hierarchical clustering based on the cosine distance
    matrix between topic embeddings.

    Arguments:
        topic_model: A fitted BERTopic instance.
        orientation: The orientation of the figure.
                     Either 'left' or 'bottom'
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        custom_labels: If bool, whether to use custom topic labels that were defined using 
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
                       NOTE: Custom labels are only generated for the original 
                       un-merged topics.
        title: Title of the plot.
        width: The width of the figure. Only works if orientation is set to 'left'
        height: The height of the figure. Only works if orientation is set to 'bottom'
        hierarchical_topics: A dataframe that contains a hierarchy of topics
                             represented by their parents and their children.
                             NOTE: The hierarchical topic names are only visualized
                             if both `topics` and `top_n_topics` are not set.
        linkage_function: The linkage function to use. Default is:
                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
                          NOTE: Make sure to use the same `linkage_function` as used
                          in `topic_model.hierarchical_topics`.
        distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
                           `lambda x: 1 - cosine_similarity(x)`.
                            You can pass any function that returns either a square matrix of 
                            shape (n_samples, n_samples) with zeros on the diagonal and 
                            non-negative values or condensed distance matrix of shape 
                            (n_samples * (n_samples - 1) / 2,) containing the upper 
                            triangular of the distance matrix.
                           NOTE: Make sure to use the same `distance_function` as used
                           in `topic_model.hierarchical_topics`.
        color_threshold: Value at which the separation of clusters will be made which
                         will result in different colors for different clusters.
                         A higher value will typically lead in less colored clusters.

    Returns:
        fig: A plotly figure

    Examples:

    To visualize the hierarchical structure of
    topics simply run:

    ```python
    topic_model.visualize_hierarchy()
    ```

    If you also want the labels visualized of hierarchical topics,
    run the following:

    ```python
    # Extract hierarchical topics and their representations
    hierarchical_topics = topic_model.hierarchical_topics(docs)

    # Visualize these representations
    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
    ```

    If you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_hierarchy()
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/hierarchy.html"
    style="width:1000px; height: 680px; border: 0px;""></iframe>
    """
    if distance_function is None:
        distance_function = lambda x: 1 - cosine_similarity(x)

    if linkage_function is None:
        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Select embeddings
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topics])

    # Select topic embeddings
    if topic_model.c_tf_idf_ is not None:
        embeddings = topic_model.c_tf_idf_[indices]
    else:
        embeddings = np.array(topic_model.topic_embeddings_)[indices]
        
    # Annotations
    if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()):
        annotations = _get_annotations(topic_model=topic_model,
                                       hierarchical_topics=hierarchical_topics,
                                       embeddings=embeddings,
                                       distance_function=distance_function,
                                       linkage_function=linkage_function,
                                       orientation=orientation,
                                       custom_labels=custom_labels)
    else:
        annotations = None

    # wrap distance function to validate input and return a condensed distance matrix
    distance_function_viz = lambda x: validate_distance_matrix(
        distance_function(x), embeddings.shape[0])
    # Create dendogram
    fig = ff.create_dendrogram(embeddings,
                               orientation=orientation,
                               distfun=distance_function_viz,
                               linkagefun=linkage_function,
                               hovertext=annotations,
                               color_threshold=color_threshold)