In [1]:
from matplotlib import pyplot as plt
import pandas as pd
from glob import glob
import os
import seaborn as sns

# Float

In [None]:
# Float32 embeddings: repo id, also used as the root of the local folder tree
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"

# Folder where the UMAP results are stored (euclidean sub-folder)
results_folder = f"{repo_id}/umap/euclidean"

# Collect every parquet file sitting directly in the results folder, sorted by path
parquets = sorted(glob(f"{results_folder}/*.parquet"))

In [None]:
# Render one category-coloured scatter plot of the float UMAP projection per
# parquet file in `parquets` and save it as a PNG next to that file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted list of the distinct categories. sorted() is used instead of the
    # in-place ndarray .sort() so this also works when unique() returns a
    # pandas extension array.
    unique_categories = sorted(df['categories'].unique())

    # One distinct colour per category. Matplotlib's 'C<i>' shorthand wraps
    # around the property cycle (10 colours in the default style), so with
    # more categories than cycle entries several categories would share a
    # colour and the legend would be ambiguous.
    palette = sns.color_palette('husl', n_colors=len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Per-point colours, kept out of df so the loaded frame is not modified
    point_colors = df['categories'].map(category_colors).tolist()

    # Explicit figure/axes so nothing leaks between loop iterations
    fig, ax = plt.subplots()

    # Create a scatter plot of the UMAP projection
    ax.scatter(df['x'], df['y'], c=point_colors, s=1, alpha=0.5)

    year = os.path.basename(parquet).split(".")[0]
    ax.set_title(f'Scatter plot of Float UMAP Projection for the year {year}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=category,
                   markerfacecolor=color, markersize=5)
        for category, color in category_colors.items()
    ]
    ax.legend(handles=legend_handles, title='Categories',
              loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    output_path = os.path.splitext(parquet)[0] + '_scatter_float.png'
    fig.savefig(output_path, dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Release the figure so memory does not grow across the loop
    plt.close(fig)
    

In [None]:
# Render one per-category bivariate KDE plot of the float UMAP projection per
# parquet file in `parquets` and save it as a PNG next to that file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted list of the distinct categories. sorted() is used instead of the
    # in-place ndarray .sort() so this also works when unique() returns a
    # pandas extension array.
    unique_categories = sorted(df['categories'].unique())

    # One distinct colour per category. Matplotlib's 'C<i>' shorthand wraps
    # around the property cycle (10 colours in the default style), so with
    # more categories than cycle entries several categories would share a
    # colour and the legend would be ambiguous.
    palette = sns.color_palette('husl', n_colors=len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Explicit figure/axes so nothing leaks between loop iterations
    fig, ax = plt.subplots()

    # Create a kde plot of the UMAP projection.
    # Group by the category itself and hand seaborn the explicit
    # category -> colour mapping. Grouping by a column of colour-name strings
    # makes seaborn assign its own palette to those strings, so the drawn
    # densities would not match the colours shown in the legend below.
    # legend=False because the legend is built by hand afterwards.
    sns.kdeplot(data=df, x="x", y="y", hue="categories",
                hue_order=unique_categories, palette=category_colors,
                fill=True, legend=False, ax=ax)

    year = os.path.basename(parquet).split(".")[0]
    ax.set_title(f'KDE plot of Float UMAP Projection for the year {year}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=category,
                   markerfacecolor=color, markersize=5)
        for category, color in category_colors.items()
    ]
    ax.legend(handles=legend_handles, title='Categories',
              loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    output_path = os.path.splitext(parquet)[0] + '_kde_float.png'
    fig.savefig(output_path, dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Release the figure so memory does not grow across the loop
    plt.close(fig)
    



# Binary

In [2]:
# Binary embeddings: repo id, also used as the root of the local folder tree
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"

# Folder where the UMAP results are stored (hamming sub-folder)
results_folder = f"{repo_id}/umap/hamming"

# Collect every parquet file sitting directly in the results folder, sorted by path
parquets = sorted(glob(f"{results_folder}/*.parquet"))

In [3]:
# Render one category-coloured scatter plot of the binary UMAP projection per
# parquet file in `parquets` and save it as a PNG next to that file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted list of the distinct categories. sorted() is used instead of the
    # in-place ndarray .sort() so this also works when unique() returns a
    # pandas extension array.
    unique_categories = sorted(df['categories'].unique())

    # One distinct colour per category. Matplotlib's 'C<i>' shorthand wraps
    # around the property cycle (10 colours in the default style), so with
    # more categories than cycle entries several categories would share a
    # colour and the legend would be ambiguous.
    palette = sns.color_palette('husl', n_colors=len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Per-point colours, kept out of df so the loaded frame is not modified
    point_colors = df['categories'].map(category_colors).tolist()

    # Explicit figure/axes so nothing leaks between loop iterations
    fig, ax = plt.subplots()

    # Create a scatter plot of the UMAP projection
    ax.scatter(df['x'], df['y'], c=point_colors, s=1, alpha=0.5)

    year = os.path.basename(parquet).split(".")[0]
    ax.set_title(f'Scatter plot of Binary UMAP Projection for the year {year}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=category,
                   markerfacecolor=color, markersize=5)
        for category, color in category_colors.items()
    ]
    ax.legend(handles=legend_handles, title='Categories',
              loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    output_path = os.path.splitext(parquet)[0] + '_scatter_binary.png'
    fig.savefig(output_path, dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Release the figure so memory does not grow across the loop
    plt.close(fig)
    

Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1991.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1992.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1993.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1994.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1995.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1996.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1997.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1998.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1999.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/h

In [4]:
# Render one per-category bivariate KDE plot of the binary UMAP projection per
# parquet file in `parquets` and save it as a PNG next to that file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted list of the distinct categories. sorted() is used instead of the
    # in-place ndarray .sort() so this also works when unique() returns a
    # pandas extension array.
    unique_categories = sorted(df['categories'].unique())

    # One distinct colour per category. Matplotlib's 'C<i>' shorthand wraps
    # around the property cycle (10 colours in the default style), so with
    # more categories than cycle entries several categories would share a
    # colour and the legend would be ambiguous.
    palette = sns.color_palette('husl', n_colors=len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Explicit figure/axes so nothing leaks between loop iterations
    fig, ax = plt.subplots()

    # Create a kde plot of the UMAP projection.
    # Group by the category itself and hand seaborn the explicit
    # category -> colour mapping. Grouping by a column of colour-name strings
    # makes seaborn assign its own palette to those strings, so the drawn
    # densities would not match the colours shown in the legend below.
    # legend=False because the legend is built by hand afterwards.
    sns.kdeplot(data=df, x="x", y="y", hue="categories",
                hue_order=unique_categories, palette=category_colors,
                fill=True, legend=False, ax=ax)

    year = os.path.basename(parquet).split(".")[0]
    ax.set_title(f'KDE plot of Binary UMAP Projection for the year {year}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=category,
                   markerfacecolor=color, markersize=5)
        for category, color in category_colors.items()
    ]
    ax.legend(handles=legend_handles, title='Categories',
              loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    output_path = os.path.splitext(parquet)[0] + '_kde_binary.png'
    fig.savefig(output_path, dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Release the figure so memory does not grow across the loop
    plt.close(fig)
    



Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1991.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1992.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1993.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1994.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1995.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1996.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1997.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1998.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/1999.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/h

  sns.kdeplot(data=df, x="x", y="y", hue="color", fill=True,)


Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2001.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2002.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2003.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2004.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2005.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2006.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2007.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2008.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2009.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/h

  sns.kdeplot(data=df, x="x", y="y", hue="color", fill=True,)


Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2011.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2012.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2013.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2014.parquet


  sns.kdeplot(data=df, x="x", y="y", hue="color", fill=True,)


Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2015.parquet


  sns.kdeplot(data=df, x="x", y="y", hue="color", fill=True,)


Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2016.parquet


  sns.kdeplot(data=df, x="x", y="y", hue="color", fill=True,)


Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2017.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2018.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2019.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2020.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2021.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2022.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2023.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2024.parquet
Processing bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary/umap/hamming/2025.parquet
