In [None]:
from matplotlib import pyplot as plt
import pandas as pd
from glob import glob
import os
import seaborn as sns

# Float

In [None]:
# Float32 embeddings: identifier of the repository whose results are plotted
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"

# The MRL results sit in the 'mrl' subfolder of the repository folder
results_folder = f'{repo_id}/mrl'

# Collect every parquet file directly inside that folder, in sorted order
parquets = sorted(glob(f'{results_folder}/*.parquet', recursive=True))

In [None]:
# Render one category-coloured scatter plot per parquet file and save it as a
# PNG next to the source file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted unique categories give a stable category -> colour assignment
    unique_categories = sorted(df['categories'].unique())

    # 'C{i}' colour specs wrap around the property cycle, so categories beyond
    # the cycle length would silently share a colour. Use the cycle colours
    # while they suffice and an evenly spaced palette otherwise.
    cycle_colors = plt.rcParams['axes.prop_cycle'].by_key().get('color', ['k'])
    if len(unique_categories) <= len(cycle_colors):
        palette = cycle_colors[:len(unique_categories)]
    else:
        palette = sns.color_palette('husl', len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Per-point colours, kept separate so the dataframe is not modified
    point_colors = df['categories'].map(category_colors)

    # Create a scatter plot of the MRL projection on an explicit figure/axes
    fig, ax = plt.subplots()
    ax.scatter(df['x'], df['y'], c=point_colors, s=1, alpha=0.5)

    ax.set_title(f'Scatter plot of Float MRL Projection for the year {os.path.basename(parquet).split(".")[0]}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    ax.legend(handles=[plt.Line2D([0], [0], marker='o', color='w', label=category,
                                  markerfacecolor=color, markersize=5)
                       for category, color in category_colors.items()],
              title='Categories', loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    fig.savefig(f"{os.path.splitext(parquet)[0]}_scatter_float.png", dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Close the figure so figures do not accumulate across iterations
    plt.close(fig)
    

In [None]:
# Render one category-coloured 2-D KDE plot per parquet file and save it as a
# PNG next to the source file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted unique categories give a stable category -> colour assignment
    unique_categories = sorted(df['categories'].unique())

    # 'C{i}' colour specs wrap around the property cycle, so categories beyond
    # the cycle length would silently share a colour. Use the cycle colours
    # while they suffice and an evenly spaced palette otherwise.
    cycle_colors = plt.rcParams['axes.prop_cycle'].by_key().get('color', ['k'])
    if len(unique_categories) <= len(cycle_colors):
        palette = cycle_colors[:len(unique_categories)]
    else:
        palette = sns.color_palette('husl', len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Create a kde plot of the MRL projection.
    # Group by the category itself and hand seaborn the explicit palette:
    # grouping by a column of colour-name strings only uses them as labels and
    # lets seaborn pick its own colours, which then disagree with the legend.
    fig, ax = plt.subplots()
    sns.kdeplot(data=df, x="x", y="y", hue="categories",
                hue_order=unique_categories, palette=category_colors,
                fill=True, legend=False, ax=ax)

    ax.set_title(f'KDE plot of Float MRL Projection for the year {os.path.basename(parquet).split(".")[0]}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    ax.legend(handles=[plt.Line2D([0], [0], marker='o', color='w', label=category,
                                  markerfacecolor=color, markersize=5)
                       for category, color in category_colors.items()],
              title='Categories', loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    fig.savefig(f"{os.path.splitext(parquet)[0]}_kde_float.png", dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Close the figure so figures do not accumulate across iterations
    plt.close(fig)
    



# Binary

In [None]:
# Binary embeddings: identifier of the repository whose results are plotted
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"

# The MRL results sit in the 'mrl' subfolder of the repository folder
results_folder = f'{repo_id}/mrl'

# Collect every parquet file directly inside that folder, in sorted order
parquets = sorted(glob(f'{results_folder}/*.parquet', recursive=True))

In [None]:
# Render one category-coloured scatter plot per parquet file and save it as a
# PNG next to the source file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted unique categories give a stable category -> colour assignment
    unique_categories = sorted(df['categories'].unique())

    # 'C{i}' colour specs wrap around the property cycle, so categories beyond
    # the cycle length would silently share a colour. Use the cycle colours
    # while they suffice and an evenly spaced palette otherwise.
    cycle_colors = plt.rcParams['axes.prop_cycle'].by_key().get('color', ['k'])
    if len(unique_categories) <= len(cycle_colors):
        palette = cycle_colors[:len(unique_categories)]
    else:
        palette = sns.color_palette('husl', len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Per-point colours, kept separate so the dataframe is not modified
    point_colors = df['categories'].map(category_colors)

    # Create a scatter plot of the MRL projection on an explicit figure/axes
    fig, ax = plt.subplots()
    ax.scatter(df['x'], df['y'], c=point_colors, s=1, alpha=0.5)

    ax.set_title(f'Scatter plot of Binary MRL Projection for the year {os.path.basename(parquet).split(".")[0]}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    ax.legend(handles=[plt.Line2D([0], [0], marker='o', color='w', label=category,
                                  markerfacecolor=color, markersize=5)
                       for category, color in category_colors.items()],
              title='Categories', loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    fig.savefig(f"{os.path.splitext(parquet)[0]}_scatter_binary.png", dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Close the figure so figures do not accumulate across iterations
    plt.close(fig)
    

In [None]:
# Render one category-coloured 2-D KDE plot per parquet file and save it as a
# PNG next to the source file.
for parquet in parquets:

    print(f'Processing {parquet}')

    # Read the parquet file into a pandas dataframe
    df = pd.read_parquet(parquet)

    # Sorted unique categories give a stable category -> colour assignment
    unique_categories = sorted(df['categories'].unique())

    # 'C{i}' colour specs wrap around the property cycle, so categories beyond
    # the cycle length would silently share a colour. Use the cycle colours
    # while they suffice and an evenly spaced palette otherwise.
    cycle_colors = plt.rcParams['axes.prop_cycle'].by_key().get('color', ['k'])
    if len(unique_categories) <= len(cycle_colors):
        palette = cycle_colors[:len(unique_categories)]
    else:
        palette = sns.color_palette('husl', len(unique_categories)).as_hex()
    category_colors = dict(zip(unique_categories, palette))

    # Create a kde plot of the MRL projection.
    # Group by the category itself and hand seaborn the explicit palette:
    # grouping by a column of colour-name strings only uses them as labels and
    # lets seaborn pick its own colours, which then disagree with the legend.
    fig, ax = plt.subplots()
    sns.kdeplot(data=df, x="x", y="y", hue="categories",
                hue_order=unique_categories, palette=category_colors,
                fill=True, legend=False, ax=ax)

    ax.set_title(f'KDE plot of Binary MRL Projection for the year {os.path.basename(parquet).split(".")[0]}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    # Hand-built legend: one round marker per category in its assigned colour
    ax.legend(handles=[plt.Line2D([0], [0], marker='o', color='w', label=category,
                                  markerfacecolor=color, markersize=5)
                       for category, color in category_colors.items()],
              title='Categories', loc='upper right', bbox_to_anchor=(1.15, 1))

    fig.tight_layout()

    # Swap only the file extension; str.replace would also rewrite any
    # '.parquet' occurring earlier in the path.
    fig.savefig(f"{os.path.splitext(parquet)[0]}_kde_binary.png", dpi=300, bbox_inches='tight')

    # Uncomment the line below to display the plot
    # plt.show()

    # Close the figure so figures do not accumulate across iterations
    plt.close(fig)
    

