In this file, we compare the embeddings of the references to the embedding of a different topic (in this case, 'ethics') via cosine similarity. After exploring the distribution and inspecting examples along it, we set a threshold that determines which references should be classified as belonging to that topic.
Then, we create a separate CSV file containing only those references, which we'll later analyze separately and offer as an option in our visualization tool.

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)


# How many examples to print from each end of the similarity ranking.
# The count at the top end is deliberately large so the ranking can be read
# downwards (presumably to judge where relevance tails off -- confirm).
N_MOST_SIMILAR = 1000
N_LEAST_SIMILAR = 10


def _print_contexts(rows, heading):
    """Print `heading`, then the similarity score and context of every row.

    `rows` must provide 'similarity' and 'context' columns; rows are printed
    in the order given, each followed by a 100-character dashed separator.
    """
    print(heading)
    for similarity, context in zip(rows['similarity'], rows['context']):
        print(f"Similarity: {similarity:.4f}")
        print(f"Context: {context}")
        print("-" * 100 + "\n")


# Load the references together with their precomputed 'similarity' column.
df = pd.read_csv('references.csv')

# The headings are derived from the number of rows actually selected, so the
# printed label can no longer disagree with the amount of output (the heading
# used to say "Top 10" while 1000 rows were printed).
most_similar = df.nlargest(N_MOST_SIMILAR, 'similarity')[['context', 'similarity']]
_print_contexts(
    most_similar,
    f"Top {len(most_similar)} contexts most similar to 'ethics':\n",
)

least_similar = df.nsmallest(N_LEAST_SIMILAR, 'similarity')[['context', 'similarity']]
_print_contexts(
    least_similar,
    f"\nTop {len(least_similar)} contexts least similar to 'ethics':\n",
)

In [None]:
# Summarise how the similarity scores are distributed: pandas' descriptive
# statistics first, then a count/percentage for fixed-width score ranges.
print("Similarity Distribution Analysis:\n")
print("Basic Statistics:")
print(df['similarity'].describe())
print("\n" + "-"*80 + "\n")

print("Distribution by ranges:")
# Contiguous (start, end) ranges covering -1.0 through 1.0.
ranges = [
    (-1.0, 0.0),
    (0.0, 0.2),
    (0.2, 0.4),
    (0.4, 0.6),
    (0.6, 0.8),
    (0.8, 1.0)
]

scores = df['similarity']
total_rows = len(df)
last_index = len(ranges) - 1

for index, (start, end) in enumerate(ranges):
    # Ranges are half-open [start, end) so neighbouring ranges never count a
    # row twice; only the final range also includes its upper edge, otherwise
    # a score of exactly 1.0 would not be counted in any range.
    if index == last_index:
        in_range = (scores >= start) & (scores <= end)
    else:
        in_range = (scores >= start) & (scores < end)
    count = int(in_range.sum())
    # Guard against an empty DataFrame, which would otherwise divide by zero.
    percentage = (count / total_rows) * 100 if total_rows else 0.0
    print(f"Range {start:4.1f} to {end:4.1f}: {count:5d} items ({percentage:5.1f}%)")

In [None]:
# Keep only the references whose similarity is strictly above the 0.26
# cut-off and write them to their own CSV file (without the index column).
# NOTE(review): the output name says "art" although the headings in this
# notebook refer to 'ethics' -- confirm the intended file name.
output_file = 'art_filtered.csv'

above_threshold = df['similarity'] > 0.26
filtered_df = df.loc[above_threshold]
filtered_df.to_csv(output_file, index=False)

# Report how many rows survived the filter and where they were written.
print(f"Created filtered dataset with {len(filtered_df)} rows (original had {len(df)} rows)")
print(f"Saved to {output_file}")
