# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load Data

In [9]:
# Read the similarity-scored paragraphs CSV
df = pd.read_csv('data/04_categorized_data/similarity_scored_paragraphs.csv')

# Rename the un-numbered score column to 'similarity_score_0'.
# pop() removes the old column and the assignment appends it back under the
# new name, so the renamed column ends up as the last column of the frame.
df['similarity_score_0'] = df.pop('similarity_score')

# Basic Analysis

In [None]:
# Attach to every row the number of rows that share its 'id'
id_frequencies = df['id'].value_counts()
df['id_count'] = df['id'].map(id_frequencies)

# Collapse to one row per 'id'. Note that GroupBy.first() takes the first
# non-null value of each column within the group, not necessarily the first row.
aggregated_df = df.groupby('id', as_index=False).first()

# Rank the ids from most to least frequent and preview the top ten
sorted_df = aggregated_df.sort_values('id_count', ascending=False)
sorted_df.head(10)

In [None]:
# Summary statistics of the per-id row counts (one value per id)
id_counts = aggregated_df['id_count']
median_id_count = id_counts.median()
average_id_count = id_counts.mean()

# Show both figures as the cell output: (median, mean)
(median_id_count, average_id_count)

# Visualize Distributions of Similarity Scores

In [None]:
# Similarity score columns to visualize
similarity_cols = [
    "similarity_score_0", "similarity_score_1", "similarity_score_2",
    "similarity_score_3", "similarity_score_4", "similarity_score_5",
    "similarity_score_6", "similarity_score_7",
]

# Build the palette once, sized to the column list: every figure then uses the
# same colour per column, and editing the list cannot cause an IndexError.
palette = sns.color_palette("viridis", len(similarity_cols))

# Option 1: one histogram (with KDE overlay) per column, on a grid four panels wide
n_grid_cols = 4
n_grid_rows = -(-len(similarity_cols) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 10), squeeze=False)
flat_axes = axes.ravel()
for ax, col, colour in zip(flat_axes, similarity_cols, palette):
    sns.histplot(df[col], bins=30, kde=True, color=colour, ax=ax)
    ax.set_title(col)
    ax.set_xlabel("Score")
    ax.set_ylabel("Frequency")
# Hide any panels left over when the column count does not fill the grid
for ax in flat_axes[len(similarity_cols):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

# Option 2: all distributions overlaid as filled KDE curves on a single axis
fig, ax = plt.subplots(figsize=(14, 8))
for col, colour in zip(similarity_cols, palette):
    sns.kdeplot(df[col], label=col, fill=True, alpha=0.3, linewidth=2,
                color=colour, ax=ax)
ax.set_title("Distribution of All Similarity Scores")
ax.set_xlabel("Score Value")
ax.set_ylabel("Density")
# Place the legend outside the plotting area so it does not cover the curves
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# Reshape from wide to long format (one row per score value, labelled by its
# source column); the violin plot below consumes this long-form frame.
similarity_data = df[similarity_cols].copy()
similarity_long = pd.melt(similarity_data,
                          value_vars=similarity_cols,
                          var_name='Similarity_Type',
                          value_name='Score')

# Option 3: violin plot for a more detailed side-by-side distribution comparison
# NOTE(review): newer seaborn releases appear to warn when `palette` is passed
# without `hue` — confirm against the installed seaborn version before changing.
fig, ax = plt.subplots(figsize=(14, 8))
sns.violinplot(data=similarity_long, x="Similarity_Type", y="Score",
               palette="viridis", inner="quartile", ax=ax)
ax.set_title("Detailed Comparison of Similarity Score Distributions")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()