## Initialize the Libraries

In [None]:
# !pip install -U sentence-transformers

In [None]:
import pandas as pd
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

## Calculating Similarity Between Answers With and Without Caption

In [None]:
# Location of the question/answer CSV; adjust this to your local file path.
file_path = "../data/qa_200_singleview.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Sentence-embedding model used to score answer similarity.
# Another SentenceTransformer model name can be used here if preferred.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to calculate cosine similarity
def calculate_similarity(no_caption, with_caption, model):
    """Return the cosine similarity between the embeddings of two answers.

    Args:
        no_caption: Answer produced without a caption. Missing values
            (as detected by ``pd.notna``) are treated as empty text.
        with_caption: Answer produced with a caption; handled the same way.
        model: Object whose ``encode(list_of_str)`` returns one vector per
            input string.

    Returns:
        ``0.0`` when either answer is missing or empty; otherwise
        ``1 - cosine_distance`` of the two embeddings, rounded to 5 decimals.
    """
    # Normalise both inputs to text, mapping missing cells to "".
    plain_text, captioned_text = (
        str(value) if pd.notna(value) else ""
        for value in (no_caption, with_caption)
    )

    # Nothing to compare if either side has no text.
    if not (plain_text and captioned_text):
        return 0.0

    # Each answer is encoded on its own; encode() returns a batch, so take row 0.
    plain_vec, captioned_vec = (
        model.encode([text])[0] for text in (plain_text, captioned_text)
    )

    # scipy's cosine() is a distance, so convert it to a similarity.
    distance = cosine(plain_vec, captioned_vec)
    return round(1 - distance, 5)

# Add one similarity column (Sim_A#) for every answer pair (A# vs. AwC#).
for i in range(1, 11):  # Pairs are expected to be numbered 1 through 10
    a_col = f"A{i}"        # Answer without caption
    awc_col = f"AwC{i}"    # Answer with caption
    sim_col = f"Sim_A{i}"  # Output column holding the similarity score

    # Only score pairs for which both columns are present; report the rest.
    if a_col in data.columns and awc_col in data.columns:
        # Row-wise apply: each row's pair of answers is scored independently.
        data[sim_col] = data.apply(
            lambda row: calculate_similarity(row[a_col], row[awc_col], model), axis=1
        )
    else:
        print(f"Columns {a_col} or {awc_col} not found in the DataFrame.")

# Write the augmented DataFrame back to the file it was loaded from.
# Reusing file_path (instead of repeating the literal) keeps the load and save
# locations in sync when the path above is changed. Note that this overwrites
# the input CSV.
data.to_csv(file_path, index=False)
data.head()