<a href="https://colab.research.google.com/github/payal15604/Sentence_Similarity_Pretrained/blob/main/Best_Pretrained_Model_for_Sentence_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import plotly.express as px

In [4]:
# Step 1: Define the decision matrix (one row per model, one column per criterion).
# Column order: [sentence embedding performance, semantic search performance, inference speed].
# Per the original author, these figures were taken from the sentence pretrained-model
# documentation (presumably Sentence-Transformers -- TODO confirm source and units).
# Row order must stay in sync with the model name list used when ranking.
decision_matrix = np.array(
    [
        # all-mpnet-base-v2
        [86.16, 80.11, 1000.0],
        # all-MiniLM-L6-v2
        [84.43, 78.29, 5100.0],
        # paraphrase-MiniLM-L12-v2
        [84.92, 78.56, 3300.0],
        # multi-qa-mpnet-base-dot-v1
        [85.75, 81.36, 4000.0],
        # multi-qa-distilbert-dot-v1
        [83.50, 79.50, 7000.0],
        # multi-qa-MiniLM-L6-dot-v1
        [82.70, 78.00, 18000.0],
    ],
    dtype=np.float64,
)


In [5]:
# Step 2: Normalize the decision matrix (vector normalization).
# Each column is divided by its Euclidean (L2) norm so that criteria measured
# on different scales become comparable.
column_norms = np.sqrt(np.sum(np.square(decision_matrix), axis=0))
norm_matrix = decision_matrix / column_norms

In [7]:
# Step 3: Define the criterion weights, one per column of the decision matrix.
# Roughly equal importance for now; the last weight is 0.34 so the three sum to 1.
weights = np.array((0.33, 0.33, 0.34), dtype=np.float64)

In [8]:
# Step 4: Compute the weighted normalized decision matrix.
# Broadcasting scales every column of norm_matrix by its criterion weight.
weighted_matrix = np.multiply(norm_matrix, weights)


In [9]:
# Step 5: Determine the ideal and negative-ideal solutions.
# Every criterion is treated as a benefit (higher is better): the ideal point
# takes the column-wise maximum and the negative-ideal the column-wise minimum.
ideal_best = weighted_matrix.max(axis=0)
ideal_worst = weighted_matrix.min(axis=0)

In [10]:
# Step 6: Compute separation measures
def calculate_distance(matrix, ideal):
    """Return the Euclidean distance from each row of ``matrix`` to ``ideal``.

    Args:
        matrix: 2-D array with one row per alternative.
        ideal: Reference point; must broadcast against the rows of ``matrix``.

    Returns:
        1-D array holding one distance per row of ``matrix``.
    """
    offsets = matrix - ideal
    return np.sqrt(np.sum(np.square(offsets), axis=1))

# Separation of every model from the ideal-best point.
s_plus = calculate_distance(matrix=weighted_matrix, ideal=ideal_best)
# Separation of every model from the ideal-worst point.
s_minus = calculate_distance(matrix=weighted_matrix, ideal=ideal_worst)


In [11]:
# Step 7: Compute TOPSIS scores (relative closeness to the ideal solution).
# A score is the share of a model's total separation that lies on the
# negative-ideal side, so values nearer 1 mean nearer the ideal point.
total_separation = s_plus + s_minus
scores = s_minus / total_separation


In [12]:
# Step 8: Rank models
# Names are listed in the same order as the rows of the decision matrix.
model_names = [
    "all-mpnet-base-v2",
    "all-MiniLM-L6-v2",
    "paraphrase-MiniLM-L12-v2",
    "multi-qa-mpnet-base-dot-v1",
    "multi-qa-distilbert-dot-v1",
    "multi-qa-MiniLM-L6-dot-v1"
]

# zip() silently drops unmatched items, so fail loudly if the name list and
# the score vector ever get out of step (e.g. a row added to the matrix only).
if len(model_names) != len(scores):
    raise ValueError(
        f"Expected one model name per score, got {len(model_names)} names "
        f"and {len(scores)} scores."
    )

# Pair each name with its score and order from highest to lowest score.
ranked_models = sorted(zip(model_names, scores), key=lambda x: x[1], reverse=True)

# Print results
print("Model Rankings based on TOPSIS:")
for rank, (model, score) in enumerate(ranked_models, 1):
    print(f"{rank}. {model} - Score: {score:.4f}")


Model Rankings based on TOPSIS:
1. multi-qa-MiniLM-L6-dot-v1 - Score: 0.9724
2. multi-qa-distilbert-dot-v1 - Score: 0.3529
3. all-MiniLM-L6-v2 - Score: 0.2413
4. multi-qa-mpnet-base-dot-v1 - Score: 0.1781
5. paraphrase-MiniLM-L12-v2 - Score: 0.1358
6. all-mpnet-base-v2 - Score: 0.0230


In [13]:
# Display the best model: the first entry of the ranking, which is sorted by
# descending score, holds the top-scoring (name, score) pair.
top_entry = ranked_models[0]
best_model = top_entry[0]
print(f"\nBest Model: {best_model}")




Best Model: multi-qa-MiniLM-L6-dot-v1


In [14]:
# Visualization: bar chart of the TOPSIS scores using Plotly.
# Split the ranked (name, score) pairs once and reuse them for every argument.
ranked_names = [name for name, _ in ranked_models]
ranked_scores = [score for _, score in ranked_models]

fig = px.bar(
    x=ranked_names,
    y=ranked_scores,
    labels={'x': "Model Name", 'y': "TOPSIS Score"},
    title="TOPSIS Ranking of Pretrained Models",
    # Label each bar with its score rounded to four decimals.
    text=[f"{score:.4f}" for score in ranked_scores],
    # Shade the bars by score.
    color=ranked_scores,
    color_continuous_scale="Blues"
)
# Put the score labels above the bars and tilt the model names on the x-axis.
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_tickangle=-45)
fig.show()
