# Adaptive Taxonomy Mapper

**ML-based approach using sentence embeddings and cosine similarity**

In [71]:
#importing the required libraries
import json
import numpy as np
from sentence_transformers import SentenceTransformer

In [72]:
# Load test cases
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, which can garble non-ASCII characters in the story snippets.
with open('test_cases.json', encoding='utf-8') as f:
    test_cases = json.load(f)

In [73]:
# Taxonomy: sub-genre name -> short natural-language description.
# Each description is written as two sentences; adjacent string literals are
# concatenated by Python, so every value is a single string.
SUBGENRE_DESCRIPTIONS = {
    "Slow-burn": (
        "Romance that develops gradually. "
        "Friendship builds slowly into love over time."
    ),
    "Enemies-to-Lovers": (
        "Romance between rivals or enemies. "
        "Initial hate transforms into attraction."
    ),
    "Second Chance": (
        "Romance about reuniting with a past love. "
        "Meeting again after years apart."
    ),
    "Espionage": (
        "Stories about spies and secret agents. "
        "Covert missions and intelligence operations."
    ),
    "Psychological": (
        "Thrillers about mind games and mental manipulation. "
        "Paranoia and questioning reality."
    ),
    "Legal Thriller": (
        "Courtroom drama and legal battles. "
        "Lawyers, judges, trials, and verdicts."
    ),
    "Hard Sci-Fi": (
        "Science fiction based on real physics. "
        "Technical explanations and scientific accuracy."
    ),
    "Space Opera": (
        "Epic space adventures. "
        "Galactic empires, alien civilizations, space battles."
    ),
    "Cyberpunk": (
        "Dystopian future with high tech. "
        "Neon cities, hackers, AI, and corporate control."
    ),
    "Psychological Horror": (
        "Fear from losing sanity. "
        "Mental torment and unreliable perceptions."
    ),
    "Gothic": (
        "Atmospheric horror in old mansions. "
        "Family curses, dark secrets, Victorian settings."
    ),
    "Slasher": (
        "Killer hunting victims. "
        "Masked killer stalking teenagers in isolated locations."
    ),
}

In [74]:
#model loading
# Identifier handed to SentenceTransformer; kept as a named constant so the
# embedding model can be swapped in one place.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [75]:
# Split the taxonomy into two index-aligned lists: entry i of subgenre_names
# is the label for entry i of subgenre_texts.
subgenre_names = [name for name in SUBGENRE_DESCRIPTIONS]
subgenre_texts = [SUBGENRE_DESCRIPTIONS[name] for name in subgenre_names]

# Embed all descriptions with one encode() call
subgenre_embeddings = model.encode(
    subgenre_texts,
    show_progress_bar=False,
)

In [76]:
def cosine_similarity(a, b):
    """Calculate cosine similarity between two vectors.

    Args:
        a: 1-D array-like vector.
        b: 1-D array-like vector of the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        angle is undefined and 0.0 is returned instead of NaN.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the degenerate case: dividing by a zero norm would yield NaN (plus
    # a RuntimeWarning), and NaN compares false against any threshold.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# Words ignored when explaining a match. Built once rather than on every call.
_STOPWORDS = frozenset({
    'a', 'an', 'the', 'is', 'was', 'were', 'had', 'has', 'in', 'on', 'at',
    'to', 'for', 'with', 'from', 'by', 'of', 'and', 'but', 'or', 'so', 'then',
    'they', 'their', 'he', 'she', 'it', 'who', 'must', 'each', 'others',
})


def extract_contributions(text, target_emb, model, top_k=3):
    """Rank the words of ``text`` by their similarity to a target embedding.

    Each whitespace-separated token is stripped of surrounding punctuation and
    lower-cased; tokens of length <= 2 and stopwords are dropped. The
    remaining distinct words are embedded in one batch and scored with
    ``cosine_similarity`` against ``target_emb``.

    Args:
        text: Passage to pull candidate words from.
        target_emb: Embedding of the category the passage was matched to.
        model: Object exposing ``encode(list_of_str, show_progress_bar=False)``
            that returns one embedding per input string.
        top_k: Maximum number of words to return (default 3).

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score, at most
        ``top_k`` long. Empty if no candidate words remain or ``top_k <= 0``.
    """
    tokens = (raw.strip(".,!?;:()\"").lower() for raw in text.split())
    # dict.fromkeys de-duplicates while keeping first-occurrence order. A set
    # would make the order (and therefore how ties are broken by the stable
    # sort below) depend on per-process string hashing.
    unique_words = list(dict.fromkeys(
        tok for tok in tokens if len(tok) > 2 and tok not in _STOPWORDS
    ))
    if not unique_words or top_k <= 0:
        return []

    word_embs = model.encode(unique_words, show_progress_bar=False)
    scored_words = [
        (word, float(cosine_similarity(emb, target_emb)))
        for word, emb in zip(unique_words, word_embs)
    ]
    scored_words.sort(key=lambda pair: pair[1], reverse=True)
    return scored_words[:top_k]

**map_story**: Encodes a story snippet into an embedding and compares it with every predefined sub-genre description using cosine similarity. The closest sub-genre is returned unless its similarity falls below the threshold (default 0.15), in which case the story is flagged as [UNMAPPED]. Each result also carries a brief explanation of the decision.

In [77]:
def map_story(story_text, threshold=0.15):
    """Map story to best matching sub-genre based on semantic similarity.

    Args:
        story_text: The story snippet to classify.
        threshold: Minimum cosine similarity the best sub-genre must reach;
            anything lower is reported as unmapped.

    Returns:
        A ``(category, score, reasoning)`` tuple. ``category`` is a sub-genre
        name or ``"[UNMAPPED]"``, ``score`` is the best similarity as a plain
        float, and ``reasoning`` is a human-readable explanation.
    """
    # Nothing to embed: report it as unmapped instead of classifying noise.
    if not story_text or not story_text.strip():
        return "[UNMAPPED]", 0.0, "Empty story text - nothing to classify"

    # Get story embedding
    story_emb = model.encode(story_text, show_progress_bar=False)

    # Calculate similarities with all sub-genres
    similarities = [cosine_similarity(story_emb, sg_emb) for sg_emb in subgenre_embeddings]

    # Find best match
    best_idx = int(np.argmax(similarities))
    best_score = float(similarities[best_idx])
    best_subgenre = subgenre_names[best_idx]

    # Apply threshold for honest unmapping. Written as "not >=" so that a NaN
    # score (which compares false to everything) is also treated as unmapped.
    if not best_score >= threshold:
        return "[UNMAPPED]", best_score, f"Low semantic similarity ({best_score:.3f}) - doesn't fit taxonomy"

    # Generate dynamic reasoning from snippet
    target_emb = subgenre_embeddings[best_idx]
    top_words = extract_contributions(story_text, target_emb, model)

    reasoning_lines = [f"Matched {best_subgenre} because the passage contains:"]
    reasoning_lines.extend(
        f"- {word} (contribution: {score:.3f})" for word, score in top_words
    )
    return best_subgenre, best_score, "\n".join(reasoning_lines)

In [78]:
# Classify every test case, keep a record of it, and echo it to the output
banner = "=" * 80
print(banner, "RESULTS", banner, sep="\n")

results = []

for case in test_cases:
    snippet = case['story_snippet']
    category, score, reason = map_story(snippet)

    # One record per case; score is coerced to a plain float
    record = {
        'id': case['id'],
        'category': category,
        'score': float(score),
        'reasoning': reason,
    }
    results.append(record)

    # Single print call; sep="\n" yields the same five lines as before
    print(
        f"\nCase #{case['id']}",
        f"  Story: {snippet[:60]}...",
        f"  -> Category: {category}",
        f"  -> Similarity: {score:.4f}",
        f"  -> Reasoning:\n{reason}",
        sep="\n",
    )

RESULTS

Case #1
  Story: They hated each other for years, working in the same cubicle...
  -> Category: Enemies-to-Lovers
  -> Similarity: 0.3550
  -> Reasoning:
Matched Enemies-to-Lovers because the passage contains:
- hated (contribution: 0.310)
- other (contribution: 0.174)
- same (contribution: 0.170)

Case #2
  Story: Agent Smith must recover the stolen drive without being dete...
  -> Category: Espionage
  -> Similarity: 0.2233
  -> Reasoning:
Matched Espionage because the passage contains:
- agent (contribution: 0.351)
- kremlin (contribution: 0.237)
- stolen (contribution: 0.155)

Case #3
  Story: The old Victorian mansion seemed to breathe, its corridors w...
  -> Category: Gothic
  -> Similarity: 0.6671
  -> Reasoning:
Matched Gothic because the passage contains:
- mansion (contribution: 0.500)
- victorian (contribution: 0.498)
- dark (contribution: 0.374)

Case #4
  Story: A story about a man who falls in love with his AI operating ...
  -> Category: Cyberpunk
  -> Similari

The system successfully maps story snippets to the most semantically relevant sub-genre by prioritizing story context over user-provided tags.

A similarity threshold is applied to ensure honest classification, correctly marking non-fiction or irrelevant content as [UNMAPPED].

For each mapped case, the model provides transparent reasoning by listing the words in the snippet that are most similar to the matched sub-genre's description.

Ambiguous and misleading cases are handled correctly, demonstrating the robustness of the semantic similarity-based approach.

In [79]:
# Aggregate statistics over the classified cases
total_cases = len(results)
mapped_scores = [r['score'] for r in results if r['category'] != '[UNMAPPED]']
mapped = len(mapped_scores)
unmapped = total_cases - mapped

# np.mean of an empty list warns and returns NaN; produce the NaN explicitly
# so a run where nothing was mapped stays warning-free.
avg_similarity = float(np.mean(mapped_scores)) if mapped_scores else float('nan')

# Percentages are reported as 0 when there are no cases, avoiding a
# ZeroDivisionError on an empty result list.
mapped_pct = mapped / total_cases * 100 if total_cases else 0.0
unmapped_pct = unmapped / total_cases * 100 if total_cases else 0.0

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"Total Cases: {total_cases}")
print(f"Mapped: {mapped} ({mapped_pct:.0f}%)")
print(f"Unmapped: {unmapped} ({unmapped_pct:.0f}%)")
print(f"Average Similarity: {avg_similarity:.4f}")
print("="*80)


SUMMARY
Total Cases: 10
Mapped: 8 (80%)
Unmapped: 2 (20%)
Average Similarity: 0.4138
