In [416]:

# Dependencies are listed in requirements.txt file
!pip install -r requirements.txt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')



In [20]:
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from typing import List, Dict, Set
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from typing import List, Dict
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re
import pandas as pd

def preprocess_text(text, is_category = False):
    """Clean and preprocess document text"""

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    
    return ' '.join(tokens)

def load_amazon_data(file_path: str) -> List[Dict]:
    """Load and preprocess Amazon dataset"""
    with open(file_path, "r", encoding='utf-8') as f:
        data = json.load(f)
    
    documents = []
    excluded_categories = {'Men', 'Women'}
    
    for item in data:
        # Skip if no categories after exclusion
        categories = [cat for cat in item['category'] if cat not in excluded_categories]
        if not categories:
            continue
            
        # Combine text fields
        desc = item.get("description", "")
        reviews_text = " ".join([review['text'] for review in item['reviews']])
        text = f"{item['title']} {desc} {reviews_text}"
        
        # Preprocess combined text
        clean_text = preprocess_text(text)
        
        # Add document with filtered categories
        documents.append({
            'text': clean_text,
            'category': categories  # Limit to 3 levels
        })
    
    return documents

In [21]:
torch.cuda.empty_cache()

In [167]:
class TaxonomyNode:
    def __init__(self, name: str):
        self.name = name
        self.children = {}
        self.parent = None
        self.level = 0  # Add level tracking

class CoreClassAnnotator:
    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu', batch_size=32):
        self.device = device
        self.batch_size = batch_size
        print(f"Using device: {device}")
        
        # Initialize entailment model
        self.entail_model = (AutoModelForSequenceClassification
                           .from_pretrained('roberta-large-mnli')
                           .to(device))
        self.entail_tokenizer = AutoTokenizer.from_pretrained('roberta-large-mnli')
        self.entail_model.eval()

    def build_taxonomy(self, categories: List[List[str]]) -> TaxonomyNode:
        """Build taxonomy tree from list of category paths"""
        root = TaxonomyNode("Root")
        
        # Skip excluded categories and build proper hierarchy
        excluded_categories = {'Men', 'Women'}
        
        for path in categories:
            # Filter out excluded categories
            filtered_path = [cat for cat in path if cat not in excluded_categories]
            if not filtered_path:
                continue
                
            current = root
            # Build hierarchy with meaningful categories only
            for category in filtered_path[:3]:  # Limit to 3 levels
                if category not in current.children:
                    node = TaxonomyNode(category)
                    node.parent = current
                    current.children[category] = node
                current = current.children[category]
        
        return root


    @torch.no_grad()
    def get_entailment_score(self, doc_text: str, class_name: str) -> float:
        # Add temperature scaling for more calibrated probabilities
        temperature = 0.8
        hypothesis = f"This example belongs to the category of {class_name}"
        
        inputs = self.entail_tokenizer(
            doc_text,
            hypothesis, 
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt'
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.entail_model(**inputs)
            logits = outputs.logits / temperature
            probs = torch.softmax(logits, dim=1)
        return probs[0][2].cpu().item()

    def get_path_similarity(self, doc_text: str, path: List[str]) -> float:
        # Add hierarchical weighting
        sim = 1.0
        for level, node in enumerate(path):
            entail_score = self.get_entailment_score(doc_text, node)
            if level == 0:
                sim = entail_score 
            else:
                # Decrease weight for deeper levels
                sim = sim * entail_score * (1.0 / (level + 1))
        return sim

    def get_candidates(self, doc_text: str, taxonomy: TaxonomyNode, level: int = 0) -> List[tuple]:
        """Get candidate classes through top-down search"""
        candidates = []
        queue = [(taxonomy, [], 0)]  # (node, path, level)
        
        while queue:
            node, path, level = queue.pop(0)
            if level >= 3:  # Limit to 3 levels
                continue
                
            # Get similarities for all children with proper path consideration
            child_scores = []
            for child_name, child_node in node.children.items():
                child_path = path + [child_name]
                sim = self.get_path_similarity(doc_text, child_path)
                child_scores.append((child_name, child_path, sim))
            
            # Select top (level+3) children
            child_scores.sort(key=lambda x: x[2], reverse=True)
            top_k = level + 3
            top_children = child_scores[:top_k]
            
            for child_name, child_path, sim in top_children:
                candidates.append((child_name, sim))
                queue.append((node.children[child_name], child_path, level + 1))
        
        return candidates


class TaxonomyEnricher:

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu', batch_size=64):
        self.device = device
        self.batch_size = batch_size
        self.bert_model = AutoModel.from_pretrained('bert-base-uncased').to(device)
        self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2').to(device)
        self.bert_model.eval()

        self.CATEGORY_TERMS = refined_categories = {
            'Shoes': {
                # Original terms
                'sandal', 'sneaker', 'boot', 'heel', 'footwear', 'comfort', 'sole', 'cushion',
                'athletic', 'running', 'walking', 'tennis', 'loafer', 'flat', 'slipper', 'crocs', 'croc',
                'insole', 'arch', 'lace', 'velcro', 'breathable', 'lightweight', 'traction',
                'leather', 'canvas', 'mesh', 'rubber', 'suede', 'orthopedic', 'slip-resistant',
                # Added terms
                'moccasin', 'pump', 'oxford', 'clog', 'wedge', 'platform', 'espadrille', 'mule',
                'stiletto', 'ankle-boot', 'knee-high', 'thigh-high', 'peep-toe', 'mary-jane',
                'derby', 'ballet-flat', 'flip-flop', 'gladiator', 'bootie', 'kitten-heel'
            },
            'Clothing': {
                # Original terms
                'dress', 'shirt', 'sweater', 'cardigan', 'garment', 'fabric', 'wear',
                'blouse', 'skirt', 'pants', 'jeans', 'jacket', 'coat', 'hoodie',
                'cotton', 'polyester', 'silk', 'wool', 'linen', 'denim', 'sleeve',
                'collar', 'zipper', 'button', 'seam', 'hem', 'neckline', 'cuff',
                'stretchy', 'breathable', 'washable', 'tailored', 'fitted', 'bra',
                # Added terms
                'jumpsuit', 'romper', 'blazer', 'vest', 'tunic', 'legging', 'shorts',
                'capri', 'palazzo', 'bodysuit', 'turtleneck', 'poncho', 'kimono', 
                'wrap-dress', 'maxi', 'midi', 'mini', 'pencil-skirt', 'a-line',
                'pleated', 'crop-top', 'tank-top', 'halter', 'peplum'
            },
            'Accessories': {
                # Original terms
                'watch', 'jewelry', 'belt', 'accessory', 'charm', 'decoration',
                'necklace', 'bracelet', 'earring', 'ring', 'pendant', 'brooch',
                'scarf', 'handbag', 'purse', 'wallet', 'sunglasses', 'hat',
                'headband', 'clip', 'pin', 'buckle', 'chain', 'leather',
                'metal', 'gold', 'silver', 'gemstone', 'crystal',
                # Added terms
                'clutch', 'tote', 'satchel', 'backpack', 'crossbody', 'wristlet',
                'beret', 'beanie', 'fedora', 'visor', 'bandana', 'choker',
                'anklet', 'cufflink', 'tie-clip', 'hair-tie', 'scrunchie',
                'barrette', 'gloves', 'mittens', 'umbrella', 'stole'
            },
            'Athletic Wear': {
                'sportswear', 'gym', 'workout', 'fitness', 'yoga', 'compression',
                'sweatpant', 'sweatshirt', 'track-suit', 'jersey', 'dri-fit',
                'moisture-wicking', 'performance', 'athletic-short', 'tank',
                'sports-bra', 'windbreaker', 'warm-up', 'cycling-short',
                'tennis-skirt', 'swim', 'swimsuit', 'rash-guard', 'wetsuit'
            },
            'Formal Wear': {
                'gown', 'tuxedo', 'suit', 'cocktail-dress', 'evening-wear',
                'black-tie', 'formal', 'sequin', 'satin', 'velvet', 'chiffon',
                'bow-tie', 'cummerbund', 'shawl', 'opera-glove',
                'ball-gown', 'mermaid-dress', 'trumpet-dress', 'tailcoat'
            }
        }
    
        self.GENERIC_TERMS = {
                'item', 'brand', 'thing', 'everything', 'product', 'store',
                'buying', 'purchase', 'wear', 'selling', 'feature', 'things',
                'colors', 'customers', 'products', 'next', 'too', 'run',
                'money', 'inspection', 'shop', 'pizza', 'marijuana',
                'good', 'great', 'nice', 'best', 'better', 'worse',
                'amazing', 'awesome', 'terrible', 'bad', 'okay',
                'shipping', 'delivery', 'order', 'return', 'exchange',
                'size', 'color', 'price', 'cost', 'cheap', 'expensive',
                'quality', 'value', 'worth', 'recommend', 'review',
                'star', 'rating', 'feedback', 'experience', 'service',
                'customer', 'seller', 'store', 'shop', 'website',
                'online', 'retail', 'purchase', 'bought', 'received',
            }
        
    def calculate_term_scores(self, terms: List[str], class_docs: List[str], 
                         sibling_docs: List[List[str]], class_name: str,
                         alpha: float = 0.5) -> Dict[str, float]:
        """Calculate term scores with improved filtering"""
        # Process unique terms with less strict filtering
        term_dict = {}
        
        for term in set(terms):
            # Basic filtering conditions
            if len(term) < 2 or not term.isalnum():
                continue
                
            # Get base form
            base_term = term.rstrip('s').lower()
            if base_term in term_dict:
                continue
                
            # Calculate popularity
            popularity = np.log1p(sum(1 for doc in class_docs 
                                    if base_term in doc.lower()))
            
            # Calculate distinctiveness
            tokenized_docs = [word_tokenize(doc.lower()) for doc in class_docs]
            if not tokenized_docs:
                tokenized_docs = [['']]
                
            bm25 = BM25Okapi(tokenized_docs)
            class_score = np.mean(bm25.get_scores(word_tokenize(base_term)))
            
            # Calculate sibling scores
            sibling_scores = []
            for sib_docs in sibling_docs:
                if sib_docs:
                    tokenized_sib = [word_tokenize(doc.lower()) for doc in sib_docs]
                    bm25_sib = BM25Okapi(tokenized_sib)
                    sib_score = np.mean(bm25_sib.get_scores(word_tokenize(base_term)))
                    sibling_scores.append(sib_score)
                    
            # Calculate distinctiveness with sibling penalty
            if sibling_scores:
                distinctiveness = np.exp(class_score) / (1 + sum(np.exp(score) 
                                                       for score in sibling_scores))
            else:
                distinctiveness = np.exp(class_score)
                
            # Calculate final score
            affinity = float((popularity ** alpha) * (distinctiveness ** (1-alpha)))
            term_dict[base_term] = affinity
            
        return term_dict

    def calculate_bm25_score(self, term: str, documents: List[str]) -> float:
        """Calculate BM25 relevance score between a term and a set of documents"""
    # Tokenize documents
        tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
        if not tokenized_docs:
            tokenized_docs = [['']]
            
        # Initialize BM25 model
        bm25 = BM25Okapi(tokenized_docs)
        
        # Calculate score for the term
        tokenized_term = word_tokenize(term.lower())
        scores = bm25.get_scores(tokenized_term)
        
        # Return average score
        return float(np.mean(scores))

    def get_enriched_taxonomy(self, class_docs: Dict[str, List[str]], 
                            taxonomy: Dict[str, List[str]]) -> Dict[str, Dict]:
        """Get enriched taxonomy for all classes"""
        enriched_taxonomy = {}
        
        for class_name, docs in class_docs.items():
            # Get siblings from taxonomy
            siblings = taxonomy.get(class_name, [])
            sibling_docs = [[class_docs.get(sib, [])] for sib in siblings]
            
            # Get terms from documents
            terms = set()
            for doc in docs:
                terms.update(word_tokenize(doc.lower()))
            
            # Calculate scores and get enriched terms
            result = self.calculate_term_scores(
                list(terms),
                docs,
                sibling_docs,
                class_name
            )
            
            enriched_taxonomy.update(result)
        
        return enriched_taxonomy

    def calculate_semantic_similarity(self, term: str, class_name: str) -> float:
        """Calculate semantic similarity between term and class name using BERT embeddings"""
        with torch.no_grad():
            # Get term embedding
            term_inputs = self.bert_tokenizer(
                f"[CLS] {term} [SEP]",
                return_tensors='pt',
                truncation=True,
                max_length=32,
                padding='max_length'
            ).to(self.device)
            
            # Get class name embedding
            class_inputs = self.bert_tokenizer(
                f"[CLS] {class_name} [SEP]",
                return_tensors='pt',
                truncation=True,
                max_length=32,
                padding='max_length'
            ).to(self.device)
            
            # Calculate embeddings
            term_embedding = self.bert_model(**term_inputs).last_hidden_state.mean(dim=1)
            class_embedding = self.bert_model(**class_inputs).last_hidden_state.mean(dim=1)
            
            # Calculate cosine similarity
            similarity = torch.cosine_similarity(term_embedding, class_embedding, dim=1)
            
            return similarity.item()

# Load and preprocess documents
documents = load_amazon_data("AMAZON_REVIEWS.json")
# documents = documents[:5]

print(f"Loaded {len(documents)} documents")

# Initialize models
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

core_annotator = CoreClassAnnotator(device=device)
taxonomy_enricher = TaxonomyEnricher(device=device)

# Process documents
results = []
for doc in tqdm(documents[:5]):  # Process first 5 docs for testing
    # Get document text and category
    doc_text = doc['text']
    category = doc['category']
    
    # Step 3.1: Get core classes
    taxonomy = core_annotator.build_taxonomy([category])  # Limit to 3 levels
    cores = core_annotator.get_candidates(doc_text, taxonomy)
    results.append(cores)

Loaded 640 documents

Using device: cuda
Using device: cuda


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:45<00:00,  9.12s/it]


In [123]:
def print_taxonomy(node: TaxonomyNode, level: int = 0):
    """Print taxonomy in a tree-like structure"""
    indent = "  " * level
    print(f"{indent}{node.name}")
    for child_node in node.children.values():
        print_taxonomy(child_node, level + 1)

# Usage
taxonomy = core_annotator.build_taxonomy([category])
print("\nTaxonomy Structure:")
print_taxonomy(taxonomy)

# Print additional information
print("\nTaxonomy Statistics:")
print(f"Number of children: {len(taxonomy.children)}")



Taxonomy Structure:
Root
  Clothing, Shoes & Jewelry
    Shoe, Jewelry & Watch Accessories
      Shoe Care & Accessories

Taxonomy Statistics:
Number of children: 1


In [124]:
# data[1]

In [125]:
documents[1]

{'text': 'Playtex Women Hour Easy On Easy Off Front Back Close Post Surgery Bra USC Product Description Introducing Playtex hour front back close wireless bra designed dressing ease versatility Our newest wire free bra designed soft seamless cotton comfort blend smooth look clothing incredible comfort skin Designed offer convenience front close bra back close bra easy fastening removal Wide comfort strap soft skin Product Description Introducing Playtex hour front back close wireless bra designed dressing ease versatility Our newest wire free bra designed soft seamless cotton comfort blend smooth look clothing incredible comfort skin Designed offer convenience front close bra back close bra easy fastening removal Wide comfort strap soft skin This close perfect bra Having front close hook AND adjustable size hook back genius This great mobility health issue require front closure typically lack adjustability band size Women plastic mannequin We vary size day day due cycle eat etc What I 

In [127]:
import json
from tqdm import tqdm
from nltk.tokenize import word_tokenize

# Load and preprocess data
print("Loading data...")
documents = load_amazon_data('AMAZON_REVIEWS.json')

# Test with first 10 documents
# test_docs = documents[:10]
# print(f"\nTesting with {len(test_docs)} documents")

# Initialize models
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

core_annotator = CoreClassAnnotator(device=device)
enricher = TaxonomyEnricher(device=device)

# Test 3.1: Core Class Annotation
print("\nTesting Core Class Annotation...")
initial_results = {}

for doc in tqdm(documents):
    try:
        # Get core classes using preprocessed text
        taxonomy = core_annotator.build_taxonomy([doc['category']])
        candidates = core_annotator.get_candidates(
            doc_text=doc['text'],
            taxonomy=taxonomy
        )
        initial_results[doc['category'][0]] = candidates
    except Exception as e:
        print(f"Error processing document with category {doc['category']}: {str(e)}")

# Print 3.1 results
print("\nCore Class Annotation Results:")
for category, candidates in initial_results.items():
    print(f"\nCategory: {category}")
    print("Core Class Candidates:")
    for class_name, score in candidates:
        print(f"{class_name}: {score:.4f}")


Loading data...

Using device: cuda
Using device: cuda


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Testing Core Class Annotation...


100%|████████████████████████████████████████████████████████████████████████████████| 640/640 [07:59<00:00,  1.34it/s]


Core Class Annotation Results:

Category: Clothing, Shoes & Jewelry
Core Class Candidates:
Clothing, Shoes & Jewelry: 0.0851
Uniforms, Work & Safety: 0.0039
Clothing: 0.0008

Category: Sports & Outdoors
Core Class Candidates:
Sports & Outdoors: 0.1578
Fan Shop: 0.0165
Clothing: 0.0037

Category: Home & Kitchen
Core Class Candidates:
Home & Kitchen: 0.1085
Storage & Organization: 0.0126
Clothing & Closet Storage: 0.0010

Category: Handmade Products
Core Class Candidates:
Handmade Products: 0.1266
Clothing, Shoes & Accessories: 0.0008
Luggage & Travel Gear: 0.0000





In [129]:
initial_results

{'Clothing, Shoes & Jewelry': [('Clothing, Shoes & Jewelry',
   0.08511808514595032),
  ('Uniforms, Work & Safety', 0.003946674443238152),
  ('Clothing', 0.0007632845066257986)],
 'Sports & Outdoors': [('Sports & Outdoors', 0.15775834023952484),
  ('Fan Shop', 0.016537707573447413),
  ('Clothing', 0.003713101762005381)],
 'Home & Kitchen': [('Home & Kitchen', 0.10848480463027954),
  ('Storage & Organization', 0.012555897244653558),
  ('Clothing & Closet Storage', 0.0010048035432630867)],
 'Handmade Products': [('Handmade Products', 0.1266063004732132),
  ('Clothing, Shoes & Accessories', 0.0008271060100626879),
  ('Luggage & Travel Gear', 6.869386163354674e-06)]}

In [130]:
# Test 3.2: Taxonomy Enrichment
print("\nTesting Taxonomy Enrichment...")
enriched_terms = {}

for doc in tqdm(documents):
    try:
        doc_text = doc['text']
        
        # Get core classes
        taxonomy = core_annotator.build_taxonomy([doc['category']])
        candidates = core_annotator.get_candidates(doc_text, taxonomy)
        
        # Create class documents dictionary
        class_docs = {
            class_name: [doc_text] 
            for class_name in doc['category']
        }
        
        # Get sibling documents with proper hierarchy
        siblings = []
        for i in range(1, len(doc['category'])):
            current_siblings = [c for c in doc['category'] if c != doc['category'][i]]
            siblings.extend(current_siblings)
        
        sibling_docs = [[doc_text] for _ in siblings]
        
        # Calculate term scores with improved filtering
        terms = word_tokenize(doc_text.lower())
        
        # Filter out short and non-alphanumeric terms
        filtered_terms = [term for term in terms 
                        if len(term) >= 3 and term.isalnum()]
        
        # Calculate term scores for each class in the document's category path
        for class_name in doc['category']:
            result = enricher.calculate_term_scores(
                terms=filtered_terms,
                class_docs=[doc_text],
                sibling_docs=sibling_docs,
                class_name=class_name
            )
            
            if class_name not in enriched_terms:
                enriched_terms[class_name] = {}
            
            # Update scores using maximum score for each term
            for term, score in result.items():
                if term not in enriched_terms[class_name] or score > enriched_terms[class_name][term]:
                    enriched_terms[class_name][term] = score

    except Exception as e:
        print(f"Error processing document with category {doc['category']}: {str(e)}")

# Print 3.2 results 
print("\nTaxonomy Enrichment Results:")
for category, result in enriched_terms.items():
    print(f"\nCategory: {category}")
    # The result is already the dictionary of terms and scores
    for term, score in result.items():
        print(f"  {term}: {score:.4f}")



Testing Taxonomy Enrichment...


100%|██████████████████████████████████████████████████████████████████████████████| 640/640 [4:41:35<00:00, 26.40s/it]



Taxonomy Enrichment Results:

Category: Clothing, Shoes & Jewelry
  beige: 0.3611
  hour: 0.5471
  casual: 0.5117
  loosened: 0.2592
  returning: 0.5471
  looked: 0.5471
  blue: 0.5471
  expected: 0.7257
  buckle: 0.3611
  tie: 0.3611
  silver: 0.7257
  brand: 0.7257
  giving: 0.5471
  getting: 0.5471
  foot: 0.7257
  olive: 0.3611
  adjust: 0.3723
  toe: 0.5471
  trip: 0.3611
  caused: 0.3611
  swell: 0.3611
  fresh: 0.3611
  white: 0.5286
  instead: 0.5471
  quality: 0.7257
  reach: 0.3611
  bottom: 0.5286
  fit: 0.6326
  what: 0.5471
  navy: 0.3611
  stay: 0.5471
  feel: 0.7257
  product: 0.6842
  item: 0.6842
  prefect: 0.3611
  spend: 0.5471
  walking: 0.7257
  elastic: 0.5471
  home: 0.7257
  inky: 0.2592
  cute: 0.6842
  wet: 0.5471
  normal: 0.7257
  size: 0.7257
  wrap: 0.3611
  wore: 0.3611
  woman: 0.6842
  look: 0.6842
  story: 0.3611
  pretty: 0.5471
  rekayla: 0.2551
  issue: 0.7257
  super: 0.7257
  second: 0.5471
  kept: 0.3611
  red: 0.5471
  for: 0.5471
  completely:

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [299]:
class CoreClassRefiner:
    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        print(f"Using device: {self.device}")
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2').to(device)

    @torch.no_grad()
    def get_document_embeddings(self, doc_texts: List[str]) -> torch.Tensor:
        """Get document embeddings in batches"""
        return self.sentence_transformer.encode(
            doc_texts,
            convert_to_tensor=True,
            device=self.device,
            batch_size=32
        )

    def get_class_embeddings(self, class_docs: Dict[str, List[str]], 
                           enriched_terms: Dict[str, List[str]]) -> Dict[str, torch.Tensor]:
        """Calculate class embeddings from confident documents"""
        class_embeddings = {}
        
        for class_name, docs in class_docs.items():
            if class_name not in enriched_terms:
                continue
                
            # Use enriched terms to identify confident documents
            confident_docs = [
                doc for doc in docs 
                if any(term in doc.lower() for term in enriched_terms[class_name])
            ]
            
            if confident_docs:
                embeddings = self.sentence_transformer.encode(
                    confident_docs,
                    convert_to_tensor=True,
                    device=self.device,
                    batch_size=32
                )
                class_embeddings[class_name] = torch.mean(embeddings, dim=0)
        
        return class_embeddings

    def refine_core_classes(self, documents: List[Dict],
                          initial_cores: List[List[tuple]], 
                          enriched_terms: Dict[str, List[str]],
                          taxonomy,
                          confidence_threshold: float = 0.7) -> List[List[str]]:
        """Refine core classes using semantic similarity"""
        # Get document embeddings
        doc_texts = [doc['text'] for doc in documents]
        doc_embeddings = self.get_document_embeddings(doc_texts)
        
        # Get class documents
        class_docs = defaultdict(list)
        for doc, cores in zip(documents, initial_cores):
            for class_name, _ in cores:
                class_docs[class_name].append(doc['text'])
                # Add documents to parent classes
                for parent in taxonomy.get_parents(class_name):
                    class_docs[parent].append(doc['text'])
        
        # Get class embeddings using enriched terms
        class_embeddings = self.get_class_embeddings(class_docs, enriched_terms)
        
        # Refine core classes
        refined_cores = []
        confidence_scores = []
        
        # Process each document
        for doc_idx, doc_emb in enumerate(doc_embeddings):
            similarities = {}
            for class_name, class_emb in class_embeddings.items():
                sim = torch.cosine_similarity(
                    doc_emb.unsqueeze(0),
                    class_emb.unsqueeze(0)
                ).item()
                
                # Apply hierarchical weighting
                level = len(taxonomy.get_parents(class_name))
                sim *= (1.0 + 0.1 * level)
                similarities[class_name] = sim
            
            # Find similarity gap
            sorted_sims = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
            gaps = [sorted_sims[i][1] - sorted_sims[i+1][1] for i in range(len(sorted_sims)-1)]
            
            if gaps:
                max_gap_idx = gaps.index(max(gaps))
                core_classes = [c[0] for c in sorted_sims[:max_gap_idx+1]]
                confidence = gaps[max_gap_idx]
                
                # Validate hierarchy
                valid_cores = []
                for class_name in core_classes:
                    if all(parent in valid_cores for parent in taxonomy.get_parents(class_name)):
                        valid_cores.append(class_name)
                
                refined_cores.append(valid_cores)
                confidence_scores.append(confidence)
            else:
                refined_cores.append([])
                confidence_scores.append(0.0)
        
        # Filter by confidence threshold
        threshold = np.percentile(confidence_scores, (1-confidence_threshold)*100)
        return [
            cores for cores, conf in zip(refined_cores, confidence_scores)
            if conf >= threshold
        ]


In [294]:
refined_results

{0: ['Shoes', 'Sandals', 'Flats'],
 1: ['Lingerie, Sleep & Lounge',
  'Lingerie',
  'Bras',
  'Everyday Bras',
  'Clothing',
  'Uniforms, Work & Safety'],
 2: ['Sweaters', 'Cardigans', 'Clothing', 'Uniforms, Work & Safety'],
 3: ['Uniforms, Work & Safety', 'Clothing'],
 4: ['Shoe Decoration Charms',
  'Shoe Care & Accessories',
  'Shoe, Jewelry & Watch Accessories',
  'Uniforms, Work & Safety']}

In [403]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
class TextClassifier:
    def __init__(self, embedding_dim=384, hidden_dim=256, num_layers=2, device='cuda'):
        super().__init__()
        self.device = device
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2').to(device)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.class_names = None

        # super(TextClassifier, self).__init__()  # Proper nn.Module initialization
        # self.device = device
        # self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2').to(device)
        # self.hidden_dim = hidden_dim
        
        # self.class_names = None

        # BiLSTM with dropout
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=0.2
        ).to(device)

        # Multi-head attention
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,
            num_heads=4,
            dropout=0.2,
            batch_first=True
        ).to(device)

        self.classifier = None  # Will be initialized in train_classifier
        
    def forward(self, x):
        # Add sequence dimension if needed
        if len(x.shape) == 2:
            x = x.unsqueeze(1)
            
        # BiLSTM processing
        rnn_out, _ = self.rnn(x)
        
        # Self attention with skip connection
        attn_out, _ = self.attention(rnn_out, rnn_out, rnn_out)
        context = attn_out + rnn_out
        
        # Global max pooling
        pooled = torch.max(context, dim=1)[0]
        
        # Classification
        if self.classifier is not None:
            return self.classifier(pooled)
        return pooled


    def get_document_embeddings(self, texts: List[str]) -> torch.Tensor:
        return self.sentence_transformer.encode(
            texts,
            convert_to_tensor=True,
            device=self.device,
            batch_size=32,
            show_progress_bar=False,
            normalize_embeddings=True  # L2 normalize embeddings
        )


    def train_classifier(self, documents, refined_classes, enriched_terms, taxonomy):
        # Initialize classifier with hierarchical structure
        self.class_names = set()
        for classes in refined_classes.values():
            self.class_names.update(classes)
        self.class_names = list(self.class_names)
        
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.LayerNorm(self.hidden_dim),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden_dim, len(self.class_names))
        ).to(self.device)

        # Prepare training data
        doc_texts = []
        doc_labels = []
        
        # Process original documents
        for i, doc in enumerate(documents):
            doc_texts.append(doc['text'])
            if i in refined_classes:
                doc_labels.append(refined_classes[i])
            else:
                doc_labels.append([])

        # Generate synthetic data
        synthetic_data = self.generate_synthetic_data(taxonomy, enriched_terms)
        for sample in synthetic_data:
            doc_texts.append(sample['text'])
            doc_labels.append(sample['category'])

        # Get embeddings and train
        doc_embeddings = self.get_document_embeddings(doc_texts)
        self.train_multi_label(doc_embeddings, doc_labels)

    def generate_synthetic_data(self, taxonomy: Dict[str, List[str]], 
                          enriched_terms: Dict[str, Dict],
                          num_samples: int = 100) -> List[Dict]:
        synthetic_data = []
        for _ in range(num_samples):
            # Sample complete path from taxonomy
            path = []
            current = random.choice(list(taxonomy.keys()))
            path.append(current)
            
            while current in taxonomy and len(path) < 3:
                if not taxonomy[current]:
                    break
                current = random.choice(taxonomy[current])
                path.append(current)
                
            # Generate document using enriched terms
            doc_text = []
            for level, class_name in enumerate(path):
                if class_name in enriched_terms:
                    terms = list(enriched_terms[class_name].keys())
                    weights = list(enriched_terms[class_name].values())
                    # Sample more terms for higher levels
                    num_terms = min(5 * (3-level), len(terms))
                    if terms:
                        selected_terms = random.choices(terms, weights=weights, k=num_terms)
                        doc_text.extend(selected_terms)
            
            synthetic_data.append({
                'text': ' '.join(doc_text),
                'category': path
            })
        return synthetic_data

    def train_multi_label(self, doc_embeddings: torch.Tensor, doc_labels: List[List[str]], num_epochs: int =1000):
        # Add sequence dimension if needed
        if len(doc_embeddings.shape) == 2:
            doc_embeddings = doc_embeddings.unsqueeze(1)
            
        # Initialize classifier
        # self.classifier = nn.Sequential(
        #     nn.Linear(self.hidden_dim * 2, self.hidden_dim),
        #     nn.LayerNorm(self.hidden_dim),
        #     nn.GELU(),
        #     nn.Dropout(0.3),
        #     nn.Linear(self.hidden_dim, len(self.class_names))
        # ).to(self.device)

        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.LayerNorm(self.hidden_dim),
            nn.GELU(),
            nn.Dropout(0.3),  # Increase dropout
            nn.Linear(self.hidden_dim, self.hidden_dim // 2),
            nn.LayerNorm(self.hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(self.hidden_dim // 2, len(self.class_names))
        ).to(self.device)

        # Convert labels to multi-hot encoding
        label_matrix = torch.zeros(len(doc_labels), len(self.class_names)).to(self.device)
        for i, labels in enumerate(doc_labels):
            top_labels = labels[:7] if len(labels) > 7 else labels
            for label in top_labels:
                if label in self.class_names:
                    label_idx = self.class_names.index(label)
                    label_matrix[i, label_idx] = 1

        # Calculate class weights
        # class_weights = torch.FloatTensor([
        #     (len(doc_labels) - label_matrix[:,i].sum())/max(1, label_matrix[:,i].sum()) 
        #     for i in range(len(self.class_names))
        # ]).to(self.device)
        class_weights = torch.FloatTensor([
            max(1.0, (len(doc_labels) - label_matrix[:,i].sum()) / max(1, label_matrix[:,i].sum()))
            for i in range(len(self.class_names))
        ]).to(self.device)

        # Initialize optimizer with classifier parameters
        # optimizer = torch.optim.AdamW(self.classifier.parameters(), lr=1e-3)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)
        optimizer = torch.optim.AdamW(
            self.classifier.parameters(),
            lr=5e-3,  # Lower initial learning rate
            weight_decay=0.01,  # Add weight decay
            betas=(0.9, 0.999)  # Adjust momentum parameters
        )
        scheduler = ReduceLROnPlateau(
            optimizer,
            mode='min',
            patience=5,  # Increase patience
            factor=0.5,  # Less aggressive reduction
            min_lr=1e-3  # Add minimum learning rate
        )
        # criterion = nn.BCEWithLogitsLoss(reduction='mean', pos_weight=class_weights)
        # Add label smoothing to loss
        criterion = nn.BCEWithLogitsLoss(
            reduction='mean',
            pos_weight=class_weights
        )

        for epoch in range(num_epochs):
            optimizer.zero_grad()
            
            # Forward pass
            rnn_out, _ = self.rnn(doc_embeddings)
            attn_out, _ = self.attention(rnn_out, rnn_out, rnn_out)
            context = attn_out + rnn_out  # Residual connection
            pooled = torch.max(context, dim=1)[0]
            outputs = self.classifier(pooled)
            
            loss = criterion(outputs, label_matrix)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.classifier.parameters(), max_norm=1.0)
            
            optimizer.step()
            scheduler.step(loss)
        # for epoch in range(num_epochs):
        #     self.classifier.train()
        #     optimizer.zero_grad()
            
        #     # Forward pass
        #     rnn_out, _ = self.rnn(doc_embeddings)
        #     attn_out, _ = self.attention(rnn_out, rnn_out, rnn_out)
        #     context = attn_out + rnn_out
        #     pooled = torch.max(context, dim=1)[0]
        #     outputs = self.classifier(pooled)
            
        #     # Calculate loss with L2 regularization
        #     loss = criterion(outputs, label_matrix)
        #     l2_lambda = 0.01
        #     l2_reg = torch.tensor(0.).to(self.device)
        #     for param in self.classifier.parameters():
        #         l2_reg += torch.norm(param)
        #     loss += l2_lambda * l2_reg
            
        #     # Backward pass
        #     loss.backward()
        #     torch.nn.utils.clip_grad_norm_(self.classifier.parameters(), max_norm=0.5)
        #     optimizer.step()
        #     scheduler.step(loss)

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    def predict(self, text: str) -> List[str]:
        """Predict classes with hierarchical constraints"""
        if self.classifier is None:
            raise AttributeError("Model not trained. Call train_classifier first.")
            
        # Get text embedding
        embedding = self.get_document_embeddings([text])
        
        # Add sequence dimension if needed
        if len(embedding.shape) == 2:
            embedding = embedding.unsqueeze(1)
        
        # Get predictions
        with torch.no_grad():
            # Forward pass through RNN and attention
            rnn_out, _ = self.rnn(embedding)
            attn_out, _ = self.attention(rnn_out, rnn_out, rnn_out)
            context = attn_out + rnn_out
            pooled = torch.max(context, dim=1)[0]
            
            # Reshape pooled tensor to match classifier input
            pooled = pooled.view(1, -1)  # Ensure shape is [1, hidden_dim * 2]
            
            outputs = self.classifier(pooled)
            scores = torch.sigmoid(outputs[0])
            
            # Get top k predictions maintaining hierarchy
            top_scores, top_indices = torch.topk(scores, k=min(7, len(self.class_names)))
            predicted_classes = []
            
            for score, idx in zip(top_scores, top_indices):
                if score > 0.5:
                    class_name = self.class_names[idx.item()]
                    parent = '/'.join(class_name.split('/')[:-1])
                    if not parent or parent in predicted_classes:
                        predicted_classes.append(class_name)
                        
            return predicted_classes





In [406]:
print("\nTesting Text Classifier Training...")
classifier = TextClassifier(device=device)  # Pass device parameter

# Build taxonomy from preprocessed documents
taxonomy = {}
for doc in documents:
    if doc['category'][0] not in taxonomy:
        taxonomy[doc['category'][0]] = []
    if len(doc['category']) > 1:
        taxonomy[doc['category'][0]].extend(doc['category'])
    taxonomy[doc['category'][0]] = list(set(taxonomy[doc['category'][0]]))

# Train classifier
classifier.train_classifier(
    documents=documents,
    refined_classes=refined_results,
    enriched_terms=enriched_terms,
    taxonomy=taxonomy
)

def add_predicted_labels(documents: List[Dict], classifier: TextClassifier) -> List[Dict]:
    """Add predicted labels to documents"""
    enhanced_documents = []
    
    for doc in documents:
        # Get predictions for document
        predicted_classes = classifier.predict(doc['text'])
        
        # Create enhanced document with predictions
        enhanced_doc = doc.copy()
        enhanced_doc['predicted_category'] = predicted_classes
        enhanced_documents.append(enhanced_doc)
        
    return enhanced_documents

# Usage
enhanced_docs = add_predicted_labels(documents, classifier)

# Print sample predictions
for doc in enhanced_docs[:5]:
    print(f"\nDocument Categories:")
    print("Original:", doc['category'])
    print("Predicted:", doc['predicted_category'])


Testing Text Classifier Training...
Epoch [10/1000], Loss: 1.3659
Epoch [20/1000], Loss: 1.3645
Epoch [30/1000], Loss: 1.3667
Epoch [40/1000], Loss: 1.3651
Epoch [50/1000], Loss: 1.3615
Epoch [60/1000], Loss: 1.3660
Epoch [70/1000], Loss: 1.3634
Epoch [80/1000], Loss: 1.3615
Epoch [90/1000], Loss: 1.3601
Epoch [100/1000], Loss: 1.3548
Epoch [110/1000], Loss: 1.3419
Epoch [120/1000], Loss: 1.2933
Epoch [130/1000], Loss: 1.1474
Epoch [140/1000], Loss: 1.0152
Epoch [150/1000], Loss: 0.8962
Epoch [160/1000], Loss: 0.7838
Epoch [170/1000], Loss: 0.6943
Epoch [180/1000], Loss: 0.6163
Epoch [190/1000], Loss: 0.5584
Epoch [200/1000], Loss: 0.5165
Epoch [210/1000], Loss: 0.4733
Epoch [220/1000], Loss: 0.4409
Epoch [230/1000], Loss: 0.4048
Epoch [240/1000], Loss: 0.3844
Epoch [250/1000], Loss: 0.3611
Epoch [260/1000], Loss: 0.3444
Epoch [270/1000], Loss: 0.3221
Epoch [280/1000], Loss: 0.2990
Epoch [290/1000], Loss: 0.2872
Epoch [300/1000], Loss: 0.2680
Epoch [310/1000], Loss: 0.2539
Epoch [320/

In [380]:
enhanced_docs[2]

{'text': 'Zeagoo Women Open Front Cardigan Sleeve Draped Ruffles Soft Knit Sweaters From brand Lightweight Cardigan Previous page Best choice Black cardigan woman Most Wished Navy cardigan woman Wardrobe Essentials Grey cardigan woman New Color You like Next page Product Description NOTE Lightweight skin friendly fabric little thinner shirt cause wrinkle When receive wash cold hung hanger dry wrinkle decrease Just right thickness Summer evening Spring Fall The light color like white may little see Note Do remind check size chart ordering This cardigan normal size like loose sense ordered size sleeve slim fit Tip Color may vary slightly image Colors may vary different viewing device For color product please refer actual received DESIGN OCCASION Great jean legging little dress completes fit wedding Perfect sweater event I bought chamber music performance also wear teach church It comfortable soft flattering I bought color Gray brown They fantastic material soft light excellent office I w

In [381]:
documents[1]

{'text': 'Playtex Women Hour Easy On Easy Off Front Back Close Post Surgery Bra USC Product Description Introducing Playtex hour front back close wireless bra designed dressing ease versatility Our newest wire free bra designed soft seamless cotton comfort blend smooth look clothing incredible comfort skin Designed offer convenience front close bra back close bra easy fastening removal Wide comfort strap soft skin Product Description Introducing Playtex hour front back close wireless bra designed dressing ease versatility Our newest wire free bra designed soft seamless cotton comfort blend smooth look clothing incredible comfort skin Designed offer convenience front close bra back close bra easy fastening removal Wide comfort strap soft skin This close perfect bra Having front close hook AND adjustable size hook back genius This great mobility health issue require front closure typically lack adjustability band size Women plastic mannequin We vary size day day due cycle eat etc What I 

In [382]:
with open("AMAZON_REVIEWS.json", "r", encoding='utf-8') as f:
        data = json.load(f)

In [383]:
import pandas as pd
df = pd.DataFrame.from_records(data)

In [384]:
df.head()

Unnamed: 0,type,locale,asin,title,stars,ratings,category,attrs,bullets,description,info,reviews,price,formats,template,subtitle,author,desc,attr,review
0,product,us,B07C1NJF6T,Rekayla Open Toe Tie Up Ankle Wrap Flat Sandal...,4.2 out of 5 stars,"5,656 ratings","[Clothing, Shoes & Jewelry, Women, Shoes, Sand...",{},[],From the brand Previous page REKAYLA is a fres...,{},"[{'stars': '4.0 out of 5 stars', 'title': 'Fin...",,{},shoes,,,,,
1,product,us,B07K12NL97,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",4.0 out of 5 stars,875 ratings,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",{},"[85% Cotton, 15% Spandex, Imported, Hook and E...",Product Description Introducing Playtex 18 hou...,{},"[{'stars': '3.0 out of 5 stars', 'title': 'Soo...",,{},apparel,,,,,
2,product,us,B07F86B3R2,Zeagoo Women's Open Front Cardigan 3/4 Sleeve ...,4.3 out of 5 stars,"9,489 ratings","[Clothing, Shoes & Jewelry, Women, Clothing, S...",{},[Product Dimensions : 13.39 x 11.02 x 0.79 i...,From the brand Lightweight Cardigan Previous p...,{},"[{'stars': '4.0 out of 5 stars', 'title': 'Wel...",,{},apparel,,,,,
3,product,us,B07HLWMB8J,Under Armour Men's Micro G Pursuit-Wide Sneake...,4.7 out of 5 stars,526 ratings,"[Clothing, Shoes & Jewelry, Men]",{},"[100% Textile, Imported, Rubber sole, Shaft me...","Product Description Lightweight, breathable me...",{},"[{'stars': '5.0 out of 5 stars', 'title': 'Upd...",,{},shoes,,,,,
4,product,us,B089YD2KK5,Crocs Jibbitz 5-Pack Alien Shoe Charms | Jibbi...,4.7 out of 5 stars,"1,105 ratings","[Clothing, Shoes & Jewelry, Shoe, Jewelry & Wa...",{},"[Date First Available : November 21, 2021, M...",From the brand Previous page Shop Crocs Collec...,{},"[{'stars': '4.0 out of 5 stars', 'title': 'I l...",$9.99,{},shoes,,,,,


In [385]:
len(df) 

640

In [386]:
len(enhanced_docs)
df_ = pd.DataFrame.from_records(enhanced_docs)

In [387]:
df["predicted"] = df_["predicted_category"]

In [388]:
df["refined_core"] = df_["category"]

In [412]:
df.to_json("AMAZON_REVIEWS_FINAL.json", orient='records', indent=4)

In [413]:
df.head()

Unnamed: 0,type,locale,asin,title,stars,ratings,category,attrs,bullets,description,...,price,formats,template,subtitle,author,desc,attr,review,predicted,refined_core
0,product,us,B07C1NJF6T,Rekayla Open Toe Tie Up Ankle Wrap Flat Sandal...,4.2 out of 5 stars,"5,656 ratings","[Clothing, Shoes & Jewelry, Women, Shoes, Sand...",{},[],From the brand Previous page REKAYLA is a fres...,...,,{},shoes,,,,,,"[Flats, Slippers, Sport Sandals & Slides, Plat...","[Clothing, Shoes & Jewelry, Shoes, Sandals, Fl..."
1,product,us,B07K12NL97,"Playtex Women's 18 Hour Easy On, Easy Off Fron...",4.0 out of 5 stars,875 ratings,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",{},"[85% Cotton, 15% Spandex, Imported, Hook and E...",Product Description Introducing Playtex 18 hou...,...,,{},apparel,,,,,,"[Everyday Bras, Lingerie, Bras, Sports Bras, L...","[Clothing, Shoes & Jewelry, Clothing, Lingerie..."
2,product,us,B07F86B3R2,Zeagoo Women's Open Front Cardigan 3/4 Sleeve ...,4.3 out of 5 stars,"9,489 ratings","[Clothing, Shoes & Jewelry, Women, Clothing, S...",{},[Product Dimensions : 13.39 x 11.02 x 0.79 i...,From the brand Lightweight Cardigan Previous p...,...,,{},apparel,,,,,,"[Sweaters, Cardigans, Sets, Blouses & Button-D...","[Clothing, Shoes & Jewelry, Clothing, Sweaters..."
3,product,us,B07HLWMB8J,Under Armour Men's Micro G Pursuit-Wide Sneake...,4.7 out of 5 stars,526 ratings,"[Clothing, Shoes & Jewelry, Men]",{},"[100% Textile, Imported, Rubber sole, Shaft me...","Product Description Lightweight, breathable me...",...,,{},shoes,,,,,,"[Road Running, Running, Athletic, Socks & Hosi...","[Clothing, Shoes & Jewelry]"
4,product,us,B089YD2KK5,Crocs Jibbitz 5-Pack Alien Shoe Charms | Jibbi...,4.7 out of 5 stars,"1,105 ratings","[Clothing, Shoes & Jewelry, Shoe, Jewelry & Wa...",{},"[Date First Available : November 21, 2021, M...",From the brand Previous page Shop Crocs Collec...,...,$9.99,{},shoes,,,,,,"[Water Shoes, Shoe Decoration Charms, Socks]","[Clothing, Shoes & Jewelry, Shoe, Jewelry & Wa..."


In [391]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          640 non-null    object
 1   locale        640 non-null    object
 2   asin          640 non-null    object
 3   title         640 non-null    object
 4   stars         640 non-null    object
 5   ratings       640 non-null    object
 6   category      640 non-null    object
 7   attrs         638 non-null    object
 8   bullets       638 non-null    object
 9   description   638 non-null    object
 10  info          638 non-null    object
 11  reviews       640 non-null    object
 12  price         640 non-null    object
 13  formats       640 non-null    object
 14  template      640 non-null    object
 15  subtitle      2 non-null      object
 16  author        2 non-null      object
 17  desc          2 non-null      object
 18  attr          2 non-null      object
 19  review  

In [392]:
df["predicted"].value_counts()

predicted
[Watches, Wrist Watches]                                                                                6
[Wrist Watches, Watches]                                                                                4
[Sunglasses, Sunglasses & Eyewear Accessories]                                                          2
[Belts]                                                                                                 2
[Totes, Sports & Outdoors, Tanks & Camis, Caps & Hats]                                                  1
                                                                                                       ..
[Sweaters, Fashion Hoodies & Sweatshirts, Hoodies]                                                      1
[Sweaters, Cardigans, Pullovers, Sports & Outdoors, Fashion Hoodies & Sweatshirts]                      1
[Running, Road Running, Socks & Hosiery, Athletic, Shoes, Sport Specific Clothing, Ice & Snow Grips]    1
[Bodysuits, Suspenders, Ties, Cummer

In [393]:
def evaluate_predictions(true_labels: List[List[str]], 
                        predicted_labels: List[List[str]]) -> Dict[str, float]:
    """Evaluate predictions using hierarchical metrics"""
    metrics = {
        'precision': 0,
        'recall': 0,
        'f1': 0,
        'hierarchical_precision': 0,
        'hierarchical_recall': 0,
        'hierarchical_f1': 0
    }
    
    total_docs = len(true_labels)
    
    for true, pred in zip(true_labels, predicted_labels):
        # Standard metrics
        if pred:  # Avoid division by zero
            precision = len(set(true) & set(pred)) / len(pred)
            recall = len(set(true) & set(pred)) / len(true)
            metrics['precision'] += precision
            metrics['recall'] += recall
            
        # Hierarchical metrics
        true_hierarchy = set()
        pred_hierarchy = set()
        
        # Build hierarchical paths
        for label in true:
            parts = label.split('/')
            for i in range(1, len(parts) + 1):
                true_hierarchy.add('/'.join(parts[:i]))
                
        for label in pred:
            parts = label.split('/')
            for i in range(1, len(parts) + 1):
                pred_hierarchy.add('/'.join(parts[:i]))
        
        # Calculate hierarchical scores
        if pred_hierarchy:  # Avoid division by zero
            h_precision = len(true_hierarchy & pred_hierarchy) / len(pred_hierarchy)
            h_recall = len(true_hierarchy & pred_hierarchy) / len(true_hierarchy)
            metrics['hierarchical_precision'] += h_precision
            metrics['hierarchical_recall'] += h_recall
    
    # Calculate averages
    for key in metrics:
        metrics[key] /= total_docs
    
    # Calculate F1 scores
    if metrics['precision'] + metrics['recall'] > 0:
        metrics['f1'] = 2 * (metrics['precision'] * metrics['recall']) / (metrics['precision'] + metrics['recall'])
    
    if metrics['hierarchical_precision'] + metrics['hierarchical_recall'] > 0:
        metrics['hierarchical_f1'] = 2 * (metrics['hierarchical_precision'] * metrics['hierarchical_recall']) / (metrics['hierarchical_precision'] + metrics['hierarchical_recall'])
    
    return metrics

# Usage example:
true_categories = [doc['category'] for doc in documents]
predicted_categories = [doc['predicted_category'] for doc in enhanced_docs]

metrics = evaluate_predictions(true_categories, predicted_categories)

print("\nEvaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Evaluation Metrics:
precision: 0.3810
recall: 0.4603
f1: 0.4169
hierarchical_precision: 0.3810
hierarchical_recall: 0.4603
hierarchical_f1: 0.4169


In [394]:
def evaluate_precision_at_k(true_labels: List[List[str]], 
                          predicted_labels: List[List[str]]) -> Dict[str, float]:
    """Evaluate predictions using Precision@k metrics"""
    metrics = {
        'precision@1': 0.0,
        'precision@3': 0.0
    }
    
    total_docs = len(true_labels)
    
    for true, pred in zip(true_labels, predicted_labels):
        # Precision@1
        if pred and len(pred) >= 1:
            if pred[0] in true:
                metrics['precision@1'] += 1
        
        # Precision@3
        if pred and len(pred) >= 3:
            correct_at_3 = len(set(pred[:3]) & set(true))
            metrics['precision@3'] += correct_at_3 / 3
    
    # Calculate averages
    metrics['precision@1'] /= total_docs
    metrics['precision@3'] /= total_docs
    
    return metrics

# Usage
true_categories = [doc['category'] for doc in documents]
predicted_categories = [doc['predicted_category'] for doc in enhanced_docs]

precision_scores = evaluate_precision_at_k(true_categories, predicted_categories)
print("\nPrecision Scores:")
print(f"Precision@1: {precision_scores['precision@1']:.4f}")
print(f"Precision@3: {precision_scores['precision@3']:.4f}")


Precision Scores:
Precision@1: 0.7328
Precision@3: 0.4438


In [400]:
def calculate_precision(predicted: List[str], actual: List[str], k: int = 1) -> float:
    """Calculate Precision@k score"""
    if not predicted or not actual:
        return 0.0
        
    # Take top k predictions
    pred_k = predicted[:k]
    
    # Count correct predictions
    correct = sum(1 for p in pred_k if p in actual)
    
    # Calculate precision
    return correct / k

def calculate_example_f1(predicted: List[str], actual: List[str]) -> float:
    """Calculate Example-F1 score for a single prediction"""
    if not predicted or not actual:
        return 0.0
    
    # Calculate intersection of predicted and actual labels
    intersection = set(predicted) & set(actual)
    
    # Calculate precision and recall
    precision = len(intersection) / len(predicted) if predicted else 0
    recall = len(intersection) / len(actual) if actual else 0
    
    # Calculate F1 score
    if precision + recall == 0:
        return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

def evaluate_predictions(documents: List[Dict], predicted_categories: List[List[str]]) -> Dict[str, float]:
    """Calculate Precision@1, Precision@3 and Example-F1 scores"""
    p1_scores = []
    p3_scores = []
    f1_scores = []
    
    for doc, pred_cats in zip(documents, predicted_categories):
        # Get actual categories as list
        actual_cats = doc['category']  # Already in list format
        
        # Calculate metrics
        p1 = calculate_precision(pred_cats, actual_cats, k=1)
        p3 = calculate_precision(pred_cats, actual_cats, k=3)
        f1 = calculate_example_f1(pred_cats, actual_cats)
        
        # Store scores
        p1_scores.append(p1)
        p3_scores.append(p3)
        f1_scores.append(f1)
    
    # Calculate averages
    avg_p1 = sum(p1_scores) / len(p1_scores) if p1_scores else 0
    avg_p3 = sum(p3_scores) / len(p3_scores) if p3_scores else 0
    avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
    
    return {
        'precision@1': avg_p1,
        'precision@3': avg_p3,
        'example_f1': avg_f1
    }



In [402]:
# Usage
true_categories = [doc['category'] for doc in documents]
predicted_categories = [doc['predicted_category'] for doc in enhanced_docs]

precision_scores = evaluate_predictions(documents, predicted_categories)
print("\nPrecision Scores:")
print(f"Precision@1: {precision_scores['precision@1']:.4f}")
print(f"Precision@3: {precision_scores['precision@3']:.4f}")


Precision Scores:
Precision@1: 0.7328
Precision@3: 0.4807
