In [6]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np
import time

In [10]:
class SpineInjuryAnalyzer:
    """Keyword-based screener for spine-related injury narratives.

    A narrative is scanned (case-insensitively, by substring) for terms from
    four vocabularies: ``anatomical``, ``injuries``, ``conditions`` and
    ``symptoms``.  Each category is scored as the fraction of its vocabulary
    found in the text, and a narrative is flagged as spine-related when it
    contains at least one anatomical term plus at least one term from any
    other category.

    The BioClinicalBERT model is loaded in ``__init__`` but is only run when
    an embedding is explicitly requested from ``analyze_narrative``; the
    keyword scores do not depend on it.
    """

    def __init__(self):
        """
        Initialize the analyzer with BioClinicalBERT
        """
        print("Loading BioClinicalBERT model...")
        self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.model.eval()

        # Vocabulary per category.  Matching lowercases each term, so
        # mixed-case entries (e.g. "Scheuermann's disease") still match the
        # lowercased narrative.
        self.spine_terms = {
            'anatomical': [
                "spine", "spinal", "vertebra", "vertebrae", "cervical",
                "thoracic", "lumbar", "sacral", "coccyx", "back", "neck",
                "vertebral column", "spinal cord", "disc", "disk",
                "intervertebral disc", "intervertebral foramen", "facet joint",
                "ligament", "tendon", "paraspinal muscles", "spinal nerves",
                "cauda equina", "nucleus pulposus", "annulus fibrosus",
                "transverse process", "spinous process", "vertebral body",
                "pedicle", "lamina", "zygapophyseal joint", "posterior longitudinal ligament",
                "anterior longitudinal ligament", "ligamentum flavum", "interspinous ligament",
                "supraspinous ligament", "dura mater", "pia mater", "arachnoid mater"
            ],
            'injuries': [
                "strain", "sprain", "fracture", "injury", "trauma",
                "whiplash", "subluxation", "dislocation", "compression",
                "contusion", "pulled muscle", "herniation", "rupture",
                "tear", "avulsion", "compression fracture", "burst fracture",
                "flexion-distraction injury", "chance fracture", "facet joint injury",
                "ligamentous injury", "vertebral body fracture", "pedicle fracture",
                "lamina fracture", "spinous process fracture", "transverse process fracture",
                "sacral fracture", "coccygeal fracture"
            ],
            'conditions': [
                "herniated", "bulging", "slipped", "compressed", "pinched",
                "stenosis", "radiculopathy", "myelopathy", "spondylosis",
                "degenerative", "sciatica", "spondylolisthesis",
                "osteoarthritis", "ankylosing spondylitis", "kyphosis",
                "lordosis", "scoliosis", "spinal stenosis", "degenerative disc disease",
                "facet joint arthritis", "spinal cord injury", "cauda equina syndrome",
                "disc protrusion", "disc extrusion", "disc sequestration", "disc desiccation",
                "disc height loss", "osteophyte formation", "spinal instability",
                "spinal fracture, trauma", "spinal infection", "spinal tumor", "syringomyelia",
                "tethered cord syndrome", "Scheuermann's disease", "Paget's disease of bone",
                "ossification of posterior longitudinal ligament", "spinal arteriovenous malformation"
            ],
            'symptoms': [
                "pain", "numbness", "tingling", "weakness", "stiffness",
                "limited range", "limited motion", "radiating", "shooting",
                "sore", "ache", "tender", "burning", "stabbing", "sharp",
                "dull", "throbbing", "muscle spasm", "loss of reflexes",
                "bowel/bladder dysfunction", "gait abnormalities", "sensory deficit",
                "motor deficit", "paresthesia", "paralysis", "incontinence",
                "saddle anesthesia", "cauda equina syndrome", "myelopathy symptoms",
                "radicular symptoms", "neck pain", "back pain", "arm pain", "leg pain"
            ]
        }

    def preprocess_text(self, text):
        """Return ``text`` coerced to ``str``, lowercased and stripped."""
        return str(text).lower().strip()

    def _embed_text(self, processed_text):
        """Return the position-0 hidden state of the model for ``processed_text``.

        The text is tokenized with truncation at 512 tokens and run through
        the model without gradient tracking; the result is a 1-D NumPy array.
        """
        encoded = self.tokenizer(
            processed_text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**encoded)
        return outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]

    def analyze_narrative(self, text, return_embedding=False):
        """
        Analyze a single narrative for spine-related content

        Args:
            text: The narrative; any value is accepted and coerced with
                ``str``.
            return_embedding: When true, also run the language model and add
                the narrative's embedding under the ``'embedding'`` key.
                Defaults to ``False`` because the keyword scores do not use
                the embedding and the forward pass is expensive.

        Returns:
            dict with keys ``spine_score`` (float), ``is_spine_related``
            (bool), ``matched_terms`` (category -> matched terms, only
            categories with at least one match) and ``category_scores``
            (category -> fraction of that category's vocabulary matched).
        """
        processed_text = self.preprocess_text(text)

        # Substring search; terms are lowercased so that mixed-case vocabulary
        # entries can match the lowercased narrative.
        matches = {
            category: [term for term in terms if term.lower() in processed_text]
            for category, terms in self.spine_terms.items()
        }

        category_scores = {
            category: (len(found) / len(self.spine_terms[category]) if found else 0.0)
            for category, found in matches.items()
        }

        anatomical_weight = 0.4
        other_weight = 0.2

        spine_score = (
            category_scores['anatomical'] * anatomical_weight +
            category_scores['injuries'] * other_weight +
            category_scores['conditions'] * other_weight +
            category_scores['symptoms'] * other_weight
        )

        # Spine-related: must have an anatomical term and at least one other
        # indicator (injury, condition or symptom).
        is_spine_related = (
            category_scores['anatomical'] > 0 and
            (category_scores['injuries'] > 0 or
             category_scores['conditions'] > 0 or
             category_scores['symptoms'] > 0)
        )

        result = {
            'spine_score': float(spine_score),
            'is_spine_related': is_spine_related,
            'matched_terms': {k: v for k, v in matches.items() if v},
            'category_scores': category_scores
        }
        if return_embedding:
            result['embedding'] = self._embed_text(processed_text)
        return result

    def analyze_dataset(self, df, narrative_column='Narrative'):
        """
        Analyze entire dataset of narratives

        Args:
            df: DataFrame holding the narratives.  It is modified in place.
            narrative_column: Name of the column containing narrative text.

        Returns:
            The same ``df`` with ``spine_score``, ``is_spine_related`` and,
            for every category, ``<category>_terms`` (comma-separated matched
            terms) and ``<category>_score`` columns added.
        """
        print("Analyzing narratives...")
        results = [self.analyze_narrative(text) for text in tqdm(df[narrative_column])]

        # Basic results to dataframe.
        df['spine_score'] = [r['spine_score'] for r in results]
        df['is_spine_related'] = [r['is_spine_related'] for r in results]

        for category in self.spine_terms:
            df[f'{category}_terms'] = [
                ', '.join(r['matched_terms'].get(category, []))
                for r in results
            ]
            df[f'{category}_score'] = [
                r['category_scores'][category]
                for r in results
            ]

        return df

def main(excel_path=r'/Users/armanimanov/Downloads/NEISS Soccer.xlsx', output_path=None):
    """Run the keyword spine analysis over a NEISS spreadsheet and save it.

    Args:
        excel_path: Path of the Excel workbook to read.  Defaults to the
            original hard-coded location.
        output_path: Where to write the annotated workbook.  When ``None`` a
            name of the form ``soccer_spine_injuries<timestamp>.xlsx`` is
            generated in the current directory.

    Returns:
        The DataFrame returned by ``SpineInjuryAnalyzer.analyze_dataset``.
    """
    if output_path is None:
        output_path = f"soccer_spine_injuries{time.time()}.xlsx"

    print("loading NEISS data...")
    df = pd.read_excel(excel_path)

    analyzer = SpineInjuryAnalyzer()

    df = analyzer.analyze_dataset(df)

    total_cases = len(df)
    spine_cases = df['is_spine_related'].sum()

    def percent(count):
        # An empty sheet would otherwise raise ZeroDivisionError.
        return (count / total_cases) * 100 if total_cases else 0.0

    # The original literal began with "\a", which Python reads as the BEL
    # control character; a newline was intended.
    print("\nAnalysis Results:")
    print(f"tot cases analyzed: {total_cases}")
    print(f"Spine-related cases: {spine_cases} ({percent(spine_cases):.1f}%)")

    for category in analyzer.spine_terms:
        # A non-empty "<category>_terms" cell means at least one term matched.
        cases = df[df[f'{category}_terms'].str.len() > 0].shape[0]
        print(f"{category.title()} terms found in {cases} cases ({percent(cases):.1f}%)")

    print("\nSaving results...")
    df.to_excel(output_path, index=False)
    print(f"Results saved to: {output_path}")

    return df

# Run the analysis only when this code is executed directly (not imported),
# keeping the DataFrame returned by main() in ``df``.
if __name__ == "__main__":
    df = main()

In [14]:
class EnhancedSpineInjuryAnalyzer:
    """Spine-narrative screener combining keyword and embedding matches.

    For every narrative two kinds of evidence are collected per category
    (``anatomical``, ``injuries``, ``conditions``, ``symptoms``):

    * exact matches - vocabulary terms found as case-insensitive substrings;
    * semantic matches - vocabulary terms whose pre-computed embedding has a
      cosine similarity above ``similarity_threshold`` with the narrative's
      embedding, excluding terms already matched exactly.

    All embeddings are kept on the CPU so that narrative and term vectors can
    be compared regardless of the device the model runs on.
    """

    def __init__(self):
        """Load BioClinicalBERT, define the vocabularies and embed every term."""
        print("Loading BioClinicalBERT model...")
        self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.model.eval()

        # Keep existing term dictionaries
        self.spine_terms = {
            'anatomical': [
                "spine", "spinal", "vertebra", "vertebrae", "cervical",
                "thoracic", "lumbar", "sacral", "coccyx", "back", "neck",
                "vertebral column", "spinal cord", "disc", "disk",
                "intervertebral disc", "intervertebral foramen", "facet joint",
                "ligament", "tendon", "paraspinal muscles", "spinal nerves",
                "cauda equina", "nucleus pulposus", "annulus fibrosus",
                "transverse process", "spinous process", "vertebral body",
                "pedicle", "lamina", "zygapophyseal joint", "posterior longitudinal ligament",
                "anterior longitudinal ligament", "ligamentum flavum", "interspinous ligament",
                "supraspinous ligament", "dura mater", "pia mater", "arachnoid mater"
            ],
            'injuries': [
                "strain", "sprain", "fracture", "injury", "trauma",
                "whiplash", "subluxation", "dislocation", "compression",
                "contusion", "pulled muscle", "herniation", "rupture",
                "tear", "avulsion", "compression fracture", "burst fracture",
                "flexion-distraction injury", "chance fracture", "facet joint injury",
                "ligamentous injury", "vertebral body fracture", "pedicle fracture",
                "lamina fracture", "spinous process fracture", "transverse process fracture",
                "sacral fracture", "coccygeal fracture"
            ],
            'conditions': [
                "herniated", "bulging", "slipped", "compressed", "pinched",
                "stenosis", "radiculopathy", "myelopathy", "spondylosis",
                "degenerative", "sciatica", "spondylolisthesis",
                "osteoarthritis", "ankylosing spondylitis", "kyphosis",
                "lordosis", "scoliosis", "spinal stenosis", "degenerative disc disease",
                "facet joint arthritis", "spinal cord injury", "cauda equina syndrome",
                "disc protrusion", "disc extrusion", "disc sequestration", "disc desiccation",
                "disc height loss", "osteophyte formation", "spinal instability",
                "spinal fracture, trauma", "spinal infection", "spinal tumor", "syringomyelia",
                "tethered cord syndrome", "Scheuermann's disease", "Paget's disease of bone",
                "ossification of posterior longitudinal ligament", "spinal arteriovenous malformation"
            ],
            'symptoms': [
                "pain", "numbness", "tingling", "weakness", "stiffness",
                "limited range", "limited motion", "radiating", "shooting",
                "sore", "ache", "tender", "burning", "stabbing", "sharp",
                "dull", "throbbing", "muscle spasm", "loss of reflexes",
                "bowel/bladder dysfunction", "gait abnormalities", "sensory deficit",
                "motor deficit", "paresthesia", "paralysis", "incontinence",
                "saddle anesthesia", "cauda equina syndrome", "myelopathy symptoms",
                "radicular symptoms", "neck pain", "back pain", "arm pain", "leg pain"
            ]
        }

        # Minimum cosine similarity for a term to count as a semantic match.
        self.similarity_threshold = 0.85

        self.term_embeddings = self._precompute_term_embeddings()

        # Results keyed by preprocessed narrative text, so repeated
        # narratives are analyzed only once.
        self.semantic_cache = {}

    def preprocess_text(self, text):
        """Return ``text`` coerced to ``str``, lowercased and stripped."""
        return str(text).lower().strip()

    def _embed_text(self, text):
        """Return the position-0 hidden state for ``text`` as a CPU tensor.

        The text is truncated to 512 tokens.  The result keeps its leading
        batch dimension so it can be passed straight to
        ``_compute_semantic_similarity``.
        """
        encoded = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**encoded)
        # Moved to the CPU so it lives on the same device as every other
        # stored embedding.
        return outputs.last_hidden_state[:, 0, :].cpu()

    def _precompute_term_embeddings(self):
        """Embed every vocabulary term once; returns category -> term -> tensor."""
        print("Pre-computing term embeddings...")
        return {
            category: {term: self._embed_text(term) for term in terms}
            for category, terms in self.spine_terms.items()
        }

    def _compute_semantic_similarity(self, text_embedding, term_embedding):
        """Return the cosine similarity of two ``(1, hidden)`` tensors as a float."""
        return torch.nn.functional.cosine_similarity(
            text_embedding,
            # No-op when both tensors already share a device; prevents a
            # device-mismatch error otherwise.
            term_embedding.to(text_embedding.device),
            dim=1
        ).item()

    def _find_exact_matches(self, processed_text, category):
        """Return the terms of ``category`` occurring in ``processed_text``.

        Terms are lowercased before the substring test because
        ``processed_text`` is lowercased; without this, mixed-case vocabulary
        entries could never match.
        """
        return [
            term for term in self.spine_terms[category]
            if term.lower() in processed_text
        ]

    def _find_semantic_matches(self, text_embedding, category):
        """Return ``(term, similarity)`` pairs of ``category`` above the threshold."""
        semantic_matches = []

        for term, term_embedding in self.term_embeddings[category].items():
            similarity = self._compute_semantic_similarity(text_embedding, term_embedding)
            if similarity > self.similarity_threshold:
                semantic_matches.append((term, similarity))

        return semantic_matches

    def analyze_narrative(self, text):
        """Score one narrative using exact and semantic term matches.

        Args:
            text: The narrative; any value is accepted and coerced with
                ``str``.

        Returns:
            dict with keys ``spine_score``, ``is_spine_related``,
            ``confidence_score``, ``exact_matches``, ``semantic_matches``
            (both category -> terms, only non-empty categories) and
            ``category_scores``.  Results are cached per preprocessed text,
            so repeated narratives return the same dict object.
        """
        processed_text = self.preprocess_text(text)

        # Keyed on the text itself rather than hash(text), so two different
        # narratives can never share a cache entry.
        cached = self.semantic_cache.get(processed_text)
        if cached is not None:
            return cached

        text_embedding = self._embed_text(processed_text)

        matches = {}
        semantic_matches = {}
        for category in self.spine_terms:
            exact = self._find_exact_matches(processed_text, category)
            matches[category] = exact
            # A term already matched exactly is not counted a second time.
            semantic_matches[category] = [
                term
                for term, _ in self._find_semantic_matches(text_embedding, category)
                if term not in exact
            ]

        # Semantic matches count 0.7 of an exact match.  Scores are fractions
        # of the category vocabulary, so 1.0 would require every term in the
        # category to match exactly.
        category_scores = {
            category: (
                (len(matches[category]) + 0.7 * len(semantic_matches[category])) /
                len(self.spine_terms[category])
            )
            for category in self.spine_terms
        }

        anatomical_weight = 0.4
        other_weights = {
            'injuries': 0.25,
            'conditions': 0.20,
            'symptoms': 0.15
        }

        spine_score = (
            category_scores['anatomical'] * anatomical_weight +
            sum(category_scores[cat] * weight
                for cat, weight in other_weights.items())
        )

        # Any exact or semantic match makes the category score positive, so
        # checking the scores covers both kinds of evidence.
        is_spine_related = (
            category_scores['anatomical'] > 0 and
            any(category_scores[cat] > 0 for cat in other_weights)
        )

        confidence_score = min(
            max(spine_score, max(category_scores.values())),
            1.0
        )

        result = {
            'spine_score': float(spine_score),
            'is_spine_related': is_spine_related,
            'confidence_score': float(confidence_score),
            'exact_matches': {k: v for k, v in matches.items() if v},
            'semantic_matches': {k: v for k, v in semantic_matches.items() if v},
            'category_scores': category_scores
        }

        self.semantic_cache[processed_text] = result

        return result

    def analyze_dataset(self, df, narrative_column='Narrative'):
        """Analyze every narrative in ``df`` and append the result columns.

        Args:
            df: DataFrame holding the narratives.  It is modified in place.
            narrative_column: Name of the column containing narrative text.

        Returns:
            The same ``df`` with ``spine_score``, ``is_spine_related``,
            ``confidence_score``, ``semantic_match_count`` and, per category,
            ``<category>_exact_terms``, ``<category>_semantic_terms`` and
            ``<category>_score`` columns added.
        """
        print("Analyzing narratives...")
        results = [self.analyze_narrative(text) for text in tqdm(df[narrative_column])]

        df['spine_score'] = [r['spine_score'] for r in results]
        df['is_spine_related'] = [r['is_spine_related'] for r in results]
        df['confidence_score'] = [r['confidence_score'] for r in results]

        for category in self.spine_terms:
            df[f'{category}_exact_terms'] = [
                ', '.join(r['exact_matches'].get(category, []))
                for r in results
            ]
            df[f'{category}_semantic_terms'] = [
                ', '.join(r['semantic_matches'].get(category, []))
                for r in results
            ]
            df[f'{category}_score'] = [
                r['category_scores'][category]
                for r in results
            ]

        # Total number of semantic matches across all categories, per row.
        df['semantic_match_count'] = [
            sum(len(terms) for terms in r['semantic_matches'].values())
            for r in results
        ]

        return df

def main(excel_path=r'/Users/armanimanov/Downloads/NEISS Soccer.xlsx', output_path=None):
    """Run the enhanced spine analysis over a NEISS spreadsheet and save it.

    Args:
        excel_path: Path of the Excel workbook to read.  Defaults to the
            original hard-coded location.
        output_path: Where to write the annotated workbook.  When ``None`` a
            name of the form ``enhanced_spine_analysis_<timestamp>.xlsx`` is
            generated in the current directory.

    Returns:
        The DataFrame returned by
        ``EnhancedSpineInjuryAnalyzer.analyze_dataset``.
    """
    if output_path is None:
        output_path = f"enhanced_spine_analysis_{time.time()}.xlsx"

    print("Loading NEISS data...")
    df = pd.read_excel(excel_path)

    analyzer = EnhancedSpineInjuryAnalyzer()
    df = analyzer.analyze_dataset(df)

    total_cases = len(df)
    spine_cases = df['is_spine_related'].sum()

    def percent(count):
        # An empty sheet would otherwise raise ZeroDivisionError.
        return (count / total_cases) * 100 if total_cases else 0.0

    print("\nENHANCED ANALYSIS RESULTS:")
    print(f"TOT cases: {total_cases}")
    print(f"Spine-related cases: {spine_cases} ({percent(spine_cases):.1f}%)")

    # Enhanced statistics
    high_confidence_cases = df[df['confidence_score'] > 0.8].shape[0]
    semantic_matches = df[df['semantic_match_count'] > 0].shape[0]

    print(f"HI_CONF cases: {high_confidence_cases} ({percent(high_confidence_cases):.1f}%)")
    print(f"Cases w/ semantic matches: {semantic_matches} ({percent(semantic_matches):.1f}%)")

    for category in analyzer.spine_terms:
        # A non-empty terms cell means at least one term of that kind matched.
        exact_cases = df[df[f'{category}_exact_terms'].str.len() > 0].shape[0]
        semantic_cases = df[df[f'{category}_semantic_terms'].str.len() > 0].shape[0]
        print(f"\n{category.title()}:")
        print(f"- exact match: {exact_cases} ({percent(exact_cases):.1f}%)")
        print(f"- semantic match: {semantic_cases} ({percent(semantic_cases):.1f}%)")

    print("\nsaving enhanced results...")
    df.to_excel(output_path, index=False)
    print(f"results saved {output_path}")

    return df

# Run the enhanced analysis only when this code is executed directly (not
# imported), keeping the DataFrame returned by main() in ``df``.
if __name__ == "__main__":
    df = main()

Loading NEISS data...
Loading BioClinicalBERT model...
Pre-computing term embeddings...
Analyzing narratives...


100%|██████████| 15004/15004 [08:09<00:00, 30.68it/s]



Enhanced Analysis Results:
Total cases analyzed: 15004
Spine-related cases: 3211 (21.4%)
High confidence cases: 0 (0.0%)
Cases with semantic matches: 9395 (62.6%)

Anatomical:
- Exact matches: 3451 (23.0%)
- Semantic matches: 196 (1.3%)

Injuries:
- Exact matches: 6359 (42.4%)
- Semantic matches: 4556 (30.4%)

Conditions:
- Exact matches: 286 (1.9%)
- Semantic matches: 8528 (56.8%)

Symptoms:
- Exact matches: 4700 (31.3%)
- Semantic matches: 2335 (15.6%)

Saving enhanced results...
Results saved to: enhanced_spine_analysis_1731511303.0158012.xlsx
