In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

In [5]:
class SpineInjuryAnalyzer:
    def __init__(self):
        """
        Initialize the analyzer with BioClinicalBERT
        """
        print("Loading BioClinicalBERT model...")
        self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        
        #GPU available?
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.model.eval()
        
        self.spine_terms = {
            'anatomical': [
                "spine", "spinal", "vertebra", "vertebrae", "cervical", 
                "thoracic", "lumbar", "sacral", "coccyx", "back", "neck",
                "vertebral column", "spinal cord", "disc", "disk"
            ],
            'injuries': [
                "strain", "sprain", "fracture", "injury", "trauma",
                "whiplash", "subluxation", "dislocation", "compression",
                "contusion", "pulled muscle"
            ],
            'conditions': [
                "herniated", "bulging", "slipped", "compressed", "pinched",
                "stenosis", "radiculopathy", "myelopathy", "spondylosis",
                "degenerative", "sciatica"
            ],
            'symptoms': [
                "pain", "numbness", "tingling", "weakness", "stiffness",
                "limited range", "limited motion", "radiating", "shooting",
                "sore", "ache", "tender", "burning"
            ]
        }

    def preprocess_text(self, text):
        text = str(text).lower().strip()
        return text

    def analyze_narrative(self, text):
        """
        Analyze a single narrative for spine-related content
        """
        processed_text = self.preprocess_text(text)
        
        matches = {category: [] for category in self.spine_terms.keys()}
        for category, terms in self.spine_terms.items():
            for term in terms:
                if term in processed_text:
                    matches[category].append(term)

        encoded = self.tokenizer(
            processed_text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**encoded)
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]

        category_scores = {}
        for category, matched_terms in matches.items():
            if matched_terms:
                category_scores[category] = len(matched_terms) / len(self.spine_terms[category])
            else:
                category_scores[category] = 0.0

        anatomical_weight = 0.4
        other_weight = 0.2

        spine_score = (
            category_scores['anatomical'] * anatomical_weight +
            category_scores['injuries'] * other_weight +
            category_scores['conditions'] * other_weight +
            category_scores['symptoms'] * other_weight
        )

        # SPINE-RELATED? must have anatomical term and atleastone other indicator>
        is_spine_related = (
            category_scores['anatomical'] > 0 and
            (category_scores['injuries'] > 0 or 
             category_scores['conditions'] > 0 or 
             category_scores['symptoms'] > 0)
        )

        return {
            'spine_score': float(spine_score),
            'is_spine_related': is_spine_related,
            'matched_terms': {k: v for k, v in matches.items() if v},
            'category_scores': category_scores
        }

    def analyze_dataset(self, df, narrative_column='Narrative'):
        """
        Analyze entire dataset of narratives
        """
        results = []
        
        print("Analyzing narratives...")
        for text in tqdm(df[narrative_column]):
            result = self.analyze_narrative(text)
            results.append(result)
        
        #basic results to dataframe
        df['spine_score'] = [r['spine_score'] for r in results]
        df['is_spine_related'] = [r['is_spine_related'] for r in results]
        
        for category in self.spine_terms.keys():
            df[f'{category}_terms'] = [
                ', '.join(r['matched_terms'].get(category, [])) 
                for r in results
            ]
            df[f'{category}_score'] = [
                r['category_scores'][category] 
                for r in results
            ]
        
        return df

def main():
    excel_path = '/Users/armanimanov/Downloads/NEISS Soccer.xlsx'
    output_path = "soccer_spine_injuries.xlsx"
    
    print("loading NEISS data...")
    df = pd.read_excel(excel_path)
    
    analyzer = SpineInjuryAnalyzer()
    
    df = analyzer.analyze_dataset(df)
    
    total_cases = len(df)
    spine_cases = df['is_spine_related'].sum()
    spine_percentage = (spine_cases / total_cases) * 100
    
    print("\analysis Results:")
    print(f"tot cases analyzed: {total_cases}")
    print(f"Spine-related cases: {spine_cases} ({spine_percentage:.1f}%)")
    
    for category in analyzer.spine_terms.keys():
        cases = df[df[f'{category}_terms'].str.len() > 0].shape[0]
        print(f"{category.title()} terms found in {cases} cases ({cases/total_cases*100:.1f}%)")
    
    print("\nSaving results...")
    df.to_excel(output_path, index=False)
    print(f"Results saved to: {output_path}")
    
    return df

if __name__ == "__main__":
    df = main()

Loading NEISS data...
Loading BioClinicalBERT model...


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Analyzing narratives...


  1%|          | 98/15004 [00:03<07:45, 31.99it/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

100%|██████████| 15004/15004 [07:25<00:00, 33.70it/s]



Analysis Results:
Total cases analyzed: 15004
Spine-related cases: 2829 (18.9%)
Anatomical terms found in 3442 cases (22.9%)
Injuries terms found in 6287 cases (41.9%)
Conditions terms found in 284 cases (1.9%)
Symptoms terms found in 4683 cases (31.2%)

Saving results...
Results saved to: soccer_spine_injuries.xlsx
