In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re

In [8]:
def preprocess_text(text):
    """Clean and standardize narrative text"""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = ' '.join(text.split())
    return text

def create_spine_injury_classifier(df, narrative_column='Narrative'):
    """
    Create and train a classifier for spine-related injuries with pattern analysis
    """
    spine_keywords = [
        'spine', 'spinal', 'back pain', 'neck pain',
        'cervical', 'thoracic', 'lumbar', 'vertebra',
        'vertebrae', 'disc', 'disk', 'herniated',
        'back sprain', 'neck sprain', 'back strain',
        'neck strain', 'backbone', 'back injury',
        'neck injury', 'back muscle', 'neck muscle'
    ]
    
    df['processed_narrative'] = df[narrative_column].apply(preprocess_text)
    
    def contains_spine_keywords(text):
        return any(keyword in text for keyword in spine_keywords)
    
    df['spine_related'] = df['processed_narrative'].apply(contains_spine_keywords)
    
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 3),  # 3-word phrases
        stop_words='english',
        min_df=2, 
        max_df=0.95 
    )
    
    X = vectorizer.fit_transform(df['processed_narrative'])
    y = df['spine_related']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    #logistic regression classifier
    classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
    classifier.fit(X_train, y_train)
    
    #analyze important features
    feature_importance = analyze_important_features(classifier, vectorizer)
    
    #validate
    y_pred = classifier.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    #ex of non-keyword matches
    df['prediction_prob'] = classifier.predict_proba(X)[:, 1]
    non_keyword_matches = find_non_keyword_matches(df, spine_keywords)
    
    return classifier, vectorizer, feature_importance, non_keyword_matches

def analyze_important_features(classifier, vectorizer, top_n=20):
    """Analyze which words/phrases the model considers most important"""
    feature_names = vectorizer.get_feature_names_out()
    
    coefficients = classifier.coef_[0]
    
    top_positive_idx = np.argsort(coefficients)[-top_n:]
    top_negative_idx = np.argsort(coefficients)[:top_n]
    
    important_features = {
        'positive': [(feature_names[i], coefficients[i]) 
                    for i in top_positive_idx],
        'negative': [(feature_names[i], coefficients[i]) 
                    for i in top_negative_idx]
    }
    
    return important_features

def find_non_keyword_matches(df, spine_keywords, threshold=0.8):
    """Find high-confidence matches that don't contain original keywords"""
    def contains_no_keywords(row):
        return not any(keyword in row['processed_narrative'] 
                      for keyword in spine_keywords)
    
    non_keyword_matches = df[
        (df['prediction_prob'] > threshold) & 
        df.apply(contains_no_keywords, axis=1)
    ]
    
    return non_keyword_matches[['Narrative', 'prediction_prob']]

def analyze_new_narratives(narratives, classifier, vectorizer):
    """Analyze new narrative texts with explanation"""
    processed_narratives = [preprocess_text(text) for text in narratives]
    
    X_new = vectorizer.transform(processed_narratives)
    
    predictions = classifier.predict(X_new)
    probabilities = classifier.predict_proba(X_new)[:, 1]
    
    results = pd.DataFrame({
        'original_narrative': narratives,
        'processed_narrative': processed_narratives,
        'spine_related': predictions,
        'confidence': probabilities
    })
    
    return results

In [9]:
excel_file = r'/Users/armanimanov/Downloads/NEISS Soccer.xlsx'
df = pd.read_excel(excel_file)

In [10]:
classifier, vectorizer, feature_importance, non_keyword_matches = create_spine_injury_classifier(df)

print("\nTOP INDICATORS of spine injuries learned by CLASSIFIER:")
for feature, coef in feature_importance['positive']:
    print(f"{feature}: {coef:.4f}")


Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.98      0.99      2626
        True       0.87      0.94      0.90       375

    accuracy                           0.97      3001
   macro avg       0.93      0.96      0.94      3001
weighted avg       0.98      0.97      0.98      3001


Top indicators of spine injuries learned by the model:
discomfort: 3.6811
neck sprain: 3.7129
concussion neck: 3.9544
neck injury: 4.0673
cervical strain: 4.2633
lower: 4.4195
sprain: 4.6788
neck strain: 4.7393
lower pain: 4.9135
low pain: 4.9246
dx pain: 5.0155
thoracic: 5.0210
low: 5.0894
spine: 6.2166
cervical: 7.4669
neck pain: 8.2567
lumbar: 8.6240
strain: 8.6818
neck: 8.6849
pain: 10.2140

Examples of spine injuries identified without keywords:
                                              Narrative  prediction_prob
64    17 YOM WITH SCROTAL PAIN AFTER PLAYING SOCCER....         0.896171
175   9YOM PRESENTED TO ED C/O PAIN SHOULDER, P