Preparing training data for CRF...
Training CRF model...
Preparing test data for CRF...
Evaluating CRF model...
Converting predictions to entity format...
Generating output dataframe...

CRF Model Performance:
Overall F1 Score: 0.9504
Overall Precision: 0.9561
Overall Recall: 0.9449

Detailed Classification Report:
              precision    recall  f1-score   support

 B-Condition     0.9701    0.9619    0.9660     16817
 I-Condition     0.8920    0.8737    0.8828      8157
B-Medication     1.0000    0.9754    0.9876       244
I-Medication     1.0000    0.9630    0.9811        54
 B-Procedure     0.9947    0.9879    0.9913      4562
 I-Procedure     0.9914    0.9792    0.9853      2594

   micro avg     0.9561    0.9449    0.9504     32428
   macro avg     0.9747    0.9569    0.9657     32428
weighted avg     0.9559    0.9449    0.9503     32428


Condition:
F1 Score: 0.9454
Precision: 0.9513
Recall: 0.9395

Procedure:
F1 Score: 0.9901
Precision: 0.9945
Recall: 0.9857

Medication:
F1 

'\n# Assuming X_train, y_train, X_test, y_test are already defined\n# Convert y_train and y_test to the expected format (list of dicts with entity types as keys)\ntrain_entities = []\nfor i in range(len(y_train)):\n    train_entities.append({\n        "Condition": y_train.iloc[i][\'Condition\'],\n        "Procedure": y_train.iloc[i][\'Procedure\'],\n        "Medication": y_train.iloc[i][\'Medication\']\n    })\n\ntest_entities = []\nfor i in range(len(y_test)):\n    test_entities.append({\n        "Condition": y_test.iloc[i][\'Condition\'],\n        "Procedure": y_test.iloc[i][\'Procedure\'],\n        "Medication": y_test.iloc[i][\'Medication\']\n    })\n\n# Run the CRF pipeline\nresults = run_crf_pipeline(X_train.tolist(), train_entities, X_test.tolist(), test_entities)\n\n# Print evaluation results\nprint("\nCRF Model Performance:")\nprint(f"Overall F1 Score: {results[\'evaluation\'][\'overall\'][\'f1\']:.4f}")\nprint(f"Overall Precision: {results[\'evaluation\'][\'overall\'][\'preci

In [4]:
def generate_output_adjusted(texts, predicted_entity_dicts, original_df=None):
    """
    Generate the output dataframe with predicted and, optionally, original entity columns.

    Exactly one row is produced per text, so the result stays aligned with
    ``texts`` even when an individual row cannot be processed.

    Args:
        texts: List of texts
        predicted_entity_dicts: List of dictionaries, parallel to ``texts``, with the
            keys 'Condition', 'Procedure' and 'Medication', each holding a list of
            entity strings
        original_df: Optional DataFrame with 'Condition', 'Procedure' and
            'Medication' columns, read positionally (row i belongs to texts[i]).
            List values are joined with ', '; any other value is copied unchanged.

    Returns:
        DataFrame with the columns text, Condition, Procedure, Medication and, for
        rows covered by ``original_df``, original_Condition, original_Procedure and
        original_Medication
    """
    import pandas as pd

    entity_types = ('Condition', 'Procedure', 'Medication')
    results = []

    for i, text in enumerate(texts):
        result_row = {'text': text}

        # Predicted entities are all-or-nothing per row: if any of the three
        # cannot be rendered, the row keeps its text with empty predictions.
        try:
            predicted = predicted_entity_dicts[i]
            predicted_strs = {
                entity_type: ', '.join(predicted[entity_type]) if predicted[entity_type] else ''
                for entity_type in entity_types
            }
        except Exception as e:
            print(f"Error processing text #{i}: {str(e)[:100]}...")
            predicted_strs = dict.fromkeys(entity_types, '')
        result_row.update(predicted_strs)

        # Original entities are handled separately, so a malformed original
        # value neither wipes out valid predictions nor escapes from an
        # exception handler and aborts the whole run.
        if original_df is not None and i < len(original_df):
            try:
                original_row = original_df.iloc[i]  # build the row once, not per column
                original_strs = {}
                for entity_type in entity_types:
                    value = original_row[entity_type]
                    original_strs[f'original_{entity_type}'] = (
                        ', '.join(value) if isinstance(value, list) else value
                    )
            except Exception as e:
                print(f"Error processing original entities for text #{i}: {str(e)[:100]}...")
                original_strs = {f'original_{entity_type}': '' for entity_type in entity_types}
            result_row.update(original_strs)

        results.append(result_row)

    return pd.DataFrame(results)

# Update the run_crf_pipeline function to use the adjusted output function
def run_crf_pipeline_with_original_columns(X_train, y_train_entity_dicts, X_test, y_test_entity_dicts, y_test_df=None):
    """
    Train, evaluate and export a CRF model, keeping the original entity columns in the output.

    The stages run in order: feature preparation for the training set, model
    training, feature preparation for the test set, evaluation, conversion of
    the predicted labels to entity dictionaries, and output dataframe
    generation via ``generate_output_adjusted``. A progress line is printed
    before each stage.

    Args:
        X_train: List of training texts
        y_train_entity_dicts: List of dictionaries with training entity annotations
        X_test: List of test texts
        y_test_entity_dicts: List of dictionaries with test entity annotations
        y_test_df: Original test DataFrame with entity columns; forwarded to
            ``generate_output_adjusted`` as ``original_df``

    Returns:
        Dictionary with the keys 'model', 'evaluation', 'report',
        'predictions' and 'output_df'
    """
    print("Preparing training data for CRF...")
    train_features, train_labels = prepare_data_for_crf(X_train, y_train_entity_dicts)

    print("Training CRF model...")
    fitted_model = train_crf_model(train_features, train_labels)

    print("Preparing test data for CRF...")
    test_features, test_labels = prepare_data_for_crf(X_test, y_test_entity_dicts)

    print("Evaluating CRF model...")
    evaluation, classification_report_text, predicted_labels = evaluate_crf_model(
        fitted_model, test_features, test_labels
    )

    print("Converting predictions to entity format...")
    predicted_entities = convert_predictions_to_entities(X_test, predicted_labels)

    print("Generating output dataframe with original columns...")
    output_frame = generate_output_adjusted(
        X_test,
        predicted_entities,
        original_df=y_test_df,
    )

    return dict(
        model=fitted_model,
        evaluation=evaluation,
        report=classification_report_text,
        predictions=predicted_entities,
        output_df=output_frame,
    )

X_train, y_train, X_test, y_test = read_train_test_split()

# One annotation dict per training row, read positionally from y_train
train_entities = [
    {
        "Condition": y_train.iloc[row]['Condition'],
        "Procedure": y_train.iloc[row]['Procedure'],
        "Medication": y_train.iloc[row]['Medication'],
    }
    for row in range(len(y_train))
]

# Same structure for the test rows
test_entities = [
    {
        "Condition": y_test.iloc[row]['Condition'],
        "Procedure": y_test.iloc[row]['Procedure'],
        "Medication": y_test.iloc[row]['Medication'],
    }
    for row in range(len(y_test))
]

# Gold entity columns of the test set, carried through to the output file
test_df = pd.DataFrame(
    {column: y_test[column] for column in ('Condition', 'Procedure', 'Medication')}
)

# Train, evaluate and build the output dataframe in one go
results = run_crf_pipeline_with_original_columns(
    X_train.tolist(),
    train_entities,
    X_test.tolist(),
    test_entities,
    y_test_df=test_df,
)

# Persist predictions next to the original annotations (no index column)
results['output_df'].to_csv('medical_ner_crf_results_with_original.csv', index=False)

# Token-level classification report
print("\nDetailed Classification Report:")
print(results['report'])

# Per-entity-type scores, skipping types missing from the evaluation
for entity_type in ['Condition', 'Procedure', 'Medication']:
    if entity_type not in results['evaluation']:
        continue
    print(f"\n{entity_type}:")
    print(f"F1 Score: {results['evaluation'][entity_type]['f1']:.4f}")
    print(f"Precision: {results['evaluation'][entity_type]['precision']:.4f}")
    print(f"Recall: {results['evaluation'][entity_type]['recall']:.4f}")

# Overall scores
print("\nCRF Model Performance:")
for label, metric in (("F1 Score", 'f1'), ("Precision", 'precision'), ("Recall", 'recall')):
    print(f"Overall {label}: {results['evaluation']['overall'][metric]:.4f}")


# Example usage of the updated function
"""
# Load your data
X_train, y_train, X_test, y_test = read_train_test_split()

# Prepare entity lists for training
train_entities = []
for i in range(len(y_train)):
    train_entities.append({
        "Condition": y_train.iloc[i]['Condition'],
        "Procedure": y_train.iloc[i]['Procedure'],
        "Medication": y_train.iloc[i]['Medication']
    })

# Prepare entity lists for testing
test_entities = []
for i in range(len(y_test)):
    test_entities.append({
        "Condition": y_test.iloc[i]['Condition'],
        "Procedure": y_test.iloc[i]['Procedure'],
        "Medication": y_test.iloc[i]['Medication']
    })

# Create test DataFrame for original columns
test_df = pd.DataFrame({
    'Condition': y_test['Condition'],
    'Procedure': y_test['Procedure'],
    'Medication': y_test['Medication']
})

# Run the pipeline with original columns
results = run_crf_pipeline_with_original_columns(
    X_train.tolist(), 
    train_entities, 
    X_test.tolist(), 
    test_entities,
    y_test_df=test_df
)

# Print evaluation results
print("\nCRF Model Performance:")
print(f"Overall F1 Score: {results['evaluation']['overall']['f1']:.4f}")
print(f"Overall Precision: {results['evaluation']['overall']['precision']:.4f}")
print(f"Overall Recall: {results['evaluation']['overall']['recall']:.4f}")

# Display a few rows to compare predicted vs original entities
print("\nSample of output with predicted and original entities:")
print(results['output_df'][['text', 'Condition', 'original_Condition']].head(3))
"""

Preparing training data for CRF...
Training CRF model...
Preparing test data for CRF...
Evaluating CRF model...
Converting predictions to entity format...
Generating output dataframe with original columns...

CRF Model Performance:
Overall F1 Score: 0.9504
Overall Precision: 0.9561
Overall Recall: 0.9449


'\n# Load your data\nX_train, y_train, X_test, y_test = read_train_test_split()\n\n# Prepare entity lists for training\ntrain_entities = []\nfor i in range(len(y_train)):\n    train_entities.append({\n        "Condition": y_train.iloc[i][\'Condition\'],\n        "Procedure": y_train.iloc[i][\'Procedure\'],\n        "Medication": y_train.iloc[i][\'Medication\']\n    })\n\n# Prepare entity lists for testing\ntest_entities = []\nfor i in range(len(y_test)):\n    test_entities.append({\n        "Condition": y_test.iloc[i][\'Condition\'],\n        "Procedure": y_test.iloc[i][\'Procedure\'],\n        "Medication": y_test.iloc[i][\'Medication\']\n    })\n\n# Create test DataFrame for original columns\ntest_df = pd.DataFrame({\n    \'Condition\': y_test[\'Condition\'],\n    \'Procedure\': y_test[\'Procedure\'],\n    \'Medication\': y_test[\'Medication\']\n})\n\n# Run the pipeline with original columns\nresults = run_crf_pipeline_with_original_columns(\n    X_train.tolist(), \n    train_entitie

In [5]:
# Token-level classification report produced by the pipeline
print("\nDetailed Classification Report:")
print(results['report'])

# Per-entity-type scores; types missing from the evaluation are skipped
for entity_type in ['Condition', 'Procedure', 'Medication']:
    if entity_type not in results['evaluation']:
        continue
    print(f"\n{entity_type}:")
    for label, metric in (("F1 Score", 'f1'), ("Precision", 'precision'), ("Recall", 'recall')):
        print(f"{label}: {results['evaluation'][entity_type][metric]:.4f}")



Detailed Classification Report:
              precision    recall  f1-score   support

 B-Condition     0.9701    0.9619    0.9660     16817
 I-Condition     0.8920    0.8737    0.8828      8157
B-Medication     1.0000    0.9754    0.9876       244
I-Medication     1.0000    0.9630    0.9811        54
 B-Procedure     0.9947    0.9879    0.9913      4562
 I-Procedure     0.9914    0.9792    0.9853      2594

   micro avg     0.9561    0.9449    0.9504     32428
   macro avg     0.9747    0.9569    0.9657     32428
weighted avg     0.9559    0.9449    0.9503     32428


Condition:
F1 Score: 0.9454
Precision: 0.9513
Recall: 0.9395

Procedure:
F1 Score: 0.9901
Precision: 0.9945
Recall: 0.9857

Medication:
F1 Score: 0.9864
Precision: 1.0000
Recall: 0.9732
