In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import lime.lime_text
import shap
from sklearn.metrics import classification_report
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Toy corpus of patient reviews. Each (text, sentiment) pair keeps a review next
# to its label (1 = positive, 0 = negative); zip() then transposes the pairs
# into the two parallel lists used by the rest of the notebook.
healthcare_reviews, labels = map(list, zip(
    ("The doctor was very helpful and attentive during my visit.", 1),
    ("I had a terrible experience with the hospital staff.", 0),
    ("The treatment was effective and I feel much better now.", 1),
    ("The waiting time was too long, and the nurses were rude.", 0),
    ("Great service! The team was very professional.", 1),
    ("I wouldn't recommend this clinic due to poor service.", 0),
))

In [3]:
# Text preprocessing function
def preprocess_text(text):
    """Normalise a raw review into lowercase, letters-only, space-separated words.

    Args:
        text: The raw review string.

    Returns:
        The review lowercased, with apostrophes removed, every other character
        that is not an ASCII letter or whitespace replaced by a space, and runs
        of whitespace collapsed to single spaces (no leading/trailing space).

    Note:
        Only ASCII letters a-z survive; accented letters are treated like
        punctuation and become word breaks.
    """
    text = text.lower()
    # Drop apostrophes outright so contractions stay a single token
    # ("wouldn't" -> "wouldnt") instead of being split in two.
    text = re.sub("['\u2019]", '', text)
    # Any other non-letter becomes a space rather than vanishing, so words
    # separated only by punctuation or digits ("rude,slow") are not fused
    # into one bogus token ("rudeslow").
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse the whitespace runs the substitution above may have produced.
    return ' '.join(text.split())

In [4]:
# Run every raw review through the text normaliser
processed_reviews = list(map(preprocess_text, healthcare_reviews))

In [5]:
# Hold out a test split (test_size=0.2) with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    processed_reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Text-classification pipeline: TF-IDF features (capped at 1000 terms)
# feeding a 100-tree random forest with a fixed seed
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=1000)),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [7]:
# Fit the vectoriser and the classifier together on the training split
pipeline.fit(X=X_train, y=y_train)

In [8]:
# Function for LIME explanation
def explain_with_lime(text, pipeline, num_features=6, random_state=42):
    """Build a LIME explanation of the pipeline's prediction for one text.

    Args:
        text: The (already preprocessed) review string to explain.
        pipeline: A fitted model exposing ``predict_proba`` over a list of
            raw strings; it is handed to LIME as the classifier function.
        num_features: Maximum number of words to include in the explanation.
        random_state: Seed forwarded to ``LimeTextExplainer`` so repeated
            runs produce the same explanation. Pass ``None`` for the
            library's unseeded behaviour.

    Returns:
        The explanation object returned by
        ``LimeTextExplainer.explain_instance``.
    """
    # Class names are positional: index 0 -> 'Negative', index 1 -> 'Positive',
    # matching the 0/1 label convention used in this notebook.
    explainer = lime.lime_text.LimeTextExplainer(
        class_names=['Negative', 'Positive'],
        random_state=random_state,
    )

    return explainer.explain_instance(
        text,
        pipeline.predict_proba,
        num_features=num_features,
    )

In [9]:
# SHAP explanation helper
def explain_with_shap(pipeline, X_train):
    """Compute SHAP values for the pipeline's classifier on the given texts.

    Args:
        pipeline: Fitted pipeline with a 'tfidf' step and a 'classifier' step.
        X_train: Iterable of text documents to explain.

    Returns:
        A ``(shap_values, feature_names)`` tuple: whatever
        ``TreeExplainer.shap_values`` returns for the densified TF-IDF matrix,
        and the vectoriser's ``get_feature_names_out()`` result.
    """
    steps = pipeline.named_steps
    vectorizer = steps['tfidf']
    forest = steps['classifier']

    tree_explainer = shap.TreeExplainer(forest)

    # The TF-IDF output is converted to a dense array before being explained.
    dense_features = vectorizer.transform(X_train).toarray()

    return tree_explainer.shap_values(dense_features), vectorizer.get_feature_names_out()

In [10]:
# Example usage
def analyze_healthcare_review(review_text):
    """Print the predicted sentiment of one review and the words that support it.

    The review is preprocessed, classified with the module-level ``pipeline``,
    and explained with LIME. Up to three words whose LIME weight favours the
    predicted class are printed, strongest first as ordered by ``as_list()``.

    Args:
        review_text: The raw review string.

    Returns:
        None. All results are printed.
    """
    # Preprocess the review
    processed_review = preprocess_text(review_text)

    # Get prediction
    prediction = pipeline.predict([processed_review])[0]
    prediction_prob = pipeline.predict_proba([processed_review])[0]

    # Get LIME explanation
    lime_exp = explain_with_lime(processed_review, pipeline)

    # as_list() reports weights toward the Positive class (label 1) by default.
    # For a Negative prediction the sign is flipped so a positive number always
    # means "pushes toward the predicted class"; with two classes the weights
    # toward class 0 are the negation of those toward class 1.
    direction = 1 if prediction == 1 else -1
    supporting = [
        (feature, direction * weight)
        for feature, weight in lime_exp.as_list()
        if direction * weight > 0
    ]

    print(f"Review: {review_text}")
    print(f"\nPrediction: {'Positive' if prediction == 1 else 'Negative'}")
    print(f"Confidence: {max(prediction_prob):.2f}")

    print("\nLIME Explanation:")
    print("Top features supporting this prediction:")
    for feature, weight in supporting[:3]:
        print(f"- {feature}: {weight:.3f}")
    if not supporting:
        # Every explained word pulled away from the predicted class.
        print("- (none of the explained features favour this prediction)")

In [11]:
# Aggregate LIME weights over a batch of reviews
def analyze_multiple_reviews(reviews):
    """Average each word's LIME weight across a collection of reviews.

    Every review is preprocessed and explained with LIME against the
    module-level ``pipeline``; the weights reported for each word are then
    averaged over all explanations in which that word appears.

    Args:
        reviews: Iterable of raw review strings.

    Returns:
        Dict mapping each explained word to the mean of its LIME weights.
    """
    weights_by_feature = {}

    for raw_review in reviews:
        explanation = explain_with_lime(preprocess_text(raw_review), pipeline)
        # Bucket this review's weights under their words
        for token, weight in explanation.as_list():
            weights_by_feature.setdefault(token, []).append(weight)

    return {
        token: np.mean(values)
        for token, values in weights_by_feature.items()
    }

In [12]:
# --- Single-review walkthrough ---
print("Single Review Analysis:")
test_review = "The doctor was very professional and the treatment was excellent"
analyze_healthcare_review(test_review)

# --- Corpus-level view: mean LIME weight per word over the sample reviews ---
print("\nGlobal Feature Importance Analysis:")
avg_importance = analyze_multiple_reviews(healthcare_reviews)

print("\nTop 5 Most Important Features Across All Reviews:")
# Rank by magnitude so strongly negative words surface alongside positive ones
sorted_features = sorted(avg_importance.items(), key=lambda item: abs(item[1]), reverse=True)[:5]

for feature, importance in sorted_features:
    print(f"- {feature}: {importance:.3f}")

Single Review Analysis:
Review: The doctor was very professional and the treatment was excellent

Prediction: Positive
Confidence: 0.75

LIME Explanation:
Top features supporting this prediction:
- was: 0.120
- professional: 0.083
- very: 0.070

Global Feature Importance Analysis:

Top 5 Most Important Features Across All Reviews:
- was: 0.110
- professional: 0.093
- very: 0.072
- were: -0.061
- nurses: -0.054
