# Data Exploration for Bengali Medical Chatbot

This notebook explores the available datasets for training the Bengali medical chatbot.

## Datasets:
1. Kaggle AI Medical Chatbot Dataset
2. Medical Chatbot Dataset (Sarfaraz)
3. BanglaHealth Paraphrase Dataset
4. Medical Terms Dictionary

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Add src to path
# The relative entry is resolved against the current working directory, so the
# notebook must be run from a folder whose sibling is `src`.
# NOTE(review): presumably `data` and `utils` below are packages under ../src — confirm.
sys.path.append('../src')

# Project-local imports; they must come after the sys.path tweak above.
from data.preprocessor import MedicalDataPreprocessor
from utils.logger import setup_logger

# Logger for this notebook, named after the running module (__name__).
logger = setup_logger(__name__)

# Set up plotting
# NOTE: 'seaborn-v0_8' is the style name used by Matplotlib 3.6 and newer;
# older Matplotlib releases expose the same style as 'seaborn'.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Setup complete!")

## 1. Load Datasets

In [None]:
# Folder that holds the raw, unprocessed dataset files
data_dir = Path('../data/raw')

# Every table that loads successfully is stored here under a short key
datasets = {}

# One row per tabular source:
# (key in `datasets`, label used in the status messages, file name, pandas reader)
tabular_sources = [
    ('kaggle_medical_1', 'Kaggle Medical Dataset 1', 'ai_medical_chatbot.csv', pd.read_csv),
    ('kaggle_medical_2', 'Kaggle Medical Dataset 2', 'medical_chatbot_dataset.csv', pd.read_csv),
    ('bangla_health', 'BanglaHealth Dataset', 'bangla_health_paraphrases.json', pd.read_json),
]

for dataset_key, dataset_label, file_name, read_table in tabular_sources:
    try:
        datasets[dataset_key] = read_table(data_dir / file_name)
        print(f"✓ Loaded {dataset_label}: {datasets[dataset_key].shape}")
    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining sources still load
        print(f"✗ {dataset_label} not found")

# The medical terms dictionary is plain JSON, so it is read with the json
# module; an empty dict stands in when the file is absent.
try:
    with open(data_dir / 'medical_terms_en_bn.json', 'r', encoding='utf-8') as terms_file:
        medical_terms = json.load(terms_file)
except FileNotFoundError:
    print("✗ Medical Terms Dictionary not found")
    medical_terms = {}
else:
    print(f"✓ Loaded Medical Terms Dictionary: {len(medical_terms)} terms")

## 2. Dataset Overview

In [None]:
# Overview of all datasets: shape, column names, memory footprint and the
# first value of the first two columns.
print("Dataset Overview:")
print("=" * 50)

for name, df in datasets.items():
    print(f"\n{name.upper()}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    # deep=True also counts the Python string objects held by object columns
    print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Show sample data
    print("  Sample data:")
    if len(df) == 0:
        # A dataset without rows (e.g. a header-only CSV) has no first record;
        # `iloc[0]` would raise IndexError, so report it and move on.
        print("    (no rows)")
        continue

    for col in df.columns[:2]:  # Show first 2 columns
        sample_value = str(df[col].iloc[0])
        # Keep the overview compact: cut long values to 100 characters
        if len(sample_value) > 100:
            sample_value = sample_value[:100] + "..."
        print(f"    {col}: {sample_value}")

## 3. Data Quality Analysis

In [None]:
# Initialize preprocessor for quality analysis
# The instance is kept at notebook scope: detect_languages_in_dataset (defined
# in the language-detection cell) reads it as a global and calls its
# detect_language method on individual text values.
preprocessor = MedicalDataPreprocessor()

def analyze_data_quality(df, name):
    """Print a data-quality report for one dataset.

    The report covers missing values per column, fully duplicated rows and,
    for every object-dtype column, text-length statistics and the number of
    unique values.

    Args:
        df: pandas DataFrame to inspect.
        name: Dataset label; printed upper-cased in the report header.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\nData Quality Analysis - {name.upper()}:")
    print("-" * 40)

    row_count = len(df)

    def _pct(count):
        # An empty dataset has no meaningful ratio; report 0.0 instead of nan.
        return count / row_count * 100 if row_count else 0.0

    # Missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print("Missing values:")
        for col, count in missing[missing > 0].items():
            print(f"  {col}: {count} ({_pct(count):.1f}%)")
    else:
        print("✓ No missing values")

    # Duplicates
    duplicates = df.duplicated().sum()
    print(f"Duplicate rows: {duplicates} ({_pct(duplicates):.1f}%)")

    # Text columns analysis (object dtype is treated as text)
    for col in df.select_dtypes(include=['object']).columns:
        print(f"\n{col} statistics:")

        # Drop missing entries first: casting them to str would turn them into
        # the literal text 'nan'/'None' and distort the length statistics.
        present = df[col].dropna()
        if present.empty:
            print("  No non-missing values")
            continue

        lengths = present.astype(str).str.len()
        unique_count = df[col].nunique()
        print(f"  Length - Mean: {lengths.mean():.1f}, Median: {lengths.median():.1f}")
        print(f"  Length - Min: {lengths.min()}, Max: {lengths.max()}")
        print(f"  Unique values: {unique_count} ({_pct(unique_count):.1f}%)")

# Analyze each dataset
# Prints one quality report per loaded dataset, in the insertion order of
# `datasets`; nothing is printed when no dataset was loaded.
for name, df in datasets.items():
    analyze_data_quality(df, name)

## 4. Language Detection Analysis

In [None]:
def detect_languages_in_dataset(df, text_columns):
    """Count the detected language labels for each requested text column.

    Every requested column that exists in ``df`` is cast to ``str`` and each
    value is passed to ``preprocessor.detect_language`` (the notebook-level
    preprocessor instance). Requested columns that are absent from ``df`` are
    skipped.

    Args:
        df: pandas DataFrame holding the text columns.
        text_columns: Iterable of column names to analyse.

    Returns:
        Dict mapping column name to the ``value_counts()`` Series of the
        labels returned by the detector.
    """
    available = [column for column in text_columns if column in df.columns]
    return {
        column: df[column].astype(str).apply(preprocessor.detect_language).value_counts()
        for column in available
    }

# Report, for every loaded dataset, which languages were detected in its
# leading text columns
print("Language Distribution Analysis:")
print("=" * 50)

for name, df in datasets.items():
    print(f"\n{name.upper()}:")

    # Object-dtype columns are treated as the text columns
    text_cols = df.select_dtypes(include=['object']).columns.tolist()

    if not text_cols:
        print("  No text columns found")
        continue

    # Only the first two text columns are analysed
    lang_stats = detect_languages_in_dataset(df, text_cols[:2])

    for col, stats in lang_stats.items():
        print(f"  {col}:")
        total = stats.sum()
        for lang, count in stats.items():
            share = count / total * 100
            print(f"    {lang}: {count} ({share:.1f}%)")

## 5. Medical Content Analysis

In [None]:
def analyze_medical_content(df, text_columns, medical_terms):
    """Measure how many dictionary terms occur in each text of the given columns.

    A term counts once per text when it appears as a case-insensitive
    substring. Both the keys and the values of ``medical_terms`` are used as
    search terms; empty terms are ignored and terms that are identical after
    lower-casing are counted once, so an entry whose key and value coincide
    does not inflate the totals.

    Cells are cast to ``str`` before matching, so missing values take part as
    their string form and stay in the percentage denominator.

    Args:
        df: pandas DataFrame holding the text columns.
        text_columns: Iterable of column names to analyse; names that are not
            in ``df`` are skipped.
        medical_terms: Dict whose keys and values are term strings.

    Returns:
        Dict mapping column name to a dict with ``mean_terms``,
        ``median_terms``, ``max_terms``, ``texts_with_medical`` and
        ``medical_percentage``. A column without rows gets all-zero statistics.
    """
    # Lower-case every term once, outside the per-text loop; dict.fromkeys
    # removes duplicates while keeping the original order.
    raw_terms = list(medical_terms.keys()) + list(medical_terms.values())
    search_terms = list(dict.fromkeys(term.lower() for term in raw_terms if term))

    medical_stats = {}

    for col in text_columns:
        if col not in df.columns:
            continue

        # Number of distinct dictionary terms found in each text
        medical_counts = []
        for text in df[col].astype(str):
            text_lower = text.lower()
            medical_counts.append(sum(1 for term in search_terms if term in text_lower))

        if not medical_counts:
            # np.max raises on an empty sequence and the percentage would
            # divide by zero, so an empty column is reported as all zeros.
            medical_stats[col] = {
                'mean_terms': 0.0,
                'median_terms': 0.0,
                'max_terms': 0,
                'texts_with_medical': 0,
                'medical_percentage': 0.0
            }
            continue

        texts_with_medical = sum(1 for count in medical_counts if count > 0)
        medical_stats[col] = {
            'mean_terms': np.mean(medical_counts),
            'median_terms': np.median(medical_counts),
            'max_terms': np.max(medical_counts),
            'texts_with_medical': texts_with_medical,
            'medical_percentage': texts_with_medical / len(medical_counts) * 100
        }

    return medical_stats

# Report how much medical vocabulary each dataset's leading text columns contain
print("Medical Content Analysis:")
print("=" * 50)

for name, df in datasets.items():
    print(f"\n{name.upper()}:")

    # Object-dtype columns are treated as the text columns
    text_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Both a text column and a non-empty terms dictionary are required
    if not (text_cols and medical_terms):
        print("  No text columns or medical terms dictionary available")
        continue

    # Only the first two text columns are analysed
    med_stats = analyze_medical_content(df, text_cols[:2], medical_terms)

    for col, stats in med_stats.items():
        with_medical = stats['texts_with_medical']
        print(f"  {col}:")
        print(f"    Avg medical terms per text: {stats['mean_terms']:.2f}")
        print(f"    Texts with medical content: {with_medical} ({stats['medical_percentage']:.1f}%)")
        print(f"    Max medical terms in single text: {stats['max_terms']}")

## 6. Visualization

In [None]:
# Create visualizations: a 2x2 grid summarising the loaded data.
# A panel stays empty when the data it needs is not available.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Dataset Analysis Visualizations', fontsize=16)

# 1. Dataset sizes
if datasets:
    dataset_sizes = {name: len(df) for name, df in datasets.items()}
    axes[0, 0].bar(list(dataset_sizes.keys()), list(dataset_sizes.values()))
    axes[0, 0].set_title('Dataset Sizes')
    axes[0, 0].set_ylabel('Number of Records')
    axes[0, 0].tick_params(axis='x', rotation=45)

# 2. Text length distribution (for first dataset with text)
# Walk the datasets until one actually has an object-dtype column; taking the
# first dataset unconditionally raises IndexError when it has no text column.
for df in datasets.values():
    object_cols = df.select_dtypes(include=['object']).columns
    if len(object_cols) == 0:
        continue
    text_col = object_cols[0]
    lengths = df[text_col].astype(str).str.len()
    axes[0, 1].hist(lengths, bins=50, alpha=0.7)
    axes[0, 1].set_title(f'Text Length Distribution ({text_col})')
    axes[0, 1].set_xlabel('Text Length')
    axes[0, 1].set_ylabel('Frequency')
    break

# 3. Medical terms length
if medical_terms:
    # Show the 10 longest dictionary keys, measured in characters
    term_lengths = {term: len(term) for term in medical_terms.keys()}
    sorted_terms = sorted(term_lengths.items(), key=lambda x: x[1], reverse=True)[:10]
    # Separate names so the text-length series `lengths` above is not shadowed
    top_terms, top_term_lengths = zip(*sorted_terms)
    axes[1, 0].barh(top_terms, top_term_lengths)
    axes[1, 0].set_title('Medical Terms Length (Top 10)')
    axes[1, 0].set_xlabel('Term Length')

# 4. Dataset column counts
if datasets:
    column_counts = {name: len(df.columns) for name, df in datasets.items()}
    axes[1, 1].bar(list(column_counts.keys()), list(column_counts.values()))
    axes[1, 1].set_title('Number of Columns per Dataset')
    axes[1, 1].set_ylabel('Number of Columns')
    axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Sample Data Inspection

In [None]:
# Print the leading records of every dataset, one field per line
print("Sample Data Inspection:")
print("=" * 50)

for name, df in datasets.items():
    print(f"\n{name.upper()} - Sample Records:")
    print("-" * 30)

    # At most the first three rows are shown
    for i in range(min(3, len(df))):
        print(f"\nRecord {i+1}:")
        row = df.iloc[i]
        for col in df.columns:
            value = str(row[col])
            # Values longer than 200 characters are cut and marked with "..."
            value = value[:200] + "..." if len(value) > 200 else value
            print(f"  {col}: {value}")

## 8. Data Preparation Recommendations

In [None]:
# Derive dataset-specific warnings from simple thresholds, then print them
# together with a fixed list of general recommendations.
print("Data Preparation Recommendations:")
print("=" * 50)

recommendations = []

for name, df in datasets.items():
    # Check for missing values: share of empty cells over the whole table.
    # A table without cells is reported as 0% instead of dividing by zero.
    total_cells = len(df) * len(df.columns)
    missing_pct = (df.isnull().sum().sum() / total_cells) * 100 if total_cells else 0.0
    if missing_pct > 5:
        recommendations.append(f"⚠️  {name}: High missing values ({missing_pct:.1f}%) - implement imputation strategy")

    # Check for duplicates
    dup_pct = (df.duplicated().sum() / len(df)) * 100 if len(df) else 0.0
    if dup_pct > 10:
        recommendations.append(f"⚠️  {name}: High duplicate rate ({dup_pct:.1f}%) - implement deduplication")

    # Check text length distribution
    text_cols = df.select_dtypes(include=['object']).columns
    for col in text_cols:
        # Missing cells are excluded: cast to str they become the text
        # 'nan'/'None' and would always trigger the "very short" warning.
        # They are already covered by the missing-value check above.
        lengths = df[col].dropna().astype(str).str.len()
        if lengths.empty:
            continue
        if lengths.max() > 2000:
            recommendations.append(f"⚠️  {name}.{col}: Very long texts (max: {lengths.max()}) - consider truncation")
        if lengths.min() < 10:
            recommendations.append(f"⚠️  {name}.{col}: Very short texts (min: {lengths.min()}) - consider filtering")

# General recommendations
general_recommendations = [
    "✅ Implement comprehensive text cleaning pipeline",
    "✅ Create stratified train/validation/test splits",
    "✅ Implement quality filtering based on text length and medical content",
    "✅ Set up translation pipeline for English datasets",
    "✅ Implement cultural adaptation for Bengali medical terms",
    "✅ Create data augmentation strategy for balanced training",
    "✅ Implement medical accuracy validation with expert review"
]

print("\nSpecific Issues Found:")
if recommendations:
    for rec in recommendations:
        print(rec)
else:
    print("✅ No major data quality issues detected")

print("\nGeneral Recommendations:")
for rec in general_recommendations:
    print(rec)

print("\n" + "=" * 50)
print("Data exploration complete! Ready for preprocessing phase.")

## 9. Export Summary Statistics

In [None]:
# Create summary statistics for export
summary_stats = {
    'datasets': {},
    'medical_terms_count': len(medical_terms),
    'total_records': sum(len(df) for df in datasets.values()),
    'recommendations': recommendations + general_recommendations
}

for name, df in datasets.items():
    text_cols = df.select_dtypes(include=['object']).columns.tolist()

    # pandas returns NumPy scalars; NumPy integers are not JSON-native and
    # would be written as strings by the `default=str` fallback below, so the
    # statistics are converted to plain int/float here.
    summary_stats['datasets'][name] = {
        'shape': df.shape,
        'columns': list(df.columns),
        'missing_values': int(df.isnull().sum().sum()),
        'duplicates': int(df.duplicated().sum()),
        'text_columns': text_cols,
        'memory_mb': float(df.memory_usage(deep=True).sum() / 1024**2)
    }

    if text_cols:
        first_text_col = text_cols[0]
        # Missing cells are excluded; cast to str they would be measured as
        # the text 'nan'/'None'.
        lengths = df[first_text_col].dropna().astype(str).str.len()
        # Without any text the statistics would be NaN, which is not valid
        # standard JSON, so `text_stats` is omitted in that case.
        if not lengths.empty:
            summary_stats['datasets'][name]['text_stats'] = {
                'mean_length': float(lengths.mean()),
                'median_length': float(lengths.median()),
                'min_length': int(lengths.min()),
                'max_length': int(lengths.max())
            }

# Save summary
output_dir = Path('../experiments/results')
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / 'data_exploration_summary.json', 'w', encoding='utf-8') as f:
    # `default=str` stays as a last-resort fallback for any remaining value
    # that json cannot encode natively (e.g. non-string column labels).
    json.dump(summary_stats, f, indent=2, ensure_ascii=False, default=str)

print(f"✅ Summary statistics saved to: {output_dir / 'data_exploration_summary.json'}")
print(f"\nTotal datasets analyzed: {len(datasets)}")
print(f"Total records across all datasets: {summary_stats['total_records']:,}")
print(f"Medical terms in dictionary: {len(medical_terms)}")