In [6]:
import sys
from pathlib import Path
# Parent of the current working directory, added to the import path so the
# `src.*` imports in the following cells can resolve.
# NOTE(review): assumes the kernel is started from a directory directly under
# the repository root — confirm.
true_root_dir = Path().resolve().parent
_root_entry = str(true_root_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

In [None]:
from src.preprocessing.guidelines import EntityGuidelines
from src.renal_biopsy.preprocessor import RenalBiopsyProcessor

# Data locations, built as plain strings relative to the repository root.
root_data_dir = "src/renal_biopsy/data"
data_dir = f"{true_root_dir}/{root_data_dir}"
full_data_path = f"{data_dir}/full_data.xlsx"

# The processor is configured from the entity guidelines spreadsheet.
guidelines = EntityGuidelines(f'{data_dir}/guidelines.xlsx')
processor = RenalBiopsyProcessor(guidelines=guidelines)

# Build the input JSON from the full dataset.
# NOTE(review): presumably also written to `save_path` — confirm in the processor.
input_json = processor.create_input_json(
    data_path=full_data_path,
    save_path=f"{data_dir}/input.json",
    full=True,
)

# Segment every report, then extract the two sections named below.
segmented_reports = processor.process_all_reports_real(full_data_path)
required_sections = ['MICROSCOPY', 'CONCLUSION']
valid_sections = processor.extract_valid_sections(
    segmented_reports,
    required_sections=required_sections,
)
filtered_reports, microscopy_sections, conclusion_sections = valid_sections

In [None]:
from src.preprocessing.eda import MedicalReportEDA

eda = MedicalReportEDA()

# Section-length analysis over the segmented reports, skipping the
# 'entity_key' entry. Kept under its own name so the result is not overwritten
# by the report statistics below.
section_length_stats = eda.analyse_section_lengths(
    segmented_reports,
    exclude_keys=['entity_key'],
)

# Report-level statistics restricted to the two sections used downstream.
stats = eda.calculate_report_statistics(
    reports=segmented_reports,
    section_keys=['MICROSCOPY', 'CONCLUSION']
)
# Alternative: statistics over all sections
# stats = eda.calculate_report_statistics(reports=segmented_reports)

In [None]:
# Run the word-distribution analysis once per extracted section. The second
# call previously repeated the microscopy analysis verbatim, leaving
# `conclusion_sections` unanalysed.
eda.analyse_word_distributions(microscopy_sections, f'Microscopy Section (n={len(microscopy_sections)})')
eda.analyse_word_distributions(conclusion_sections, f'Conclusion Section (n={len(conclusion_sections)})')


In [None]:
# number of patients
import pandas as pd
# Load the full spreadsheet and count the distinct `project_id` values.
# NOTE(review): treats one project_id as one patient — confirm; `unique()`
# keeps a missing value as its own entry, so blanks would count once.
full_data_xlsx = f"{true_root_dir}/{root_data_dir}/full_data.xlsx"
sample_data = pd.read_excel(full_data_xlsx)
unique_project_ids = sample_data['project_id'].unique()
len(unique_project_ids)

In [None]:
import Levenshtein as lev

# Terms to highlight in the frequency analysis (also reused by the
# misspelling check in the next cell).
highlight_words = [
    'glomeruli',
    'medulla',
    'cortex',
    'fibrosis',
    'sclerosed',
    'chronic',
    'interstitial',
    'tubular',
    'atrophy',
]

# Word frequencies for the MICROSCOPY section, limited to 40 terms.
section_key = 'MICROSCOPY'
word_freq_df = eda.analyse_word_frequencies_spacy(
    segmented_reports,
    section_key,
    highlight_words=highlight_words,
    n_terms=40,
)

In [None]:
# Flag candidate misspellings of the highlighted terms: a word in the
# frequency table is a candidate when its Levenshtein distance to the term is
# at most `max_edit_distance` (the term itself, at distance 0, also matches).
max_edit_distance = 3
misspellings = {
    correct_word: word_freq_df['Word'].apply(
        lambda candidate: lev.distance(candidate, correct_word) <= max_edit_distance
    )
    for correct_word in highlight_words
}

# Show the matching rows of the frequency table for each term.
for target_word, is_match in misspellings.items():
    print(f"\nPossible misspellings of '{target_word}':")
    print(word_freq_df[is_match])

In [None]:
# Extra stop words passed to the TF-IDF analysis.
custom_stop_words = [
    'and', 'are', 'but', 'in', 'is',
    'no', 'of', 'the', 'there', 'with',
    'seen', 'show', 'shows', 'to', 'which',
]

# TF-IDF over the first 40 microscopy sections, limited to 30 terms.
tfidf_sample = microscopy_sections[:40]
eda.analyse_tfidf(tfidf_sample, n_terms=30, custom_stop_words=custom_stop_words)