# Data Exploration Notebook

This notebook provides interactive tools for exploring and querying the processed ethnographic notes generated by `supernote_parser.py`.

In [None]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Define the directory where processed JSON notes are stored
processed_notes_dir = '../research/processed_notes'


def flatten_note(note):
    """Merge a note's 'metadata' and 'protocol_steps' mappings into one flat row.

    Args:
        note: A dictionary loaded from one processed-note JSON file.

    Returns:
        A new dict containing the keys of ``note['metadata']`` followed by the
        keys of ``note['protocol_steps']`` (the latter win on a name clash).
        A section that is missing, null, or not a mapping contributes nothing.
    """
    row = {}
    for section in ('metadata', 'protocol_steps'):
        values = note.get(section)
        # A JSON `null` (or any non-mapping) cannot be unpacked into a row.
        if isinstance(values, dict):
            row.update(values)
    return row


def load_notes(directory, filenames):
    """Load the given JSON files from `directory`, skipping unreadable ones.

    Args:
        directory: Path of the folder containing the files.
        filenames: Names of the JSON files to load, in the desired order.

    Returns:
        A list of the decoded notes (dicts). Files that cannot be read or
        decoded, or whose top-level JSON value is not an object, are reported
        with a printed warning and left out instead of aborting the load.
    """
    notes = []
    for filename in filenames:
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                note = json.load(f)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"Warning: skipping '{filename}': {err}")
            continue
        if not isinstance(note, dict):
            print(f"Warning: skipping '{filename}': top-level JSON value is not an object.")
            continue
        notes.append(note)
    return notes


# Ensure the directory exists (isdir also rejects a plain file at that path)
if not os.path.isdir(processed_notes_dir):
    print(f"Error: Processed notes directory not found at '{processed_notes_dir}'")
    print("Please make sure you have run the supernote_parser.py script first.")
else:
    all_notes_data = []
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the DataFrame is reproducible between runs and machines.
    json_files = sorted(f for f in os.listdir(processed_notes_dir) if f.endswith('.json'))

    if not json_files:
        print(f"No processed JSON files found in '{processed_notes_dir}'.")
        print("Please make sure you have run the supernote_parser.py script with notes containing frontmatter.")
    else:
        all_notes_data = load_notes(processed_notes_dir, json_files)

        # Convert the list of note dictionaries into a Pandas DataFrame
        # Flatten metadata and protocol_steps for easier DataFrame creation
        flattened_data = [flatten_note(note) for note in all_notes_data]

        df = pd.DataFrame(flattened_data)

        print(f"Loaded {len(df)} processed notes into DataFrame.")
        print("DataFrame Head:")
        display(df.head())
        print("\nDataFrame Info:")
        df.info()


## Explore Metadata

In [None]:
def notes_per_month(dates):
    """Count notes per calendar month, including months without any note.

    Args:
        dates: Date-like values (strings or datetimes), one per note.

    Returns:
        An integer Series indexed by monthly periods that spans every month
        from the earliest to the latest parseable date. Values that cannot be
        parsed as dates are ignored; the result is empty if none can be parsed.
    """
    parsed = pd.to_datetime(pd.Series(dates), errors='coerce').dropna()
    if parsed.empty:
        return pd.Series(dtype='int64')
    months = parsed.dt.to_period('M')
    # Reindex over the full range so gaps show up as zero-height bars.
    every_month = pd.period_range(months.min(), months.max(), freq='M')
    return months.value_counts().reindex(every_month, fill_value=0)


if 'df' in locals():
    # Each column is optional: it only exists if at least one note's
    # metadata carried that key, so guard every access.
    if 'location' in df.columns:
        print("Unique Locations:", df['location'].unique())
    else:
        print("Unique Locations: (no 'location' column in the loaded notes)")

    if 'tags' in df.columns:
        print("\nUnique Tags:", df['tags'].explode().unique())
    else:
        print("\nUnique Tags: (no 'tags' column in the loaded notes)")

    if 'source' in df.columns:
        print("\nNotes by Source:")
        display(df['source'].value_counts())
    else:
        print("\nNotes by Source: (no 'source' column in the loaded notes)")

    # Convert 'date' to datetime objects if it exists and is not already
    if 'date' in df.columns:
        raw_dates = df['date']
        # Coerce instead of raising so one malformed date cannot abort the cell.
        df['date'] = pd.to_datetime(raw_dates, errors='coerce')
        unparseable = int((df['date'].isna() & raw_dates.notna()).sum())
        if unparseable:
            print(f"\nWarning: {unparseable} note(s) have a date that could not be parsed.")

        monthly_counts = notes_per_month(df['date'])
        if monthly_counts.empty:
            print("\nNo dated notes to plot over time.")
        else:
            print("\nNotes over time (by month):")
            # String labels such as '2023-01' read better than full timestamps.
            monthly_counts.index = monthly_counts.index.astype(str)
            ax = monthly_counts.plot(kind='bar', figsize=(10, 4))
            ax.set_xlabel('Month')
            ax.set_ylabel('Number of Notes')
            plt.tight_layout()
            plt.show()


## Visualizations

In [None]:
def tag_frequencies(frame):
    """Return how often each tag occurs across all notes.

    Args:
        frame: DataFrame of notes; its optional 'tags' column holds a list of
            tags (or a missing value) per note.

    Returns:
        A dict mapping each tag (as a stripped string) to its number of
        occurrences. Missing values and blank tags are ignored; the dict is
        empty when there is no 'tags' column or no tag at all.
    """
    if 'tags' not in frame.columns:
        return {}
    # dropna() before astype(str) keeps missing values from turning into the
    # literal tag "nan".
    tags = frame['tags'].explode().dropna().astype(str).str.strip()
    return tags[tags != ''].value_counts().to_dict()


def count_notes_with_content(frame, steps):
    """Count, for each protocol step, the notes that have content for it.

    Args:
        frame: DataFrame of notes with (optionally) one column per step.
        steps: Names of the protocol-step columns to count, in display order.

    Returns:
        An integer Series indexed by `steps`. A note counts for a step when
        its value is present and truthy; a step whose column does not exist
        in `frame` gets a count of 0.
    """
    counts = {}
    for step in steps:
        if step in frame.columns:
            # NaN is truthy in Python, so missing values must be dropped
            # before testing for content or every gap is counted as a note.
            counts[step] = sum(bool(value) for value in frame[step].dropna())
        else:
            counts[step] = 0
    return pd.Series(counts, dtype='int64')


if 'df' in locals():
    # --- Tag Cloud ---
    # Build the cloud from tag counts rather than from joined text: text-based
    # generation re-tokenizes the input, which splits multi-word tags and
    # drops tags that happen to be stopwords.
    tag_counts = tag_frequencies(df)
    if tag_counts:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(tag_counts)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Tag Cloud')
        plt.show()
    else:
        print("No tags found for Tag Cloud.")

    # --- Notes by Protocol Step ---
    protocol_steps_columns = ['Purification', 'Containment', 'Anchoring', 'Dissolution', 'Liminality', 'Encounter', 'Integration', 'Emergence']
    step_counts = count_notes_with_content(df, protocol_steps_columns)

    # Only plot when at least one protocol-step column was actually loaded.
    if any(step in df.columns for step in protocol_steps_columns):
        plt.figure(figsize=(12, 6))
        step_counts.plot(kind='bar')
        plt.title('Number of Notes per Protocol Step (Presence of Content)')
        plt.xlabel('Protocol Step')
        plt.ylabel('Number of Notes with Content')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
    else:
        print("No protocol step data found for visualization.")


## Query Notes

In [None]:
def notes_with_tag(frame, tag):
    """Return the rows of `frame` whose 'tags' list contains `tag`.

    Args:
        frame: DataFrame of notes; its optional 'tags' column holds a list of
            tags (or a missing value) per note.
        tag: The tag to look for (exact match).

    Returns:
        The matching rows as a DataFrame; empty (with the same columns) when
        nothing matches or when `frame` has no 'tags' column.
    """
    if 'tags' not in frame.columns:
        return frame.iloc[0:0]
    has_tag = frame['tags'].apply(lambda tags: isinstance(tags, list) and tag in tags)
    # astype(bool) keeps the mask boolean even when the frame has no rows.
    return frame[has_tag.astype(bool)]


def step_text(note, step, default='No content for this step.'):
    """Return the content of one protocol step of a note, or `default`.

    Args:
        note: One row of the notes DataFrame (a Series) or a dict.
        step: Name of the protocol step to read.
        default: Text returned when the note has no content for the step.

    Returns:
        The stored value, or `default` when the step is absent, is a missing
        value (None/NaN), or is a blank string.
    """
    value = note.get(step)
    if value is None:
        return default
    # A column that exists for other notes holds NaN here, which .get()
    # returns as-is instead of falling back to a default.
    if isinstance(value, float) and value != value:
        return default
    if isinstance(value, str) and not value.strip():
        return default
    return value


if 'df' in locals():
    # Example: Find notes related to 'neuroscience' tag
    print("Notes tagged 'neuroscience':")
    neuroscience_notes = notes_with_tag(df, 'neuroscience')
    # Show only the summary columns that were actually loaded.
    summary_columns = [
        column for column in ('original_file', 'date', 'location')
        if column in neuroscience_notes.columns
    ]
    display(neuroscience_notes[summary_columns])

    # Example: View the full content of a specific note's 'Purification' step
    if not neuroscience_notes.empty:
        first_neuro_note = neuroscience_notes.iloc[0]
        print(f"\nContent of Purification step for {first_neuro_note.get('original_file', '(unknown file)')}:")
        print(step_text(first_neuro_note, 'Purification'))
    else:
        print("\nNo neuroscience notes to display details for.")
