<a href="https://colab.research.google.com/github/mkywall/crucible-analysis-notebooks/blob/main/general/summer_school_data_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crucible Tutorial


## Part 1: Setup
- Install the crucible python client
- Import packages
- Retrieve your personal Crucible API key
- Initialize your client

#### Install the client from GitHub

In [None]:
!pip install git+https://github.com/MolecularFoundryCrucible/pycrucible.git

#### Import packages

In [None]:
import os
import json
import pprint
import uuid
from typing import List, Dict
from datetime import datetime
import h5py
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

from pycrucible import CrucibleClient, SecureInput

#### Retrieve your API key

In your web browser navigate to https://crucible.lbl.gov/testapi/user_apikey.

You will be prompted to log in with your ORCID account.  Log in.

Run the cell below and copy your resulting API key into the box!

**Note:** If you do not have an ORCID, you can easily create one here: https://orcid.org/register

In [None]:
SecureInput(description = "Enter your API key:", var_name = 'CRUCIBLE_API_KEY')

#### Initialize the client

In [None]:
# Crucible API endpoint used throughout this tutorial
API_URL = "https://crucible.lbl.gov/testapi"
# Read the key from the environment; os.environ.get returns None when
# CRUCIBLE_API_KEY is not set (for example if the cell above was skipped)
API_KEY = os.environ.get("CRUCIBLE_API_KEY")

# Initialize the client
client = CrucibleClient(API_URL, API_KEY)
# NOTE(review): this message only confirms the client object was constructed;
# whether the key is validated at construction time is not visible here — confirm
print("Crucible client initialized successfully!")

## Part 2: Use the client to work with a batch of perovskite data
For this demo we will be using data from a batch of perovskite wafers generated by Yi-Ru.  The batch is named `S-pMeMBAI-pre-2` and has the unique id `0t3h7ymbm5s27000z6tt82zvx4`.



##### Query the Data

In [None]:
# set the batch_id as a variable
batch_id = '0t3h7ymbm5s27000z6tt82zvx4'

In [None]:
# list all of the samples associated with this batch
client.list_samples(parent_id = batch_id)

In [None]:
# list all of the datasets associated with this batch
client.list_datasets(sample_id = batch_id)

##### Download data files

After running the following cell, you can navigate to the file system by clicking the folder icon in the Colab sidebar.  You should see a folder titled "crucible-downloads" that will contain all of the files you just downloaded.

In [None]:
# Only the first two datasets of the batch are fetched
batch_datasets = client.list_datasets(sample_id=batch_id)[:2]
for ds in batch_datasets:
    # show the full dataset record before attempting the download
    pprint.pprint(ds)
    try:
        client.download_dataset(dsid=ds['unique_id'])
    except Exception as err:
        # best-effort: report the problem and move on to the next dataset
        print(err)
    else:
        print('downloaded')

## Part 3: Working with ScopeFoundry hdf5 files

### Open the file and get your bearings

In [None]:
# Choose a file to work with
sample_file = 'crucible-downloads/yrliu98_S-pMeMBAI-pre-2_1_1_run3_spec_run.h5'

In [None]:
# Opening the file
# ('r' opens it read-only; the `with` block closes the file when it ends)
with h5py.File(sample_file, 'r') as f:
  # groups within the file object
  print(f.keys())

  # attributes of the file object (the file object is the "root group")
  print(f.attrs.keys())

  # every group has a name (the name is the key)
  # iterating over the file object yields the names of its top-level members
  for group_name in f:
    print(group_name)

In [None]:
# The App Group

with h5py.File(sample_file, 'r') as f:
  app = f['app']

  # groups within the app group
  print(list(app.keys()))

  # attributes of the app group
  print(list(app.attrs.keys()))

  # print the settings group attributes for the app, one "name value" pair per line
  print("\n\napp settings: ")
  # a plain loop: we only want the printing side effect, not a list of results
  for k, v in app['settings'].attrs.items():
    print(k, v)

In [None]:
# The Hardware Group
# (same inspection as for the app group: list its members, then its attributes)

with h5py.File(sample_file, 'r') as f:
  hw = f['hardware']

  # groups within the hw group
  print(list(hw.keys()))

  # attributes of the hw group
  print(list(hw.attrs.keys()))

In [None]:
# The Measurement Group

with h5py.File(sample_file, 'r') as f:
  M = f['measurement']

  # groups within the measurement group
  print(list(M.keys()))

  # attributes of the measurement group
  print(list(M.attrs.keys()))

  # Look at the measurement sub group
  # Note that you can keep extending out key values for groups or use a file system like notation
  # (the two lines below address the same 'spec_run' group, so they print the same list)
  print(list(f['measurement']['spec_run'].keys()))
  print(list(f['measurement/spec_run'].keys()))

  # Each of the values printed is a Dataset Object that can be accessed as a numpy array
  # np.array(...) reads the dataset into memory, so `arr` remains usable after the file is closed
  arr = np.array(f['measurement/spec_run/wl_spectra'])
  # show only the first two entries along the first axis
  print(arr[0:2])

### Recurse the file systematically with the ```visititems``` function.
Instead of walking through the file by hand, we can define a function and pass it to `visititems`, which will call that function on every object in the HDF5 file, descending through all groups.

In [None]:
def explore_h5_structure(name, obj):
    """Print a one-line description of an HDF5 object, indented by its depth.

    Written as a callback for ``visititems``: ``name`` is the object's path
    and ``obj`` is the object itself.  The number of ``/`` characters in
    ``name`` sets the indentation.  Groups and datasets are reported; any
    other object type prints nothing.  Always returns None.
    """
    pad = name.count('/') * "  "
    if isinstance(obj, h5py.Group):
        print(f"{pad}{name}/ (Group)")
        return
    if isinstance(obj, h5py.Dataset):
        print(f"{pad}{name} (Dataset) - Shape: {obj.shape}, Type: {obj.dtype}")

with h5py.File(sample_file, 'r') as f:
    f.visititems(explore_h5_structure)


### Extract and plot the PL spectra

In [None]:
with h5py.File(sample_file, 'r') as h5file:

    # extract the corrected PL spectra as an array
    pl_spectra = np.array(h5file['measurement/spec_run/pl_spectra_corrected'])

    # determine how many spectra were collected
    # (the first axis indexes the spectra: the loop below plots pl_spectra[i] for each i)
    dims = pl_spectra.shape

    # extract the wavelengths as an array
    wavelengths = np.array(h5file['measurement/spec_run/pl_wls'])

    # create the plot
    fig, ax = plt.subplots(figsize=(10, 6))

    # plot one line per spectrum, labelled starting from "Spectrum 1"
    for i in range(0, dims[0]):
        ax.plot(wavelengths, pl_spectra[i], label=f"Spectrum {i+1}", linewidth=2, alpha=0.8)

    # formatting
    ax.set_xlabel('Wavelength (nm)', fontsize=12)
    ax.set_ylabel('Intensity', fontsize=12)
    # anchor the legend's upper-left corner just outside the right edge of the axes
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

### Extract and plot the UV-vis spectra

In [None]:
with h5py.File(sample_file, 'r') as h5file:

    # extract the corrected UV-vis spectra (stored under the 'wl_' prefix) as an array
    wl_spectra = np.array(h5file['measurement/spec_run/wl_spectra_corrected'])

    # determine how many spectra were collected
    # (the first axis indexes the spectra: the loop below plots wl_spectra[i] for each i)
    dims = wl_spectra.shape

    # extract the wavelengths as an array
    wavelengths = np.array(h5file['measurement/spec_run/wl_wls'])

    # create the plot
    fig, ax = plt.subplots(figsize=(10, 6))

    # plot one line per spectrum, labelled starting from "Spectrum 1"
    for i in range(0, dims[0]):
        ax.plot(wavelengths, wl_spectra[i], label=f"Spectrum {i+1}", linewidth=2, alpha=0.8)

    # formatting
    ax.set_xlabel('Wavelength (nm)', fontsize=12)
    ax.set_ylabel('Intensity', fontsize=12)
    # fix the y axis to the 0-1 range (unlike the PL plot, which autoscales)
    ax.set_ylim(0,1)
    # anchor the legend's upper-left corner just outside the right edge of the axes
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

### Extract and display the sample image

In [None]:
with h5py.File(sample_file, 'r') as h5file:

    # extract the image
    imarray = np.array(h5file['measurement/spec_run/adj_photo'])

    # create the plot
    fig, ax = plt.subplots(figsize=(8, 6))
    # use the canonical 'gray' colormap name: the 'grey' spelling is only
    # registered by recent Matplotlib releases and raises ValueError on older ones
    ax.imshow(imarray, cmap = 'gray')
    ax.set_title('Sample Photo', fontsize=14, fontweight='bold')
    # hide ticks and frame; they carry no meaning for a photo
    ax.axis('off')

### Scale this to explore the whole batch

#### Define functions for extracting the data and creating the plots

In [None]:
# Spectra plotting
def make_spectra_plot(M, s, w, title=""):
    """Draw every spectrum stored under ``M[s]`` against the wavelengths in ``M[w]``.

    Parameters
    ----------
    M : mapping-like
        Container indexed by key (an h5py group when called from
        ``get_sample_plots``).
    s : str
        Key of the spectra; each entry along the first axis becomes one line.
    w : str
        Key of the wavelength values used for the x axis.
    title : str, optional
        Text placed in front of " Spectra" in the figure title.

    Returns
    -------
    matplotlib Figure, or None when ``M[s]`` is empty.
    """
    # nothing stored under this key -> nothing to plot
    if len(M[s]) == 0:
        return None

    fig, ax = plt.subplots(figsize=(10, 6))
    spectra = np.array(M[s])
    wls = np.array(M[w])

    # one line per spectrum, numbered from 1 in the legend
    for number, trace in enumerate(spectra, start=1):
        ax.plot(wls, trace, label=f"Spectrum {number}", linewidth=2, alpha=0.8)

    ax.set_xlabel('Wavelength (nm)', fontsize=12)
    ax.set_ylabel('Intensity', fontsize=12)

    # keys containing 'wl' get a fixed 0-1 y axis; everything else autoscales
    if 'wl' in s:
        ax.set_ylim(0,1)

    ax.set_title(f'{title} Spectra', fontsize=14, fontweight='bold')
    # legend sits just outside the right edge of the axes
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig


# Generalized plot generator
def get_sample_plots(specrun_file):
    """Build the figures available for one sample file.

    Opens ``specrun_file`` read-only and looks in its
    ``measurement/spec_run`` group.  Every member whose name ends with
    ``spectra_corrected`` is plotted with ``make_spectra_plot``, provided a
    matching ``<prefix>_wls`` member exists (prefix = text before the first
    underscore).  If an ``adj_photo`` member exists it is rendered as an
    image figure.

    Returns a dict mapping member name -> matplotlib Figure.  Each figure is
    closed with ``plt.close`` after being stored, so it is kept in the dict
    without remaining open in pyplot.
    """
    figures = {}

    with h5py.File(specrun_file, 'r') as h5file:
        M = h5file["measurement/spec_run"]
        member_names = list(M.keys())

        # spectra: one figure per "*spectra_corrected" member that has wavelengths
        for key in member_names:
            if not key.endswith("spectra_corrected"):
                continue

            prefix = key.split("_")[0]
            wl_key = f'{prefix}_wls'
            if wl_key not in member_names:
                continue

            spectra_fig = make_spectra_plot(M, key, wl_key, prefix.title())
            if spectra_fig:
                figures[key] = spectra_fig
                plt.close(spectra_fig)

        # photo: shown without axes decorations
        if 'adj_photo' in member_names:
            photo_fig, photo_ax = plt.subplots(figsize=(8, 6))
            photo = np.array(M['adj_photo'])
            photo_ax.imshow(photo)
            photo_ax.set_title('Sample Photo', fontsize=14, fontweight='bold')
            photo_ax.axis('off')
            figures['adj_photo'] = photo_fig
            plt.close(photo_fig)

    return figures


# Graphics Formatting and switching between plots interactively
def display_plots_by_type(plot_type):
    """Display all samples for a specific plot type.

    Reads the notebook-level ``batch_sample_data`` dict and draws, inside the
    notebook-level ``output`` widget, a grid (up to 3 columns) with one
    subplot per sample that has a figure stored under ``plot_type``.

    Parameters
    ----------
    plot_type : str
        Key into each sample's ``'plots'`` dict, e.g. ``'adj_photo'`` or
        ``'pl_spectra_corrected'``.

    Returns
    -------
    None.  If no sample has the requested plot type a short message is
    printed instead of a figure.
    """
    with output:
        clear_output(wait=True)

        # Find all samples that have this plot type
        samples_with_plot = [
            (ds_name, plot_data['plots'][plot_type])
            for ds_name, plot_data in batch_sample_data.items()
            if 'plots' in plot_data
            and 'error' not in plot_data
            and plot_type in plot_data['plots']
        ]

        n_samples = len(samples_with_plot)
        if n_samples == 0:
            # without this guard cols would be 0 and the row computation
            # below would raise ZeroDivisionError
            print(f"No samples have a '{plot_type}' plot to display.")
            return

        cols = min(3, n_samples)
        rows = (n_samples + cols - 1) // cols  # ceiling division

        # squeeze=False always returns a 2-D array of axes, so a single
        # flatten() covers the 1x1, single-row and multi-row cases alike
        fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 4*rows), squeeze=False)
        axes = axes.flatten()
        fig.suptitle(f'{plot_type.replace("_", " ").title()} Comparison Across Samples',
                     fontsize=16, fontweight='bold')

        # Plot each sample
        for ax, (ds_name, plot_fig) in zip(axes, samples_with_plot):
            try:
                original_ax = plot_fig.axes[0]

                if plot_type == 'adj_photo':
                    # For photo data, re-draw the image stored in the original figure
                    img_data = original_ax.images[0].get_array()
                    # 'gray' is the canonical colormap name; the 'grey' spelling
                    # is only registered by recent Matplotlib releases
                    ax.imshow(img_data, cmap='gray')
                    ax.axis('off')
                else:
                    # For spectra data, copy the plot lines
                    lines = original_ax.get_lines()
                    for line in lines:
                        ax.plot(line.get_xdata(), line.get_ydata(),
                                label=line.get_label(), alpha=0.8)

                    ax.set_xlabel(original_ax.get_xlabel())
                    ax.set_ylabel(original_ax.get_ylabel())
                    # carry over the y range so a fixed range set on the
                    # original plot (make_spectra_plot uses 0-1 for 'wl'
                    # spectra) is kept in the comparison view
                    ax.set_ylim(original_ax.get_ylim())
                    ax.grid(True, alpha=0.3)

                    # Add legend if there are multiple lines
                    if len(lines) > 1:
                        ax.legend(fontsize=8)

                ax.set_title(f'{ds_name}', fontsize=12, fontweight='bold')

            except Exception:
                # best-effort display: one broken sample must not prevent the
                # others from being shown, so mark it in place and continue
                ax.text(0.5, 0.5, f'Error displaying\n {ds_name}',
                        ha='center', va='center', transform=ax.transAxes)
                ax.set_title(f'{ds_name}', fontsize=12)

        # Hide unused subplots
        for unused_ax in axes[n_samples:]:
            unused_ax.set_visible(False)

        plt.tight_layout()
        plt.show()

#### Extract the data for your batch

In [None]:
# Process all samples and create organized data structure.
# batch_sample_data maps dataset name -> {'dataset_info', 'plots', 'data_file'}
# and is read by display_plots_by_type.
batch_sample_data = {}

# batch_datasets was already limited when it was created in the download step,
# so iterate over all of it: widening that slice then widens this step too
for ds in batch_datasets:
    # downloaded files are expected under crucible-downloads/ with the
    # base name of the originally uploaded file
    data_file = os.path.join('crucible-downloads/', os.path.basename(ds['file_to_upload']))
    dataset_name = ds['dataset_name']

    # the download step only prints errors, so a file may be missing here;
    # skip it with a message instead of failing while opening it
    if not os.path.exists(data_file):
        print(f"  - Skipping {dataset_name}: {data_file} not found (was it downloaded?)")
        continue

    if data_file.endswith('.h5'):
        sample_plots = get_sample_plots(data_file)
        batch_sample_data[dataset_name] = {
            'dataset_info': ds,
            'plots': sample_plots,
            'data_file': data_file
        }

    elif data_file.endswith('.jpg'):
        print(f"  - Found batch photo: {data_file}")
        batch_photo_file = data_file


### Explore the data

In [None]:
# Plot types offered in the dropdown; each is a key that display_plots_by_type
# looks up in a sample's 'plots' dict
plot_types_available = ['adj_photo', 'wl_spectra_corrected', 'pl_spectra_corrected']


# Create the interactive widget
plot_type_dropdown = widgets.Dropdown(
    options=plot_types_available,
    value=plot_types_available[0] if plot_types_available else None,
    description='Plot Type:',
    style={'description_width': 'initial'}
)

# display_plots_by_type draws into this widget (it refers to it by the name `output`)
output = widgets.Output()

def on_plot_type_change(change):
    """Redraw the comparison grid for the newly selected plot type."""
    # change['new'] holds the dropdown's new value
    display_plots_by_type(change['new'])

# only react to changes of the dropdown's 'value' trait
plot_type_dropdown.observe(on_plot_type_change, names='value')

display(plot_type_dropdown)
display(output)

# Show the first plot type by default
if plot_types_available:
    display_plots_by_type(plot_types_available[0])

## Part 4: Play around and see what else you can do with the API

#### Add a project you are working on

In [None]:
help(client.add_project)

In [None]:
client.add_project(project_info = {"project_id":"AUM_DEMO",
                                   "organization":"Summer School",
                                   "project_lead_email":"mkwall@lbl.gov"})

#### Add a sample

In [None]:
sample = client.add_sample()

#### Add a dataset from your google drive

In [None]:
# mount your google drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# choose a file
your_file_path = "sample_data/california_housing_train.csv"

# define some metadata you want to add to this dataset
metadata_to_add = {'comments': 'this is a fake dataset',
                   'weather': 'sunny',
                   'iphone_version': 11
                  }

In [None]:
# fill out the fields and send the data to Crucible
results = client.build_new_dataset_from_file(files_to_upload = [your_file_path],
                                        dataset_name = None, # this will default to the file name
                                        project_id = None, # this will default to unknown
                                        instrument_name = None, # default is null
                                        measurement = None, # default is null
                                        session_name = None, # default is null
                                        source_folder = None, # this will default to the base directory
                                        scientific_metadata = metadata_to_add, # this is the dictionary you defined above
                                        keywords = [], # list any keywords you want to be able to search on
                                        ingestor = 'CrucibleDatasetIngestor', # use a generic ingestor
                                        verbose = False,
                                        wait_for_ingestion_response = True)

ds = results['created_record']
pprint.pprint(ds)

#### Associate this dataset with the sample you created

In [None]:
# define the dataset and sample
dataset_id = ds['unique_id']
sample_id = sample['unique_id']

# link them!
client.add_dataset_to_sample(dataset_id = dataset_id, sample_id = sample_id)

In [None]:
# see all the datasets associated with your sample
client.list_datasets(sample_id = sample_id)

#### Send your dataset from Crucible to SciCat

In [None]:
client.send_to_scicat(dsid = ds['unique_id'], wait_for_scicat_response= True)

Go to https://mf-scicat.lbl.gov to get a quick look at your data