# Summer Students 2025 Analysis - Steel Based Electrolysis Activation

In [8]:
%%capture
%matplotlib widget
#!pip install requests_cache

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import time
import requests
import pandas as pd
import numpy as np
import datetime
import math

import ipywidgets as widgets
from IPython.display import display, clear_output

import sys
sys.path.insert(1, '../python-scripts-c6fxKDJrSsWp1xCxON1Y7g')
sys.path.insert(1, '../../python-scripts-c6fxKDJrSsWp1xCxON1Y7g')
from api_calls import *

# Base URL of the NOMAD Oasis REST API (v1); every request in this notebook is sent below this path.
url = "https://nomad-hzb-ce.de/nomad-oasis/api/v1"

import os
# Bearer token used in the Authorization header of all API calls.
# Read from the environment; raises KeyError if NOMAD_CLIENT_ACCESS_TOKEN is not set.
token = os.environ['NOMAD_CLIENT_ACCESS_TOKEN']

In [9]:
def get_upload_ids_from_main_authors(url, token, main_authors):
    """Collect the upload ids of all visible entries matching the given authors.

    Posts one archive query to ``{url}/entries/archive/query`` that filters on
    ``origin:any`` and requests only the ``upload_id`` of each entry.

    Args:
        url: Base URL of the NOMAD API.
        token: Bearer token sent in the ``Authorization`` header.
        main_authors: List of values handed to the ``origin:any`` filter.

    Returns:
        A set with the ``upload_id`` of every entry contained in the single
        response (requested page size: 1000).
    """
    payload = {
        'required': {'upload_id': '*'},
        'owner': 'visible',
        'query': {'origin:any': main_authors},
        'pagination': {'page_size': 1000},
    }
    auth_header = {'Authorization': f'Bearer {token}'}
    reply = requests.post(f'{url}/entries/archive/query', headers=auth_header, json=payload)
    # A set removes the duplicates that arise when one upload holds several entries.
    return {entry.get('upload_id') for entry in reply.json()["data"]}

def get_specific_entrytype_of_upload_ids(url, token, upload_list, entry_type):
    """Fetch the ``data`` section of all entries of one type from the given uploads.

    Posts one archive query to ``{url}/entries/archive/query`` restricted to
    ``upload_id:any`` and ``entry_type``.

    Args:
        url: Base URL of the NOMAD API.
        token: Bearer token sent in the ``Authorization`` header.
        upload_list: List of upload ids to search in.
        entry_type: Entry type to select, e.g. ``'CE_NOME_CyclicVoltammetry'``.

    Returns:
        A list with ``archive.data`` of every entry contained in the single
        response (requested page size: 10000), in response order.
    """
    payload = {
        'required': {'data': '*'},
        'owner': 'visible',
        'query': {
            'upload_id:any': upload_list,
            'entry_type': entry_type,
        },
        'pagination': {'page_size': 10000},
    }
    auth_header = {'Authorization': f'Bearer {token}'}
    reply = requests.post(f'{url}/entries/archive/query', headers=auth_header, json=payload)
    return [entry["archive"]["data"] for entry in reply.json()["data"]]

def get_upload_name_from_id(url, token, upload_id):
    """Return the ``upload_name`` of one upload, or None if the field is absent.

    Sends a GET request to ``{url}/uploads/{upload_id}`` with the bearer token
    and reads ``upload_name`` from the ``data`` object of the JSON response.
    """
    auth_header = {'Authorization': f'Bearer {token}'}
    reply = requests.get(f'{url}/uploads/{upload_id}', headers=auth_header)
    upload_info = reply.json()["data"]
    return upload_info.get('upload_name')

def get_result_df_from_upload_ids(upload_ids, filter_fft=False):
    """Build the overview table of activation parameters, one row per upload.

    For every upload the first chronoamperometry (CA) entry and the first
    cyclic voltammetry (CV) entry are read via
    ``get_specific_entrytype_of_upload_ids`` (using the notebook-level ``url``
    and ``token``). Potentials are multiplied by 1000 (V -> mV according to the
    column names) and are reported as stored, i.e. without adding the
    reference-electrode shift.

    Args:
        upload_ids: Iterable of NOMAD upload ids.
        filter_fft: If true, keep only uploads whose first CA entry name starts
            with ``'FFT'``; otherwise keep only uploads whose name does not.

    Returns:
        A DataFrame sorted by ``sample_id`` with a fresh integer index. Uploads
        that cannot be evaluated are reported on stdout and left out.
    """
    columns = ['upload_id', 'sample_id', 'potential1 (mV vs Hg/HgO)', 'hold1 (s)', 'potential2 (mV vs Hg/HgO)', 'hold2 (s)', 'potential3 (mV vs Hg/HgO)', 'sweep speed (mV/s)', 'cycle (P2-P3)', 'duration (s)', 'duration (h)']
    # hold2 is not read from the entries; it is fixed to 0 s for all uploads.
    hold2 = 0
    want_fft = bool(filter_fft)
    rows = []

    for upload_id in upload_ids:
        try:
            ca_data = get_specific_entrytype_of_upload_ids(url, token, [upload_id], 'CE_NOME_Chronoamperometry',)
            ca = ca_data[0]

            # Keep FFT uploads only when filtering for FFT, and non-FFT uploads otherwise.
            if ca.get('name', '').startswith('FFT') != want_fft:
                continue

            # Only query the CV entries for uploads that passed the filter.
            cv_data = get_specific_entrytype_of_upload_ids(url, token, [upload_id], 'CE_NOME_CyclicVoltammetry',)
            cv = cv_data[0]

            sample_id = ca.get('samples')[0].get('name')

            # extract data from Chronoamperometry files
            ca_properties = ca.get('properties')
            potential1 = ca_properties.get('step_1_potential')
            hold1 = ca_properties.get('step_1_time')

            # extract data from CyclicVoltammetry files
            cv_properties = cv.get('properties')
            potential2 = cv_properties.get('limit_potential_1')
            potential3 = cv_properties.get('limit_potential_2')
            sweep_speed = cv_properties.get('scan_rate')
            cycles_gamry_file = cv_properties.get('cycles')

            # convert to mV
            potential1 *= 1000
            potential2 *= 1000
            potential3 *= 1000

            # interval1: hold at P1 plus the sweep from P1 to P2; interval2: one P2-P3 sweep plus hold2.
            interval1 = hold1 + np.abs(potential1 - potential2)/sweep_speed
            interval2 = np.abs(potential2 - potential3)/sweep_speed + hold2
            cycles_computed = min(100, math.floor((3600 - interval1) / interval2 - 1))   #TODO check the -1

            if cycles_computed != cycles_gamry_file:
                print(f' {upload_id} cycles do not match: {cycles_computed} != {cycles_gamry_file}')
            duration = interval1 + cycles_gamry_file*interval2

            rows.append([upload_id, sample_id, potential1, hold1, potential2, hold2, potential3, sweep_speed, cycles_gamry_file, duration, duration/3600])
        except Exception as exc:
            # Best effort: skip uploads that cannot be evaluated, but show why.
            # KeyboardInterrupt/SystemExit are deliberately not caught here.
            print(upload_id, "didnt work - maybe calibration upload?", f'({type(exc).__name__}: {exc})')

    result = pd.DataFrame(rows, columns=columns)
    result = result.sort_values(by=['sample_id']).reset_index(drop = True)
    return result

In [10]:
# All ipywidgets used by the notebook: selectors, action buttons and output areas.

def _make_button(description, button_style):
    """Create an auto-width button with the given label and style."""
    return widgets.Button(
        description=description,
        button_style=button_style,
        layout=widgets.Layout(width='auto'),
    )

# FFT = Furthest-First Traversal
author_selector = widgets.Dropdown(
    description='NOMAD author:',
    options=['Maitryi Gupta', 'Literature-Marlena Thormeier', 'FFT-Marlena Thormeier'],
    value='Maitryi Gupta',
    style=dict(description_width='initial'),
)

# Options are filled in once the grouped table has been computed.
group_selector = widgets.Dropdown(
    description="Select parameter set:",
    style=dict(description_width='initial'),
)

get_button = _make_button('Get NOMAD data', 'success')
analysis_button = _make_button('Group and evaluate data', 'info')
safe_baybe_button = _make_button('Safe data for Bayesian Optimization', 'primary')
show_group_details_button = _make_button("Show plots for selected parameter set", 'info')

# One output area per notebook section.
all_runs_output = widgets.Output()
analysis_output = widgets.Output()
baybe_output = widgets.Output()
group_select_output = widgets.Output()
group_detail_output = widgets.Output()

### Select Uploads

In [11]:
def on_author_change(change):
    """Clear every output area when a different author is selected."""
    if change['type'] != 'change' or change['name'] != 'value':
        return
    for area in (all_runs_output, analysis_output, baybe_output,
                 group_select_output, group_detail_output):
        area.clear_output()

author_selector.observe(on_author_change)

display(author_selector)

Dropdown(description='NOMAD author:', options=('Maitryi Gupta', 'Literature-Marlena Thormeier', 'FFT-Marlena T…

### Create Table from NOMAD entries

In [12]:
def on_button_clicked(b):
    """Fetch the uploads of the selected author and show the parameter table.

    The table is stored in the notebook-level ``result`` so later cells can
    use it.
    """
    global result
    with all_runs_output:
        all_runs_output.clear_output()
        print('Getting data. This can take some time...')
        selection = author_selector.value
        # Entries look like "<prefix>-<author>"; without a dash the whole value is the author.
        prefix, dash, remainder = selection.partition("-")
        author_name = remainder if dash else prefix
        upload_ids = get_upload_ids_from_main_authors(url, token, [author_name])
        fft_filter = selection.startswith('FFT')
        result = get_result_df_from_upload_ids(upload_ids, fft_filter)
        all_runs_output.clear_output()
        display(result)

get_button.on_click(on_button_clicked)

display(get_button, all_runs_output)

Button(button_style='success', description='Get NOMAD data', layout=Layout(width='auto'), style=ButtonStyle())

Output()

### Calculation of Targets

In [None]:
# TODO: should this be done on RHE-compensated data? (E_shift of the reference electrode)
# TODO decide how table and activity is connected: everything in the same upload or match via sample ids?

In [7]:
def get_groups(df):
    """Group runs that share the same activation parameters.

    Rows are grouped by potential1, hold1, potential3, sweep speed and cycle
    count. For every group the unique upload ids and sample ids are collected
    into the list columns ``upload_ids`` and ``sample_ids``.

    Returns:
        The grouped DataFrame, ordered by the first sample id of each group,
        with a fresh integer index.
    """
    group_keys = [
        'potential1 (mV vs Hg/HgO)',
        'hold1 (s)',
        'potential3 (mV vs Hg/HgO)',
        'sweep speed (mV/s)',
        'cycle (P2-P3)',
    ]

    def unique_as_list(series):
        return list(series.unique())

    grouped = (
        df.groupby(group_keys)
        .agg({'upload_id': unique_as_list, 'sample_id': unique_as_list})
        .reset_index()
        .rename(columns={'upload_id': 'upload_ids', 'sample_id': 'sample_ids'})
    )

    # Order the groups by the first sample id via a temporary helper column.
    first_sample = grouped['sample_ids'].apply(lambda ids: ids[0] if ids else '')
    ordered = (
        grouped.assign(sort_by_id_date=first_sample)
        .sort_values(by='sort_by_id_date')
        .drop(columns='sort_by_id_date')
        .reset_index(drop=True)
    )
    return ordered

def get_mean_std_no_cycles(data_list, quantity):
    """Average one quantity over replicate measurements.

    The mean and the sample standard deviation (``ddof=1``) are taken across
    the replicates (axis 0); each result is then averaged and multiplied by
    1000.

    Args:
        data_list: Replicate measurements, each offering ``.get(quantity)``.
        quantity: Key of the series to evaluate, e.g. ``'voltage'``.

    Returns:
        Tuple ``(mean_val, std_val)`` of the scaled averages.
    """
    series_per_replicate = [entry.get(quantity) for entry in data_list]
    across_mean = np.mean(series_per_replicate, axis=0)
    across_std = np.std(series_per_replicate, axis=0, ddof=1)
    # Factor 1000: reported in mV.
    return across_mean.mean() * 1000, across_std.mean() * 1000
    
def get_mean_std_of_groups(res_grouped):
    """Append chronopotentiometry (CP) statistics to every parameter group.

    For each group the CP entries of its uploads are fetched (using the
    notebook-level ``url`` and ``token``) and evaluated once on ``voltage``
    and once on ``voltage_rhe_compensated``. The ``cp_geom_mean*`` columns are
    the cube root of ``mean * mean * std``.

    Args:
        res_grouped: Grouped table with an ``upload_ids`` list column.

    Returns:
        The same DataFrame, extended in place by the six evaluation columns.
    """
    eval_col_names = ['cp_voltage_mean', 'cp_voltage_std', 'cp_geom_mean', 'cp_voltage_rhe_mean', 'cp_voltage_rhe_std', 'cp_geom_mean_rhe']

    def summarise(cp_entries, quantity):
        # mean, std and the combined figure of merit for one quantity
        mean_val, std_val = get_mean_std_no_cycles(cp_entries, quantity)
        return [mean_val, std_val, (mean_val*mean_val*std_val)**(1/3)]

    eval_rows = []
    for upload_ids in res_grouped['upload_ids']:
        cp_entries = get_specific_entrytype_of_upload_ids(url, token, upload_ids, 'CE_NOME_Chronopotentiometry',)
        eval_rows.append(summarise(cp_entries, 'voltage') + summarise(cp_entries, 'voltage_rhe_compensated'))

    res_grouped.loc[:, eval_col_names] = eval_rows
    return res_grouped

#upload_ids = get_upload_ids_from_main_authors(url, token, [author_selector.value])
#result = get_result_df_from_upload_ids(upload_ids)
#res_grouped = get_groups(result)
#res_grouped2 = get_mean_std_of_groups(res_grouped)
#res_grouped2

In [8]:
def get_trial_overview(res_grouped, plot_title='CP Mean & STD in mV', y1_name='cp_voltage_mean', y1_label='CP Voltage (mV)', y2_name='cp_voltage_std', y2_label='CP Standard Deviation (mV)'):
    """Plot two columns of the grouped table against the parameter-set index.

    ``y1_name`` is drawn on the left (blue) axis and ``y2_name`` on the right
    (red) axis of the same figure.

    Args:
        res_grouped: Grouped table; its index is used as the x axis.
        plot_title: Figure title.
        y1_name: Column plotted on the left axis.
        y1_label: Legend entry and axis title for ``y1_name``.
        y2_name: Column plotted on the right axis.
        y2_label: Legend entry and axis title for ``y2_name``.

    Returns:
        The plotly figure (not shown yet).
    """
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=res_grouped.index,
        y=res_grouped[y1_name],
        name=y1_label,
        mode='lines+markers',
        yaxis='y1'
    ))

    fig.add_trace(go.Scatter(
        x=res_grouped.index,
        y=res_grouped[y2_name],
        name=y2_label,
        mode='lines+markers',
        yaxis='y2'
    ))

    fig.update_layout(
        title=plot_title,
        plot_bgcolor='white',
        xaxis=dict(
            title='Parameter Set',
            showgrid=False,      # no vertical grid lines
            linecolor='black',   # black axis at bottom
        ),
        yaxis=dict(
            # The title colour is set via title.font; the older `titlefont`
            # axis attribute is deprecated and rejected by newer plotly versions.
            title=dict(text=y1_label, font=dict(color='blue')),
            linecolor='blue',
            tickfont=dict(color='blue'),
        ),
        yaxis2=dict(
            title=dict(text=y2_label, font=dict(color='red')),
            overlaying='y',
            side='right',
            showgrid=False,     # no horizontal grid lines for the second axis
            linecolor='red',
            tickfont=dict(color='red'),
        ),
        legend=dict(
            x=0.5, y=-0.3,
            xanchor='center',
            orientation='h'
        )
    )
    return fig

def get_trial_std_over_mean(res_grouped, plot_title='CP std vs. CP mean (in mV)', x_name='cp_voltage_mean', x_label='CP Voltage (mV)', y_name='cp_voltage_std', y_label='CP Standard Deviation (mV)'):
    """Scatter one column of the grouped table against another.

    Args:
        res_grouped: Grouped table holding both columns.
        plot_title: Figure title.
        x_name: Column used for the x values.
        x_label: Title of the x axis.
        y_name: Column used for the y values.
        y_label: Title of the y axis.

    Returns:
        The plotly figure (not shown yet).
    """
    # Both axes share the same look and differ only in their title.
    axis_look = dict(
        showgrid=True,
        gridcolor='lightgrey',
        zeroline=False,
        linecolor='black',
    )

    points = go.Scatter(
        x=res_grouped[x_name],   # TODO maybe use mA
        y=res_grouped[y_name],
        mode='markers+text',
        marker=dict(color='green', size=8),
        name='Mean vs Std',
        textposition='top center',
    )

    fig = go.Figure()
    fig.add_trace(points)
    fig.update_layout(
        title=plot_title,
        xaxis=dict(title=x_label, **axis_look),
        yaxis=dict(title=y_label, **axis_look),
        plot_bgcolor='white',
    )
    return fig


In [9]:
def on_analysis_clicked(b):
    """Group the fetched runs, evaluate them and show the table and overview plots.

    Reads the notebook-level ``result`` table and stores the evaluated groups
    in the notebook-level ``res_grouped``. The parameter-set selector is only
    refreshed after an evaluation that ran to completion.
    """
    global res_grouped
    # Stays False on the early return below or if evaluation stops with an error.
    evaluated = False
    with analysis_output:
        analysis_output.clear_output()
        if 'result' not in globals():
            print('Please run the "Get NOMAD data" button before grouping and evaluating data.')
            return
        print('Evaluating data. This can take some time...')
        grouped = get_mean_std_of_groups(get_groups(result))
        styled_df = grouped.drop(columns=['upload_ids', 'sample_ids']).style.background_gradient(subset=['cp_geom_mean', 'cp_geom_mean_rhe'], cmap='RdYlGn_r')
        fig1 = get_trial_overview(grouped, plot_title='CP Mean & STD in mV', y1_name='cp_voltage_mean', y1_label='CP Voltage (mV)', y2_name='cp_voltage_std', y2_label='CP Standard Deviation (mV)')
        fig2 = get_trial_std_over_mean(grouped, plot_title='CP std vs. CP mean (in mV)', x_name='cp_voltage_mean', x_label='CP Voltage (mV)', y_name='cp_voltage_std', y_label='CP Standard Deviation (mV)')
        # Publish the table only once it is complete.
        res_grouped = grouped
        analysis_output.clear_output()
        display(styled_df)
        fig1.show()
        fig2.show()
        evaluated = True
    if evaluated and not res_grouped.empty:
        group_selector.options = list(res_grouped.index)
        with group_select_output:
            group_select_output.clear_output()
            display(group_selector, show_group_details_button)

analysis_button.on_click(on_analysis_clicked)

display(analysis_button, analysis_output)

Button(button_style='info', description='Group and evaluate data', layout=Layout(width='auto'), style=ButtonSt…

Output()

### Save Data for Bayesian Optimization

At the moment we use the `cp_geom_mean` column for BO (not the RHE compensated column).

In [10]:
#date_now = datetime.datetime.now()
#file_name = 'baybe_csv/nomad_result_table_maitryi_' + date_now.strftime("%Y%m%d") + '.csv'
#result.to_csv(file_name, index=False)

In [11]:
def save_baybe_table_csv(res_grouped_df, author_name='maitryi'):
    """Write the Bayesian-optimization input table as CSV.

    The parameter columns are renamed to short names and exported together
    with ``cp_geom_mean`` to ``baybe_csv/wateroxidation_ni_<author_name>.csv``
    (relative to the current working directory). The ``baybe_csv`` directory
    is created if it does not exist yet.

    Args:
        res_grouped_df: Grouped and evaluated table; must contain the four
            parameter columns and ``cp_geom_mean``.
        author_name: Suffix used in the file name.
    """
    rename_baybe = {
        'potential1 (mV vs Hg/HgO)': 'potential1',
        'potential3 (mV vs Hg/HgO)': 'potential3',
        'hold1 (s)': 'hold1',
        'sweep speed (mV/s)': 'sweep_speed',
    }
    baybe_columns = ['potential1', 'potential3', 'hold1', 'sweep_speed', 'cp_geom_mean']

    baybe_table = res_grouped_df.rename(columns=rename_baybe)[baybe_columns]
    # TODO: last year sweep speed could be 'n/a' (then replaced by 0); check whether that can still happen.

    out_dir = 'baybe_csv'
    # Without this, to_csv raises OSError on a fresh checkout where the folder is missing.
    os.makedirs(out_dir, exist_ok=True)
    out_path = f'{out_dir}/wateroxidation_ni_{author_name}.csv'
    baybe_table.to_csv(out_path, index=False, header=True)
    print(f'Saved table for Bayesion Optimization in {out_path}')

In [12]:
def on_baybe_clicked(b):
    """Export the evaluated groups to the CSV used for Bayesian optimization.

    The file name suffix is the lower-cased first whitespace-separated word of
    the selected author entry.
    """
    with baybe_output:
        baybe_output.clear_output()
        leading_word = author_selector.value.split(maxsplit=1)[0]
        save_baybe_table_csv(res_grouped, author_name=leading_word.lower())

safe_baybe_button.on_click(on_baybe_clicked)

display(safe_baybe_button, baybe_output)

Button(button_style='primary', description='Safe data for Bayesian Optimization', layout=Layout(width='auto'),…

Output()

### More detailed view on grouped data

In [13]:
def get_oer_cp_compare_plot(time_lists, voltage_lists, labels, plot_title='OER CP Voltage vs Time'):
    """Show one voltage-over-time trace per measurement in a single figure.

    Args:
        time_lists: One sequence of x values per trace.
        voltage_lists: One sequence of y values per trace.
        labels: One legend entry per trace.
        plot_title: Figure title.
    """
    fig = go.Figure()

    # The three inputs are consumed in lockstep; zip stops at the shortest one.
    for t_values, v_values, trace_label in zip(time_lists, voltage_lists, labels):
        trace = go.Scatter(
            x=t_values,
            y=v_values,
            mode='lines+markers',
            name=trace_label,
            line=dict(width=2),
            marker=dict(size=6),
        )
        fig.add_trace(trace)

    fig.update_layout(
        title=plot_title,
        xaxis_title="Time (s)",
        yaxis_title="Voltage (V)",
        template="plotly_white",
    )

    fig.show()

def get_cv_compare_plot(dfs):
    """Show current over voltage for several CV cycles in a single figure.

    Args:
        dfs: DataFrames with the columns ``voltage``, ``current`` and ``id``;
            the ``id`` value at index label 0 is used as the legend entry.
    """
    fig = go.Figure()

    for cycle_df in dfs:
        trace = go.Scatter(
            x=cycle_df['voltage'],
            y=cycle_df['current'],
            mode='lines+markers',
            name=cycle_df['id'][0],
            line=dict(width=2),
            marker=dict(size=6),
        )
        fig.add_trace(trace)

    fig.update_layout(
        title="Activation CV",
        xaxis_title="Voltage (V)",
        yaxis_title="Im (A)",
        template="plotly_white",
    )

    fig.show()

In [14]:
def _first_sample_name(entry):
    """Return the name of the first sample of *entry*, or None if it has no samples."""
    samples = entry.get('samples') or [{}]
    return samples[0].get('name')


def _cycle_point_count(cycle):
    """Return the number of current values stored in one CV cycle."""
    return len(cycle.get('current') or [])


def _cycle_frame(cycle, label):
    """Build the plotting table (voltage, current, id) for one CV cycle."""
    return pd.DataFrame({
        'voltage': np.array(cycle.get('voltage_rhe_compensated')),
        'current': cycle.get('current'),
        'id': label,
    })


def get_group_detail_view(res_grouped, parameter_set_idx):
    """Show the detail plots for one parameter set.

    Fetches the chronopotentiometry (CP) and cyclic voltammetry (CV) entries
    of the group's uploads (using the notebook-level ``url`` and ``token``)
    and shows: all CP replicates, their point-wise mean, their point-wise
    sample standard deviation, and the first complete and last CV cycle of
    every CV entry.

    Args:
        res_grouped: Grouped table with an ``upload_ids`` list column.
        parameter_set_idx: Positional index of the group (used with ``iloc``).
    """
    group = res_grouped.iloc[parameter_set_idx]

    # ------------- CP --------------

    cp_data = get_specific_entrytype_of_upload_ids(url, token, group.upload_ids, 'CE_NOME_Chronopotentiometry',)
    replicates = [measurement.get('voltage') for measurement in cp_data]  # TODO 'voltage_rhe_compensated'
    replicates_time = [measurement.get('time') for measurement in cp_data]
    replicates_id = [_first_sample_name(measurement) for measurement in cp_data]
    mean_all = np.mean(replicates, axis=0)
    std_all = np.std(replicates, axis=0, ddof=1)

    get_oer_cp_compare_plot(replicates_time, replicates, replicates_id)
    # Mean and std are plotted against the time axis of the first replicate.
    get_oer_cp_compare_plot([replicates_time[0]], [mean_all], ['V average'], 'OER CP V average')
    get_oer_cp_compare_plot([replicates_time[0]], [std_all], ['V standard deviation'], 'OER CP V standard deviation')

    # ------------- CV --------------

    cv_data = get_specific_entrytype_of_upload_ids(url, token, group.upload_ids, 'CE_NOME_CyclicVoltammetry',)
    first_cycles = []
    last_cycles = []
    for cv in cv_data:
        sample_id = _first_sample_name(cv)
        cycles = cv.get('cycles')
        first_cycle = cycles[1]   # cycle 2 (first complete cycle)
        last_cycle = cycles[-1]
        # Sometimes the last cycle is only a half cycle. Compare the number of
        # data points: len() of the cycle itself would only count its keys.
        if _cycle_point_count(last_cycle) < _cycle_point_count(first_cycle):
            last_cycle = cycles[-2]

        first_cycles.append(_cycle_frame(first_cycle, f'first cycle {sample_id}'))
        last_cycles.append(_cycle_frame(last_cycle, f'last cycle {sample_id}'))
    get_cv_compare_plot(first_cycles+last_cycles)

In [27]:
def on_group_idx_change(change):
    """Show the table row of the newly selected parameter set."""
    if change['name'] != 'value' or change['new'] is None:
        return
    selected = change['new']
    with group_select_output:
        group_select_output.clear_output()
        # Plots of the previously selected set are no longer valid.
        group_detail_output.clear_output()
        selected_row = res_grouped.loc[[selected]].drop(columns=['upload_ids', 'sample_ids'])
        display(group_selector, selected_row, show_group_details_button)

group_selector.observe(on_group_idx_change, names='value')

def on_show_group_details_button_click(b):
    """Render the detail plots for the parameter set chosen in the selector."""
    with group_detail_output:
        group_detail_output.clear_output()
        get_group_detail_view(res_grouped, group_selector.value)

show_group_details_button.on_click(on_show_group_details_button_click)

# Show a hint until the grouped table exists in the notebook namespace.
if 'res_grouped' not in globals():
    with group_select_output:
        print('Please run the "Group and evaluate data" button before inspecting individual parameter sets.')

display(group_select_output, group_detail_output)


Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': "Dropdown(description='Select parameter…

Output()