# Perovskite Papers Data Explorer
Interactive tool to plot and filter research paper data

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, HBox, VBox, Layout
from IPython.display import display, clear_output
import warnings
import os
warnings.filterwarnings('ignore')

In [17]:
# Load Excel data
df_original = pd.read_excel('All_101025_list-of-papers.xlsx')

# Remove rows with Evaporation as deposition technique
df_original = df_original[df_original['Deposition Technique'] != 'Evaporation']

# Remove rows with missing Area, PCE, or Deposition Technique
df_original = df_original.dropna(subset=['Area (cm²)', 'PCE(Max)', 'Deposition Technique'])

# Clean numeric columns
def clean_numeric(val):
    if pd.isna(val):
        return np.nan
    if isinstance(val, (int, float)):
        return float(val)
    try:
        return float(str(val).strip())
    except:
        return np.nan

df_original['PCE(Median)'] = df_original['PCE(Median)'].apply(clean_numeric)
df_original['Area (cm²)'] = df_original['Area (cm²)'].apply(clean_numeric)

# Clean Year column - handle Excel date serial numbers
def clean_year(val):
    if pd.isna(val):
        return np.nan
    year = float(val)
    # If year looks like an Excel serial date (> 10000), try to convert
    if year > 10000:
        try:
            # Excel serial dates start from 1899-12-30
            date = pd.Timestamp('1899-12-30') + pd.Timedelta(days=year)
            return int(date.year)
        except:
            return np.nan
    # If year is reasonable (1900-2100), use it
    if 1900 <= year <= 2100:
        return int(year)
    return np.nan

df_original['Year'] = df_original['Year'].apply(clean_year)

# Add source column to identify data origin
df_original['Data Source'] = 'Excel'

print(f"Loaded {len(df_original)} papers from Excel")
print(f"Columns: {', '.join(df_original.columns)}")

Loaded 119 papers from Excel
Columns: DOI, Author(s), Year, PCE(Max), PCE(Median), Area (cm²), Deposition Technique, Module/Cell, Stack (with Chemical Formula), What is new?, Data Source


In [15]:
# Load NOMAD CSV data if available
df_nomad = None
nomad_file = 'nomad_perovskite_data.csv'

if os.path.exists(nomad_file):
    try:
        df_nomad_raw = pd.read_csv(nomad_file)
        
        # Function to clean absorber_fabrication column
        def clean_absorber_fabrication(val):
            if pd.isna(val):
                return np.nan
            
            # Convert to string and remove brackets and quotes
            val_str = str(val).replace('[', '').replace(']', '').replace("'", '').replace('"', '')
            
            # Check for >> separator
            if '>>' in val_str:
                parts = [p.strip() for p in val_str.split('>>')]
                # If both parts are the same, keep only one
                if len(set(parts)) == 1:
                    val_str = parts[0]
                else:
                    # Different parts - mark for removal
                    return 'REMOVE_ROW'
            
            # Replace hyphens with spaces
            val_str = val_str.replace('-', ' ')
            
            return val_str.strip()
        
        # Clean absorber_fabrication
        df_nomad_raw['absorber_fabrication_clean'] = df_nomad_raw['absorber_fabrication'].apply(clean_absorber_fabrication)
        
        # Remove rows marked for removal
        df_nomad_raw = df_nomad_raw[df_nomad_raw['absorber_fabrication_clean'] != 'REMOVE_ROW']
        
        # Filter to only keep specific deposition techniques
        allowed_techniques = [
            'Blade coating',
            'Slot die coating',
            'Gravure',
            'Inkjet printing',
            'Spin coating',
            'spin coating',
            'Screen printing',
            'Spray coating'
        ]
        
        # Keep rows where the cleaned technique is in the allowed list
        df_nomad_raw = df_nomad_raw[df_nomad_raw['absorber_fabrication_clean'].isin(allowed_techniques)]
        
        # Extract year from publication_date
        df_nomad_raw['Year'] = pd.to_datetime(df_nomad_raw['publication_date']).dt.year
        
        # Convert area from m² to cm²
        df_nomad_raw['device_area_cm2'] = df_nomad_raw['device_area'] * 10000
        
        # Map module column to Module/Cell
        df_nomad_raw['Module/Cell'] = df_nomad_raw['module'].apply(lambda x: 'Module' if x == True or str(x).lower() == 'true' else 'Cell')
        
        # Create standardized dataframe
        df_nomad = pd.DataFrame({
            'DOI': df_nomad_raw['DOI_number'],
            'Author(s)': df_nomad_raw['lead_author'],
            'Year': df_nomad_raw['Year'],
            'PCE(Max)': df_nomad_raw['efficiency'],
            'PCE(Median)': np.nan,  # Not available in NOMAD data
            'Area (cm²)': df_nomad_raw['device_area_cm2'],
            'Deposition Technique': df_nomad_raw['absorber_fabrication_clean'],
            'Module/Cell': df_nomad_raw['Module/Cell'],
            'Stack (with Chemical Formula)': np.nan,  # Not available in NOMAD data
            'What is new?': np.nan,  # Not available in NOMAD data
            'Data Source': 'NOMAD'
        })
        # Remove rows with missing Area, PCE, or Deposition Technique
        df_nomad = df_nomad.dropna(subset=['Area (cm²)', 'PCE(Max)', 'Deposition Technique'])
        
        print(f"\nLoaded {len(df_nomad)} papers from NOMAD CSV")
        print(f"Removed {len(df_nomad_raw) - len(df_nomad)} rows with mixed fabrication techniques")
    except Exception as e:
        print(f"\nError loading NOMAD data: {e}")
        print("Continuing with Excel data only")
        df_nomad = None
else:
    print(f"\n{nomad_file} not found. Only Excel data will be used.")
    print("To include NOMAD data, place the CSV file in the same directory as this notebook.")


Loaded 34003 papers from NOMAD CSV
Removed 1283 rows with mixed fabrication techniques


In [4]:
# Extract all unique materials from Stack column
def extract_materials(df):
    all_materials = set()
    for stack in df['Stack (with Chemical Formula)'].dropna():
        materials = [m.strip() for m in str(stack).split('/')]
        all_materials.update(materials)
    return sorted(list(all_materials))

all_stack_materials = extract_materials(df_original)
print(f"Found {len(all_stack_materials)} unique materials in stacks")

Found 221 unique materials in stacks


In [5]:
# Define plotting columns
numeric_columns = ['Year', 'PCE(Max)', 'PCE(Median)', 'Area (cm²)']
categorical_columns = ['Deposition Technique', 'Module/Cell']
all_plot_columns = numeric_columns + categorical_columns + ['Stack (with Chemical Formula)']

# Create widgets
output = widgets.Output()

# Axis selection
x_axis = widgets.Dropdown(
    options=all_plot_columns,
    value='Year',
    description='X-axis:',
    style={'description_width': '80px'},
    layout=Layout(width='300px')
)

y_axis = widgets.Dropdown(
    options=all_plot_columns,
    value='PCE(Max)',
    description='Y-axis:',
    style={'description_width': '80px'},
    layout=Layout(width='300px')
)

# Color-by selection
color_by = widgets.Dropdown(
    options=['None', 'Deposition Technique', 'Module/Cell', 'Stack (with Chemical Formula)', 'Data Source'],
    value='None',
    description='Color by:',
    style={'description_width': '80px'},
    layout=Layout(width='300px')
)

# Stack material selector for coloring
color_stack_material = widgets.Dropdown(
    options=['All'] + all_stack_materials,
    value='All',
    description='Color Stack:',
    style={'description_width': '80px'},
    layout=Layout(width='400px', visibility='hidden')
)

# Stack material selector (for when Stack is selected)
stack_material_x = widgets.Dropdown(
    options=['All'] + all_stack_materials,
    value='All',
    description='Stack X:',
    style={'description_width': '80px'},
    layout=Layout(width='400px', visibility='hidden')
)

stack_material_y = widgets.Dropdown(
    options=['All'] + all_stack_materials,
    value='All',
    description='Stack Y:',
    style={'description_width': '80px'},
    layout=Layout(width='400px', visibility='hidden')
)

# Checkbox to include NOMAD data
show_nomad_data = widgets.Checkbox(
    value=False,
    description='Include NOMAD Data',
    disabled=(df_nomad is None),
    style={'description_width': '120px'},
    layout=Layout(width='200px')
)

if df_nomad is None:
    show_nomad_data.description = 'NOMAD Data (Not Available)'

# Checkboxes for log scale axes
log_x_axis = widgets.Checkbox(
    value=False,
    description='Log X-axis',
    style={'description_width': '80px'},
    layout=Layout(width='300px')
)

log_y_axis = widgets.Checkbox(
    value=False,
    description='Log Y-axis',
    style={'description_width': '80px'},
    layout=Layout(width='300px')
)

# Filters
year_filter = widgets.SelectMultiple(
    options=['All'] + sorted([y for y in df_original['Year'].dropna().unique().astype(int)]),
    value=['All'],
    description='Year:',
    style={'description_width': '80px'},
    layout=Layout(width='300px', height='100px')
)

pce_max_range = widgets.FloatRangeSlider(
    value=[0, df_original['PCE(Max)'].max() if pd.notna(df_original['PCE(Max)'].max()) else 30],
    min=0,
    max=df_original['PCE(Max)'].max() if pd.notna(df_original['PCE(Max)'].max()) else 30,
    step=0.5,
    description='PCE(Max):',
    style={'description_width': '80px'},
    layout=Layout(width='350px')
)

pce_median_range = widgets.FloatRangeSlider(
    value=[0, df_original['PCE(Median)'].max() if pd.notna(df_original['PCE(Median)'].max()) else 30],
    min=0,
    max=df_original['PCE(Median)'].max() if pd.notna(df_original['PCE(Median)'].max()) else 30,
    step=0.5,
    description='PCE(Med):',
    style={'description_width': '80px'},
    layout=Layout(width='350px')
)

area_range = widgets.FloatRangeSlider(
    value=[0, df_original['Area (cm²)'].max() if pd.notna(df_original['Area (cm²)'].max()) else 100],
    min=0,
    max=df_original['Area (cm²)'].max() if pd.notna(df_original['Area (cm²)'].max()) else 100,
    step=0.1,
    description='Area (cm²):',
    style={'description_width': '80px'},
    layout=Layout(width='350px')
)

# Define standard deposition techniques
standard_techniques = ['Blade coating', 'Slot-die coating', 'Gravure', 'Inkjet printing', 'Spin coating', 'Screen printing','Spray coating']

deposition_filter = widgets.SelectMultiple(
    options=['All'] + standard_techniques,
    value=['All'],
    description='Deposition:',
    style={'description_width': '80px'},
    layout=Layout(width='350px', height='100px')
)

module_cell_filter = widgets.SelectMultiple(
    options=['All'] + sorted([mc for mc in df_original['Module/Cell'].dropna().unique()]),
    value=['All'],
    description='Module/Cell:',
    style={'description_width': '80px'},
    layout=Layout(width='300px', height='80px')
)

stack_filter = widgets.SelectMultiple(
    options=['All'] + all_stack_materials,
    value=['All'],
    description='Stack has:',
    style={'description_width': '80px'},
    layout=Layout(width='400px', height='120px')
)

plot_button = widgets.Button(
    description='Update Plot',
    button_style='primary',
    layout=Layout(width='200px', height='40px')
)

reset_button = widgets.Button(
    description='Reset Filters',
    button_style='warning',
    layout=Layout(width='200px', height='40px')
)

In [6]:
# Functions
def get_combined_data():
    """Combine Excel and NOMAD data based on checkbox"""
    if show_nomad_data.value and df_nomad is not None:
        return pd.concat([df_original, df_nomad], ignore_index=True)
    else:
        return df_original.copy()

def filter_data():
    df = get_combined_data()
    
    # Year filter
    if 'All' not in year_filter.value:
        df = df[df['Year'].isin([int(y) for y in year_filter.value])]
    
    # PCE filters
    df = df[(df['PCE(Max)'] >= pce_max_range.value[0]) & (df['PCE(Max)'] <= pce_max_range.value[1])]
    df = df[((df['PCE(Median)'] >= pce_median_range.value[0]) & (df['PCE(Median)'] <= pce_median_range.value[1])) | df['PCE(Median)'].isna()]
    
    # Area filter
    df = df[((df['Area (cm²)'] >= area_range.value[0]) & (df['Area (cm²)'] <= area_range.value[1])) | df['Area (cm²)'].isna()]
    
    # Deposition filter
    if 'All' not in deposition_filter.value:
        df = df[df['Deposition Technique'].isin(deposition_filter.value)]
    
    # Module/Cell filter
    if 'All' not in module_cell_filter.value:
        df = df[df['Module/Cell'].isin(module_cell_filter.value)]
    
    # Stack filter
    if 'All' not in stack_filter.value:
        def contains_material(stack, materials):
            if pd.isna(stack):
                return False
            stack_materials = [m.strip() for m in str(stack).split('/')]
            return any(mat in stack_materials for mat in materials)
        
        df = df[df['Stack (with Chemical Formula)'].apply(
            lambda x: contains_material(x, stack_filter.value)
        )]
    
    return df

def prepare_plot_data(df, axis_col, stack_material=None):
    if axis_col == 'Stack (with Chemical Formula)':
        if stack_material and stack_material != 'All':
            # Filter to only rows that contain the selected material
            def has_material(stack):
                if pd.isna(stack):
                    return False
                materials = [m.strip() for m in str(stack).split('/')]
                return stack_material in materials
            
            df_filtered = df[df['Stack (with Chemical Formula)'].apply(has_material)].copy()
            df_filtered['_plot_value'] = stack_material
            return df_filtered
        else:
            # Count occurrences of each material
            material_counts = {}
            for idx, stack in df['Stack (with Chemical Formula)'].dropna().items():
                materials = [m.strip() for m in str(stack).split('/')]
                for mat in materials:
                    if mat not in material_counts:
                        material_counts[mat] = []
                    material_counts[mat].append(idx)
            
            # Create plot dataframe
            plot_data = []
            for mat, indices in material_counts.items():
                for idx in indices:
                    row = df.loc[idx].copy()
                    row['_plot_value'] = mat
                    plot_data.append(row)
            
            return pd.DataFrame(plot_data)
    else:
        df_copy = df.copy()
        df_copy['_plot_value'] = df_copy[axis_col]
        return df_copy

def get_color_values(df_filtered, color_col, color_stack_mat=None):
    """Extract color values from the dataframe"""
    if color_col == 'None':
        return None
    elif color_col == 'Stack (with Chemical Formula)':
        if color_stack_mat and color_stack_mat != 'All':
            # Check if each row contains the specific material
            def has_material(stack):
                if pd.isna(stack):
                    return 'No Stack'
                materials = [m.strip() for m in str(stack).split('/')]
                return 'Has ' + color_stack_mat if color_stack_mat in materials else 'No ' + color_stack_mat
            return df_filtered['Stack (with Chemical Formula)'].apply(has_material)
        else:
            # Use first material in stack as color
            def get_first_material(stack):
                if pd.isna(stack):
                    return 'Unknown'
                materials = [m.strip() for m in str(stack).split('/')]
                return materials[0] if materials else 'Unknown'
            return df_filtered['Stack (with Chemical Formula)'].apply(get_first_material)
    else:
        return df_filtered[color_col].fillna('Unknown')

def update_plot(b=None):
    with output:
        clear_output(wait=True)
        
        # Filter data
        df_filtered = filter_data()
        
        if len(df_filtered) == 0:
            print("No data matches the current filters!")
            return
        
        # Prepare plot data
        x_col = x_axis.value
        y_col = y_axis.value
        color_col = color_by.value
        
        x_stack_mat = stack_material_x.value if x_col == 'Stack (with Chemical Formula)' else None
        y_stack_mat = stack_material_y.value if y_col == 'Stack (with Chemical Formula)' else None
        color_stack_mat = color_stack_material.value if color_col == 'Stack (with Chemical Formula)' else None
        
        df_plot_x = prepare_plot_data(df_filtered, x_col, x_stack_mat)
        df_plot_y = prepare_plot_data(df_filtered, y_col, y_stack_mat)
        
        # Merge on index
        df_plot = df_plot_x[['_plot_value']].rename(columns={'_plot_value': 'x_val'})
        df_plot['y_val'] = df_plot_y['_plot_value']
        
        # Add original data for hover information
        df_plot['DOI'] = df_filtered['DOI'].reindex(df_plot.index)
        df_plot['Author(s)'] = df_filtered['Author(s)'].reindex(df_plot.index)
        df_plot['What is new?'] = df_filtered['What is new?'].reindex(df_plot.index).fillna('N/A')
        df_plot['Year_data'] = df_filtered['Year'].reindex(df_plot.index)
        df_plot['PCE(Max)_data'] = df_filtered['PCE(Max)'].reindex(df_plot.index)
        df_plot['Area_data'] = df_filtered['Area (cm²)'].reindex(df_plot.index)
        df_plot['Stack'] = df_filtered['Stack (with Chemical Formula)'].reindex(df_plot.index).fillna('N/A')
        df_plot['Deposition'] = df_filtered['Deposition Technique'].reindex(df_plot.index).fillna('N/A')
        df_plot['Data Source'] = df_filtered['Data Source'].reindex(df_plot.index)
        
        # Add color values from original filtered data
        if color_col != 'None':
            color_values = get_color_values(df_filtered, color_col, color_stack_mat)
            df_plot['color_val'] = color_values.reindex(df_plot.index)
        else:
            df_plot['color_val'] = 'All Data'
        
        # Remove NaN values in x and y
        df_plot = df_plot.dropna(subset=['x_val', 'y_val'])
        
        if len(df_plot) == 0:
            print("No valid data points to plot!")
            return
        
        # Determine if axes are numeric
        x_is_numeric = x_col in numeric_columns
        y_is_numeric = y_col in numeric_columns
        
        # Create hover template
        hover_template = (
            '<b>Source:</b> %{customdata[0]}<br>'
            '<b>DOI:</b> %{customdata[1]}<br>'
            '<b>Authors:</b> %{customdata[2]}<br>'
            '<b>Year:</b> %{customdata[3]}<br>'
            '<b>PCE(Max):</b> %{customdata[4]:.2f}%<br>'
            '<b>Area:</b> %{customdata[5]:.3f} cm²<br>'
            '<b>Deposition:</b> %{customdata[6]}<br>'
            '<b>Stack:</b> %{customdata[7]}<br>'
            '<b>What is new?:</b><br>%{customdata[8]}'
            '<extra></extra>'
        )
        
        customdata = np.column_stack([
            df_plot['Data Source'].fillna('N/A'),
            df_plot['DOI'].fillna('N/A'),
            df_plot['Author(s)'].fillna('N/A'),
            df_plot['Year_data'].fillna('N/A'),
            df_plot['PCE(Max)_data'].fillna(0),
            df_plot['Area_data'].fillna(0),
            df_plot['Deposition'].fillna('N/A'),
            df_plot['Stack'].fillna('N/A'),
            df_plot['What is new?'].apply(lambda x: x[:50] + '...' if isinstance(x, str) and len(x) > 50 else x)
        ])
        
        # Create figure
        fig = go.Figure()
        
        # Get unique color values
        if color_col != 'None':
            unique_colors = sorted(df_plot['color_val'].unique())
            
            # Use Plotly color scales
            if len(unique_colors) <= 10:
                colors = px.colors.qualitative.Set1[:len(unique_colors)]
            elif len(unique_colors) <= 24:
                colors = px.colors.qualitative.Light24[:len(unique_colors)]
            else:
                colors = px.colors.sample_colorscale("hsv", [i/(len(unique_colors)-1) for i in range(len(unique_colors))])
            
            color_discrete_map = {val: colors[i] for i, val in enumerate(unique_colors)}
        else:
            unique_colors = ['All Data']
            color_discrete_map = {'All Data': '#636EFA'}
        
        # Plot based on axis types
        if x_is_numeric and y_is_numeric:
            # Scatter plot
            for color_val in unique_colors:
                mask = df_plot['color_val'] == color_val
                df_subset = df_plot[mask]
                
                fig.add_trace(go.Scatter(
                    x=df_subset['x_val'],
                    y=df_subset['y_val'],
                    mode='markers',
                    name=str(color_val),
                    marker=dict(
                        size=10,
                        color=color_discrete_map[color_val],
                        line=dict(width=1, color='DarkSlateGrey')
                    ),
                    customdata=customdata[mask.values],
                    hovertemplate=hover_template
                ))
            
        elif x_is_numeric and not y_is_numeric:
            # Strip plot
            categories = sorted(df_plot['y_val'].unique())
            y_positions = {cat: i for i, cat in enumerate(categories)}
            
            for color_val in unique_colors:
                mask = df_plot['color_val'] == color_val
                df_subset = df_plot[mask]
                y_numeric = [y_positions[y] for y in df_subset['y_val']]
                
                fig.add_trace(go.Scatter(
                    x=df_subset['x_val'],
                    y=y_numeric,
                    mode='markers',
                    name=str(color_val),
                    marker=dict(
                        size=10,
                        color=color_discrete_map[color_val],
                        line=dict(width=1, color='DarkSlateGrey')
                    ),
                    customdata=customdata[mask.values],
                    hovertemplate=hover_template
                ))
            
            fig.update_yaxis(
                tickmode='array',
                tickvals=list(range(len(categories))),
                ticktext=categories
            )
            
        elif not x_is_numeric and y_is_numeric:
            # Box plot with scatter overlay
            categories = sorted(df_plot['x_val'].unique())
            
            if color_col == 'None':
                # Simple box plot
                for i, cat in enumerate(categories):
                    df_cat = df_plot[df_plot['x_val'] == cat]
                    fig.add_trace(go.Box(
                        y=df_cat['y_val'],
                        name=str(cat),
                        boxmean='sd',
                        marker_color=color_discrete_map['All Data']
                    ))
            else:
                # Scatter plot with categorical x
                x_positions = {cat: i for i, cat in enumerate(categories)}
                
                for color_val in unique_colors:
                    mask = df_plot['color_val'] == color_val
                    df_subset = df_plot[mask]
                    x_numeric = [x_positions[x] for x in df_subset['x_val']]
                    
                    fig.add_trace(go.Scatter(
                        x=x_numeric,
                        y=df_subset['y_val'],
                        mode='markers',
                        name=str(color_val),
                        marker=dict(
                            size=10,
                            color=color_discrete_map[color_val],
                            line=dict(width=1, color='DarkSlateGrey')
                        ),
                        customdata=customdata[mask.values],
                        hovertemplate=hover_template
                    ))
                
                fig.update_xaxis(
                    tickmode='array',
                    tickvals=list(range(len(categories))),
                    ticktext=categories
                )
        else:
            # Categorical x categorical - grouped bar chart
            crosstab = pd.crosstab(df_plot['x_val'], df_plot['y_val'])
            
            for col in crosstab.columns:
                fig.add_trace(go.Bar(
                    x=crosstab.index,
                    y=crosstab[col],
                    name=str(col)
                ))
        
        # Labels
        x_label = f"{x_col}" + (f" ({x_stack_mat})" if x_stack_mat and x_stack_mat != 'All' else "")
        y_label = f"{y_col}" + (f" ({y_stack_mat})" if y_stack_mat and y_stack_mat != 'All' else "")
        
        # Count by source
        source_counts = df_filtered['Data Source'].value_counts()
        source_str = ', '.join([f"{count} from {source}" for source, count in source_counts.items()])
        
        title = f'{y_label} vs {x_label}<br>({len(df_filtered)} papers: {source_str})'
        if color_col != 'None':
            title += f' | Colored by: {color_col}'
        
        # Update layout
        fig.update_layout(
            title=dict(text=title, font=dict(size=16)),
            xaxis_title=x_label,
            yaxis_title=y_label,
            hovermode='closest',
            showlegend=(color_col != 'None'),
            legend=dict(
                yanchor="top",
                y=0.99,
                xanchor="left",
                x=1.02,
                bgcolor="rgba(255, 255, 255, 0.8)"
            ),
            width=1200,
            height=700,
            template='plotly_white'
        )
        
        # Apply log scale if requested
        if log_x_axis.value and x_is_numeric:
            fig.update_xaxes(type='log')
        
        if log_y_axis.value and y_is_numeric:
            fig.update_yaxes(type='log')
        
        fig.show()
        
        print(f"\nShowing {len(df_filtered)} papers: {source_str}")
        total_data = get_combined_data()
        print(f"Total available: {len(total_data)} papers")
        if color_col != 'None':
            print(f"Colored by: {color_col}")

def reset_filters(b):
    year_filter.value = ['All']
    pce_max_range.value = [0, df_original['PCE(Max)'].max() if pd.notna(df_original['PCE(Max)'].max()) else 30]
    pce_median_range.value = [0, df_original['PCE(Median)'].max() if pd.notna(df_original['PCE(Median)'].max()) else 30]
    area_range.value = [0, df_original['Area (cm²)'].max() if pd.notna(df_original['Area (cm²)'].max()) else 100]
    deposition_filter.value = ['All']
    module_cell_filter.value = ['All']
    stack_filter.value = ['All']
    stack_material_x.value = 'All'
    stack_material_y.value = 'All'
    color_by.value = 'None'
    color_stack_material.value = 'All'
    show_nomad_data.value = False
    log_x_axis.value = False
    log_y_axis.value = False
    update_plot()

def on_axis_change(change):
    # Show/hide stack material selectors
    if x_axis.value == 'Stack (with Chemical Formula)':
        stack_material_x.layout.visibility = 'visible'
    else:
        stack_material_x.layout.visibility = 'hidden'
    
    if y_axis.value == 'Stack (with Chemical Formula)':
        stack_material_y.layout.visibility = 'visible'
    else:
        stack_material_y.layout.visibility = 'hidden'

def on_color_change(change):
    # Show/hide color stack material selector
    if color_by.value == 'Stack (with Chemical Formula)':
        color_stack_material.layout.visibility = 'visible'
    else:
        color_stack_material.layout.visibility = 'hidden'

# Connect callbacks
plot_button.on_click(update_plot)
reset_button.on_click(reset_filters)
x_axis.observe(on_axis_change, names='value')
y_axis.observe(on_axis_change, names='value')
color_by.observe(on_color_change, names='value')

In [7]:
# Layout
plot_controls = VBox([
    widgets.HTML("<h3>Plot Settings</h3>"),
    x_axis,
    stack_material_x,
    y_axis,
    stack_material_y,
    widgets.HTML("<br><b>Scale Options:</b>"),
    HBox([log_x_axis, log_y_axis]),
    widgets.HTML("<br><b>Color Coding:</b>"),
    color_by,
    color_stack_material,
    widgets.HTML("<br><b>Data Sources:</b>"),
    show_nomad_data,
], layout=Layout(padding='10px'))

filters = VBox([
    widgets.HTML("<h3>Filters</h3>"),
    year_filter,
    pce_max_range,
    pce_median_range,
    area_range,
    deposition_filter,
    module_cell_filter,
    stack_filter,
], layout=Layout(padding='10px'))

buttons = HBox([plot_button, reset_button], layout=Layout(padding='10px'))

controls = VBox([
    HBox([plot_controls, filters]),
    buttons
])

# Display
display(controls)
display(output)

VBox(children=(HBox(children=(VBox(children=(HTML(value='<h3>Plot Settings</h3>'), Dropdown(description='X-axi…

Output()

In [16]:
# Get all unique deposition techniques from combined data
def get_all_deposition_techniques():
    df_combined = get_combined_data()
    techniques = df_combined['Deposition Technique'].dropna().unique()
    return sorted(list(techniques))

# Bin size selector
bin_size_widget = widgets.FloatText(
    value=0.5,
    description='Bin Size:',
    style={'description_width': '80px'},
    layout=Layout(width='200px')
)

log_x_histogram = widgets.Checkbox(
    value=False,
    description='Log X-axis',
    style={'description_width': '80px'},
    layout=Layout(width='150px')
)

# Button to generate histogram
histogram_button = widgets.Button(
    description='Generate Histogram',
    button_style='success',
    layout=Layout(width='200px', height='40px')
)

histogram_output = widgets.Output()

def generate_pce_histogram(b=None):
    with histogram_output:
        clear_output(wait=True)
        
        # Get combined data
        df_combined = get_combined_data()
        
        # Remove NaN values from PCE and Deposition
        df_pce = df_combined[['PCE(Max)', 'Deposition Technique']].dropna(subset=['PCE(Max)', 'Deposition Technique'])
        
        if len(df_pce) == 0:
            print("No PCE data available!")
            return
        
        bin_size = bin_size_widget.value
        
        # Create figure
        fig = go.Figure()
        
        # Get unique deposition techniques, with Spin coating last
        techniques = sorted(df_pce['Deposition Technique'].unique())
        if 'Spin coating' in techniques:
            techniques.remove('Spin coating')
            techniques.append('Spin coating')
        
        # Use Plotly color scales
        if len(techniques) <= 10:
            colors = px.colors.qualitative.Set1[:len(techniques)]
        elif len(techniques) <= 24:
            colors = px.colors.qualitative.Light24[:len(techniques)]
        else:
            colors = px.colors.sample_colorscale("hsv", [i/(len(techniques)-1) for i in range(len(techniques))])
        
        color_map = {tech: colors[i] for i, tech in enumerate(techniques)}
        
        # Plot cumulative count for each deposition technique
        for technique in techniques:
            df_tech = df_pce[df_pce['Deposition Technique'] == technique].copy()
            
            # Sort by PCE
            df_tech = df_tech.sort_values('PCE(Max)')
            
            if bin_size > 0:
                # Binned version - count papers up to each PCE bin
                max_pce = df_tech['PCE(Max)'].max()
                pce_bins = np.arange(0, max_pce + bin_size, bin_size)
                
                cumulative_counts = []
                for pce_threshold in pce_bins:
                    count = len(df_tech[df_tech['PCE(Max)'] <= pce_threshold])
                    cumulative_counts.append(count)
                
                fig.add_trace(go.Scatter(
                    y=cumulative_counts,
                    x=pce_bins,
                    mode='markers+lines',
                    name=technique,
                    marker=dict(
                        size=8,
                        color=color_map[technique],
                        line=dict(width=1, color='DarkSlateGrey')
                    ),
                    line=dict(
                        color=color_map[technique],
                        width=2
                    )
                ))
            else:
                # Continuous version - every PCE value
                cumulative_count = list(range(1, len(df_tech) + 1))
                
                fig.add_trace(go.Scatter(
                    y=cumulative_count,
                    x=df_tech['PCE(Max)'].values,
                    mode='lines',
                    name=technique,
                    line=dict(
                        color=color_map[technique],
                        width=2
                    )
                ))
        
        # Create title
        if bin_size > 0:
            title = f'Cumulative Papers vs PCE (Bin Size: {bin_size}%)'
        else:
            title = 'Cumulative Papers vs PCE (Continuous)'
        
        fig.update_layout(
            title=title,
            yaxis_title='Number of Papers (Cumulative)',
            xaxis_title='PCE (Max) [%]',
            width=1000,
            height=600,
            template='plotly_white',
            showlegend=True,
            legend=dict(
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02
            ),
            hovermode='closest'
        )
        
        # Apply log scale if requested
        if log_x_histogram.value:
            fig.update_yaxes(type='log')
        
        fig.show()
        
        # Print statistics
        print(f"\nTotal papers with PCE data: {len(df_pce)}")
        for technique in techniques:
            count = len(df_pce[df_pce['Deposition Technique'] == technique])
            max_pce = df_pce[df_pce['Deposition Technique'] == technique]['PCE(Max)'].max()
            print(f"  - {technique}: {count} papers (max PCE: {max_pce:.2f}%)")

# Connect button
histogram_button.on_click(generate_pce_histogram)

# Display
display(VBox([
    widgets.HTML("<h3>Cumulative Papers vs PCE by Deposition Method</h3>"),
    widgets.HTML("<p>Set Bin Size to 0 for continuous curve</p>"),
    HBox([bin_size_widget, log_x_histogram]),
    histogram_button,
    histogram_output
]))

VBox(children=(HTML(value='<h3>Cumulative Papers vs PCE by Deposition Method</h3>'), HTML(value='<p>Set Bin Si…

In [10]:
# Get all unique deposition techniques from combined data
# Define standard deposition techniques
def get_all_deposition_techniques():
    return ['Blade coating', 'Slot-die coating', 'Gravure', 'Inkjet printing', 'Spin coating', 'Screen printing', 'Spray coating']

# Deposition technique filter for area vs PCE plot
deposition_area_filter = widgets.SelectMultiple(
    options=['All'] + get_all_deposition_techniques(),
    value=['All'],
    description='Deposition:',
    style={'description_width': '80px'},
    layout=Layout(width='350px', height='150px')
)

# Color by selector
color_area_by = widgets.Dropdown(
    options=['Deposition Technique', 'Data Source'],
    value='Deposition Technique',
    description='Color by:',
    style={'description_width': '80px'},
    layout=Layout(width='300px')
)

# Log scale checkbox
log_area_scale = widgets.Checkbox(
    value=True,
    description='Log Area Scale',
    style={'description_width': '100px'},
    layout=Layout(width='180px')
)

# Button to generate area vs PCE plot
area_pce_button = widgets.Button(
    description='Generate Plot',
    button_style='success',
    layout=Layout(width='200px', height='40px')
)

area_pce_output = widgets.Output()

def generate_area_pce_plot(b=None):
    with area_pce_output:
        clear_output(wait=True)
        
        # Get combined data
        df_combined = get_combined_data()
        
        # Filter by deposition technique
        if 'All' not in deposition_area_filter.value:
            df_combined = df_combined[df_combined['Deposition Technique'].isin(deposition_area_filter.value)]
        
        # Remove NaN values from PCE and Area
        df_plot = df_combined[['PCE(Max)', 'Area (cm²)', 'Data Source', 'Deposition Technique', 'DOI', 'Author(s)', 'Year', 'Module/Cell']].dropna(subset=['PCE(Max)', 'Area (cm²)'])
        
        if len(df_plot) == 0:
            print("No data available with current filters!")
            return
        
        color_by = color_area_by.value
        
        # Create scatter plot
        fig = go.Figure()
        
        # Get unique values for coloring, with Spin coating first (plotted first = in back)
        color_categories = sorted(df_plot[color_by].dropna().unique())
        if 'Spin coating' in color_categories:
            color_categories.remove('Spin coating')
            color_categories.insert(0, 'Spin coating')
        
        # Use Plotly color scales
        if len(color_categories) <= 10:
            colors = px.colors.qualitative.Set1[:len(color_categories)]
        elif len(color_categories) <= 24:
            colors = px.colors.qualitative.Light24[:len(color_categories)]
        else:
            colors = px.colors.sample_colorscale("hsv", [i/(len(color_categories)-1) for i in range(len(color_categories))])
        
        color_map = {cat: colors[i] for i, cat in enumerate(color_categories)}
        
        # Create hover template
        hover_template = (
            '<b>Deposition:</b> %{customdata[0]}<br>'
            '<b>Source:</b> %{customdata[1]}<br>'
            '<b>Authors:</b> %{customdata[2]}<br>'
            '<b>Year:</b> %{customdata[3]}<br>'
            '<b>Module/Cell:</b> %{customdata[4]}<br>'
            '<b>Area:</b> %{x:.4f} cm²<br>'
            '<b>PCE(Max):</b> %{y:.2f}%<br>'
            '<b>DOI:</b> %{customdata[5]}'
            '<extra></extra>'
        )
        
        for category in color_categories:
            df_category = df_plot[df_plot[color_by] == category]
            
            customdata = np.column_stack([
                df_category['Deposition Technique'].fillna('N/A'),
                df_category['Data Source'].fillna('N/A'),
                df_category['Author(s)'].fillna('N/A'),
                df_category['Year'].fillna('N/A'),
                df_category['Module/Cell'].fillna('N/A'),
                df_category['DOI'].fillna('N/A')
            ])
            
            fig.add_trace(go.Scatter(
                x=df_category['Area (cm²)'],
                y=df_category['PCE(Max)'],
                mode='markers',
                name=str(category),
                marker=dict(
                    size=8,
                    color=color_map[category],
                    line=dict(width=1, color='DarkSlateGrey'),
                    opacity=0.7
                ),
                customdata=customdata,
                hovertemplate=hover_template
            ))
        
        # Create title with filter info
        title = 'Active Area vs PCE (Max)'
        if 'All' not in deposition_area_filter.value:
            title += f'<br>Filtered by: {", ".join(deposition_area_filter.value)}'
        
        fig.update_layout(
            title=title,
            xaxis_title='Active Area (cm²)',
            yaxis_title='PCE (Max) [%]',
            width=1100,
            height=700,
            template='plotly_white',
            showlegend=True,
            legend=dict(
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02
            ),
            hovermode='closest'
        )
        
        # Apply log scale if requested
        if log_area_scale.value:
            fig.update_xaxes(type='log')
        
        fig.show()
        
        # Print statistics
        print(f"\nTotal papers with Area and PCE data: {len(df_plot)}")
        
        if color_by == 'Data Source':
            for source in sorted(df_plot['Data Source'].unique()):
                count = len(df_plot[df_plot['Data Source'] == source])
                print(f"  - {source}: {count} papers")
        else:
            for technique in sorted(df_plot['Deposition Technique'].dropna().unique()):
                count = len(df_plot[df_plot['Deposition Technique'] == technique])
                print(f"  - {technique}: {count} papers")
        
        print(f"\nArea Range: {df_plot['Area (cm²)'].min():.4f} - {df_plot['Area (cm²)'].max():.4f} cm²")
        print(f"PCE Range: {df_plot['PCE(Max)'].min():.2f}% - {df_plot['PCE(Max)'].max():.2f}%")
        print(f"Mean PCE: {df_plot['PCE(Max)'].mean():.2f}%")
        print(f"Median PCE: {df_plot['PCE(Max)'].median():.2f}%")

# Connect button
area_pce_button.on_click(generate_area_pce_plot)

# Display
display(VBox([
    widgets.HTML("<h3>Active Area vs PCE Scatter Plot</h3>"),
    HBox([color_area_by, log_area_scale]),
    deposition_area_filter,
    area_pce_button,
    area_pce_output
]))

VBox(children=(HTML(value='<h3>Active Area vs PCE Scatter Plot</h3>'), HBox(children=(Dropdown(description='Co…