# NOMAD Data Explorer

This notebook demonstrates how to retrieve and analyze data from the NOMAD API.

## Authentication

We'll use the authentication module from `nomad_auth.ipynb` to handle NOMAD API authentication.

In [None]:
# Import the authentication module from nomad_auth.ipynb
%run './nomad_auth.ipynb'

# After running nomad_auth, the following variables are available:
# - api_client: NomadClient instance for making API calls
# - current_token: The authenticated token
# - current_user_info: Information about the authenticated user

# Import our data retrieval module
from nomad_api.data import (
    get_all_samples_with_authors,
    get_user_details,
    get_all_unique_authors,
    create_author_name_map,
    query_sample_entries
)

# Import other required libraries
import datetime
import json
import os
import sys
from collections import Counter
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Display the authentication UI.
# `display_auth_ui` is not imported in this cell, so it is expected to be
# defined by the `%run` of nomad_auth.ipynb at the top of the cell.
# NOTE(review): presumably the login is interactive — under "Run All" the
# cells below may execute before authentication completes; confirm.
display_auth_ui()

## Data Exploration

Now that we're authenticated, we can start exploring NOMAD data.

In [None]:
# Define the query payload to find samples: owner "visible", ELN section
# "HySprint_Sample", and a "data" quantity must all match.
query_payload = {
    "owner": "visible",
    "query": {
        "and": [
            {"results.eln.sections:any": ["HySprint_Sample"]},
            {"quantities:all": ["data"]},
        ]
    },
}

# Start from an empty response so the inspection cells that follow can still
# run (and report that nothing was found) when no request is made below.
# Without this, `response_entries` would be undefined for an unauthenticated
# session and those cells would fail with a NameError.
response_entries = {}

# Make sure we have an authenticated client
if not api_client:
    print("Please authenticate first using the UI above")
else:
    # Make the API request
    response_entries = api_client.make_request(
        "post", "entries/query", json_data=query_payload
    )

    # Print the number of entries found
    print(f"Found {response_entries.get('pagination', {}).get('total', 0)} matching entries")


In [None]:
# Report how many entries came back and which keys the first one exposes
if not ('data' in response_entries and response_entries['data']):
    print("No data found or you need to authenticate first.")
else:
    returned_entries = response_entries['data']
    print(f"Number of entries returned: {len(returned_entries)}")
    # The first entry's keys give a quick picture of the record layout
    print("\nKeys in the first entry:")
    print(returned_entries[0].keys())

In [None]:
# Look at the first returned entry more closely, when there is one
first_page = response_entries['data'] if 'data' in response_entries else None
if first_page:
    entry = first_page[0]
    print(f"Entry ID: {entry.get('entry_id')}")
    print(f"Upload ID: {entry.get('upload_id')}")
    # The lab ID lives inside the entry's own 'data' section and is optional
    if 'data' in entry:
        entry_data = entry['data']
        if 'lab_id' in entry_data:
            print(f"Lab ID: {entry_data['lab_id']}")

# Collect Author Information for All Samples

This section demonstrates how to collect information about authors for all samples in the NOMAD database.

In [None]:
# Note: We are now using the imported functions from nomad_api.data
# No need to redefine these functions in the notebook

In [None]:
# Fetch samples together with their author information.
# The call is capped with max_pages=5 (page_size=50) for testing; drop the
# max_pages argument to retrieve all samples.
samples_data = get_all_samples_with_authors(
    api_client,
    max_pages=5,
    page_size=50,
)

In [None]:
# Load the collected sample records into a DataFrame for tabular analysis
samples_df = pd.DataFrame(data=samples_data)

# Preview the leading rows
samples_df.head(5)

In [None]:
# Save the data to CSV for backup and further analysis.
# The file name is defined once so the export and the confirmation message
# cannot drift apart.
samples_csv_path = 'nomad_samples_with_authors.csv'
samples_df.to_csv(samples_csv_path, index=False)
print(f"Data saved to '{samples_csv_path}'")

# Enriching Author Information

To make the data more useful for dashboards, let's enrich it by retrieving user details for the author IDs.

In [None]:
# Note: We're now using the imported get_user_details function from nomad_api.data

In [None]:
# Collect the distinct author IDs that occur in the fetched samples
unique_authors = get_all_unique_authors(samples_data)
unique_author_total = len(unique_authors)
print(f"Found {unique_author_total} unique authors")

In [None]:
# Build the author-ID -> display-name lookup with the imported helper
user_id_to_name = create_author_name_map(api_client, samples_data)
mapped_author_total = len(user_id_to_name)
print(f"Created name mapping for {mapped_author_total} authors")

In [None]:
# Resolve each main-author ID to a display name; IDs that are not in the
# mapping are labelled 'Unknown'
samples_df['main_author_name'] = samples_df['main_author'].map(
    lambda author_id: user_id_to_name.get(author_id, 'Unknown')
)

# For coauthors (which is a list), add a new column with names
def get_coauthor_names(coauthor_ids, id_to_name=None):
    """Translate a list of co-author IDs into display names.

    Args:
        coauthor_ids: Co-author user IDs for one sample. Any value that is
            not a ``list`` (for example a missing value) is treated as
            "no co-authors".
        id_to_name: Optional mapping from user ID to display name. When
            omitted, the notebook-level ``user_id_to_name`` mapping is used,
            so existing single-argument calls keep working.

    Returns:
        A list with one name per ID, in the same order as the input. IDs that
        are missing from the mapping are reported as ``'Unknown'``.
    """
    # Non-list values and empty lists both yield no names; returning early
    # also means the mapping is only looked up when it is actually needed.
    if not isinstance(coauthor_ids, list) or not coauthor_ids:
        return []
    if id_to_name is None:
        # Resolved at call time so the current notebook mapping is used
        id_to_name = user_id_to_name
    return [id_to_name.get(user_id, 'Unknown') for user_id in coauthor_ids]

# Store the translated co-author names next to the raw IDs
samples_df['coauthor_names'] = samples_df['coauthors'].apply(get_coauthor_names)

# Preview the author-related columns
author_preview_columns = ['entry_id', 'main_author', 'main_author_name', 'coauthor_names']
samples_df[author_preview_columns].head()

# Data Analysis and Dashboard Preparation

Now let's analyze the data and create some visualizations for our dashboard.

In [None]:
# 1. Tally how many samples each main author has (most frequent first)
author_sample_counts = samples_df['main_author_name'].value_counts()

# Bar chart restricted to the 15 most frequent main authors
top_author_counts = author_sample_counts.head(15)
plt.figure(figsize=(12, 6))
top_author_counts.plot.bar()
plt.title('Number of Samples by Main Author (Top 15)', fontsize=14)
plt.xlabel('Author')
plt.ylabel('Number of Samples')
# Slanted, right-aligned tick labels keep long author names readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# 2. Analyze sample creation over time

# Parse the upload timestamps, then derive a monthly period from them
parsed_upload_times = pd.to_datetime(samples_df['upload_create_time'])
samples_df['upload_date'] = parsed_upload_times
samples_df['year_month'] = samples_df['upload_date'].dt.to_period('M')

# Number of samples in each month
time_series = samples_df.groupby('year_month').size()

# Line chart of the monthly counts
plt.figure(figsize=(14, 6))
time_series.plot.line(marker='o')
plt.title('Sample Uploads Over Time', fontsize=14)
plt.xlabel('Date (Year-Month)')
plt.ylabel('Number of Samples')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# 3. Analyze collaboration patterns (co-authorship)

# Number of co-authors per sample; anything that is not a list counts as 0
samples_df['coauthor_count'] = samples_df['coauthors'].apply(
    lambda ids: len(ids) if isinstance(ids, list) else 0
)

# Histogram with one integer-wide bin per co-author count
plt.figure(figsize=(10, 6))
per_sample_counts = samples_df['coauthor_count']
bin_stop = max(per_sample_counts) + 2
sns.histplot(per_sample_counts, bins=range(bin_stop))
plt.title('Distribution of Co-authors per Sample', fontsize=14)
plt.xlabel('Number of Co-authors')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# 4. Create collaboration network data (for network visualization)

# One (main author name, co-author name) pair per co-author of every sample.
# Rows whose 'coauthors' value is not a list contribute nothing, and co-author
# IDs that are missing from the name mapping are labelled 'Unknown'.
# The two columns are zipped directly instead of iterating row objects, since
# only these two values are needed per sample.
collaborations = [
    (main_author_name, user_id_to_name.get(coauthor_id, 'Unknown'))
    for main_author_name, coauthor_ids in zip(
        samples_df['main_author_name'], samples_df['coauthors']
    )
    if isinstance(coauthor_ids, list)
    for coauthor_id in coauthor_ids
]

# Count frequency of each collaboration (pairs are ordered: main author first)
collaboration_counts = Counter(collaborations)

# Print the top 10 most frequent collaborations
print("Top 10 collaborations:")
for (main_name, coauthor_name), count in collaboration_counts.most_common(10):
    print(f"{main_name} ↔ {coauthor_name}: {count} samples")

# Prepare Data for Interactive Dashboard

Let's prepare and save structured data for an interactive dashboard.

In [None]:
# Convert our dataframe to more dashboard-friendly formats

# 1. Summary statistics
# Every value is cast to a built-in int/float/str so that json can write it.
sample_total = len(samples_df)
published_total = samples_df['published'].sum()
has_coauthors = samples_df['coauthor_count'] > 0

dashboard_stats = {
    'total_samples': int(sample_total),
    'total_authors': int(len(unique_authors)),
    'published_samples': int(published_total),
    'private_samples': int(sample_total - published_total),
    'samples_with_coauthors': int(has_coauthors.sum()),
    'avg_coauthors_per_sample': float(samples_df['coauthor_count'].mean()),
    # author_sample_counts comes from value_counts(), so position 0 is the
    # author with the most samples
    'most_prolific_author': str(author_sample_counts.index[0]),
    'most_prolific_author_count': int(author_sample_counts.iloc[0]),
    'most_recent_upload': samples_df['upload_date'].max().strftime('%Y-%m-%d'),
    'oldest_upload': samples_df['upload_date'].min().strftime('%Y-%m-%d'),
}

# Write the summary to disk
with open('dashboard_summary_stats.json', 'w') as stats_file:
    json.dump(dashboard_stats, stats_file, indent=2)

print("Summary statistics saved for dashboard")

In [None]:
# 2. Time series data (monthly uploads)
# Period labels become strings and the counts plain Python ints so that both
# lists are JSON-serializable.
month_labels = list(map(str, time_series.index))
monthly_counts = list(map(int, time_series.values))
time_series_data = {'dates': month_labels, 'counts': monthly_counts}

# Write the monthly series to disk
with open('dashboard_time_series.json', 'w') as series_file:
    json.dump(time_series_data, series_file)

# 3. Author statistics
# Reverse lookup (display name -> author ID), built once instead of scanning
# the whole mapping for every author. setdefault keeps the first ID seen for a
# name, which matches a first-match scan over the mapping. The loop variables
# deliberately avoid the name `id`, which would shadow the builtin for the
# rest of the notebook session.
name_to_user_id = {}
for user_id, user_name in user_id_to_name.items():
    name_to_user_id.setdefault(user_name, user_id)

author_stats = [
    {
        'author_name': author_name,
        # None when no ID in the mapping carries this name (e.g. 'Unknown')
        'author_id': name_to_user_id.get(author_name),
        'sample_count': int(count),
        # Additional metrics could be added here
    }
    for author_name, count in author_sample_counts.items()
]

# Save author statistics
with open('dashboard_author_stats.json', 'w') as f:
    json.dump(author_stats, f, indent=2)

# 4. Collaboration network data
# Node names are every mapped author name plus every name used as a link
# endpoint. Link endpoints can include the 'Unknown' placeholder used for IDs
# missing from the mapping; emitting it as a node keeps each link's
# source/target resolvable against the node list.
link_endpoints = {name for pair in collaboration_counts for name in pair}
node_names = set(user_id_to_name.values()) | link_endpoints

network_data = {
    'nodes': [
        # size: number of samples as main author (1 when the author has none);
        # cast to a built-in int so json can serialize it directly
        {'id': author, 'group': 1, 'size': int(author_sample_counts.get(author, 1))}
        for author in node_names
    ],
    'links': [
        {'source': source, 'target': target, 'value': count}
        for (source, target), count in collaboration_counts.items()
    ],
}

# Save network data
with open('dashboard_network.json', 'w') as f:
    json.dump(network_data, f, indent=2)

print("Data for interactive dashboard saved successfully.")

# Summary

We've collected author information for samples from NOMAD, structured this data for analysis, and created visualizations. Note that the retrieval cell above is limited to `max_pages=5` for testing; remove that argument to cover all samples. The data has been saved in formats suitable for building interactive dashboards.

The following files were created:

1. `nomad_samples_with_authors.csv` - Raw data with all samples and their author information
2. `dashboard_summary_stats.json` - Summary statistics for the dashboard
3. `dashboard_time_series.json` - Time series data for charts
4. `dashboard_author_stats.json` - Detailed author statistics
5. `dashboard_network.json` - Collaboration network data for network visualizations

These can be used with dashboard frameworks like Plotly Dash, Streamlit, or web-based visualization libraries like D3.js to create interactive dashboards.