Making Sankey!

In [1]:
# import the used libraries
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as pyo
import random

In [2]:
# Load the portal/GCM table from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='Final_Portal_GCM_mappedcountries.csv')

In [None]:
# Print the number of distinct values found in each column, one count per line,
# in the order Portal, Country, Institute, Model, Downscaling
for column_name in ('Portal', 'Country', 'Institute', 'Model', 'Downscaling'):
    print(data[column_name].nunique())

In [4]:
# Colorblind-friendly base palette: 15 hex colors, listed five per row
portal_colors = [
    '#68023F', '#008169', '#EF0096', '#00DCB5', '#FFCFE2',
    '#003C86', '#9400E6', '#009FFA', '#FF71FD', '#7CFFFA',
    '#6A0213', '#008607', '#F60239', '#00E307', '#FFDC3D',
]

In [5]:
def blend_with_white(color, alpha):
    """
    Blend a color with white.

    There are a lot of nodes in this Sankey diagram, so lighter shades of the
    colorblind-friendly palette are generated to get enough distinct colors.

    Args:
    - color (str): Hexadecimal color string of the form "#RRGGBB"
        (e.g., "#FF0000" for red). Upper- and lowercase digits are accepted.
    - alpha (float): Blending factor between 0 and 1.
        0 will give original color, 1 will give white.

    Returns:
    - str: Uppercase "#RRGGBB" color string after blending.

    Raises:
    - ValueError: If `color` is not a "#RRGGBB" string or `alpha` is outside [0, 1].
    """
    hex_digits = set("0123456789abcdefABCDEF")
    if (
        not isinstance(color, str)
        or len(color) != 7
        or color[0] != "#"
        or not set(color[1:]) <= hex_digits
    ):
        raise ValueError("color must be a '#RRGGBB' hex string, got {!r}".format(color))

    # Without this check an out-of-range alpha pushes channels outside 0..255,
    # which would format into a malformed color string instead of failing.
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be between 0 and 1, got {!r}".format(alpha))

    blended = []
    for start in (1, 3, 5):
        channel = int(color[start:start + 2], 16)
        # Move the channel toward 255 (white) by the fraction `alpha`;
        # int() truncates the fractional part.
        blended.append(int(channel + (255 - channel) * alpha))

    return "#{:02X}{:02X}{:02X}".format(*blended)

colors = [
    "#68023F", "#008169", "#EF0096", "#00DCB5", "#FFCFE2",
    "#003C86", "#9400E6", "#009FFA", "#FF71FD", "#7CFFFA",
    "#6A0213", "#008607", "#F60239", "#00E307", "#FFDC3D"
]

# Assign base colors to portals
portal_colors = colors

# One shade is needed for every non-portal node (model, country, institute,
# downscaling). The count is derived from the data instead of being hard-coded,
# because `node_colors` is built as other_node_colors + portal_colors and the
# portal colors only line up with the portal labels when this count is exact.
n_other_nodes = sum(
    len(data[column_name].unique())
    for column_name in ('Model', 'Country', 'Institute', 'Downscaling')
)

# A dedicated, seeded generator keeps the node colors identical on every run
# without touching the global `random` state.
COLOR_SEED = 42
color_rng = random.Random(COLOR_SEED)

# Generate shades for the other nodes
other_node_colors = [
    blend_with_white(color_rng.choice(colors), color_rng.uniform(0.3, 0.7))
    for _ in range(n_other_nodes)
]

# `portal_colors` is used for the portal nodes; `other_node_colors` supplies the
# lighter shades for the model, country, institute and downscaling nodes.


In [6]:
# Node color list: the generated shades first, followed by the portal base colors
node_colors = [*other_node_colors, *portal_colors]

In [5]:
# Collect the distinct values of each column as plain Python lists
institutes, models, portals, countries, downscaling = (
    data[column_name].unique().tolist()
    for column_name in ('Institute', 'Model', 'Portal', 'Country', 'Downscaling')
)

In [6]:
# Replace each list of unique values with a sorted copy
models, countries, institutes, portals, downscaling = map(
    sorted, (models, countries, institutes, portals, downscaling)
)

In [7]:
# Node labels in diagram order: models, countries, institutes, downscaling, portals
labels = [*models, *countries, *institutes, *downscaling, *portals]

In [None]:
# This highlights US paths and brings them in front of all other paths

# Link endpoints are positions in `labels`, which is ordered as
# models + countries + institutes + downscaling + portals. Each group therefore
# starts at the combined size of the groups listed before it.
country_offset = len(models)
institute_offset = country_offset + len(countries)
# Downscaling nodes get no links in this diagram, but they still occupy slots in
# `labels` and `node_colors` between institutes and portals. Their count has to be
# skipped so portal links hit the portal labels rather than downscaling labels.
portal_offset = institute_offset + len(institutes) + len(downscaling)

# Build label -> node index maps once instead of scanning the lists for every row
model_node = {name: position for position, name in enumerate(models)}
country_node = {name: country_offset + position for position, name in enumerate(countries)}
institute_node = {name: institute_offset + position for position, name in enumerate(institutes)}
portal_node = {name: portal_offset + position for position, name in enumerate(portals)}

# Get source, target, value, and color lists
source = []
target = []
value = []
link_colors = []

# Non-USA rows are added first (gray) and USA rows last (blue), keeping the
# original ordering whose purpose is to have the USA paths on top.
is_usa = data['Country'] == 'USA'
for rows, path_color in ((data[~is_usa], 'lightgray'), (data[is_usa], '#003C86')):
    for model, country, institute, portal in zip(
        rows['Model'], rows['Country'], rows['Institute'], rows['Portal']
    ):
        path_links = (
            (model_node[model], country_node[country]),            # model -> country
            (country_node[country], institute_node[institute]),    # country -> institute
            (institute_node[institute], portal_node[portal]),      # institute -> portal
        )
        for link_source, link_target in path_links:
            source.append(link_source)
            target.append(link_target)
            value.append(1)  # each row in the csv counts as 1
            link_colors.append(path_color)

# Plot the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color=node_colors  # use the set node colors
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors  # blue for USA paths, gray for all others
    )
)])

fig.update_layout(title_text="Model - Country - Institute - Portal Relationships with USA Highlighted", font_size=10)
fig.show()
# Also write the figure to a standalone HTML file
pyo.plot(fig, filename='USA_On_Top_Sankey.html')


In [None]:
# Institute -> Downscaling -> Portal diagram with every NASA path highlighted

# Update labels to include downscalings; link endpoints below are positions in this list
labels = models + countries + institutes + downscaling + portals

# Each group starts at the combined size of the groups listed before it in `labels`
institute_offset = len(models) + len(countries)
downscaling_offset = institute_offset + len(institutes)
portal_offset = downscaling_offset + len(downscaling)

# Build label -> node index maps once instead of scanning the lists for every row
institute_node = {name: institute_offset + position for position, name in enumerate(institutes)}
downscaling_node = {name: downscaling_offset + position for position, name in enumerate(downscaling)}
portal_node = {name: portal_offset + position for position, name in enumerate(portals)}

# Get source, target, value, and color lists
source = []
target = []
value = []
link_colors = []

# Non-NASA rows are added first (gray) and NASA rows last (blue), so the
# highlighted links come after all gray links in the lists.
is_nasa = data['Institute'] == 'NASA'
for rows, path_color in ((data[~is_nasa], 'lightgray'), (data[is_nasa], '#003C86')):
    for institute, method, portal in zip(rows['Institute'], rows['Downscaling'], rows['Portal']):
        # Link from institute to downscaling
        source.append(institute_node[institute])
        target.append(downscaling_node[method])
        value.append(1)
        link_colors.append(path_color)

        # Link from downscaling to portal
        source.append(downscaling_node[method])
        target.append(portal_node[portal])
        value.append(1)
        link_colors.append(path_color)

# Plot the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color=node_colors  # use the set node colors
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors  # blue for NASA paths, gray for all others
    )
)])

fig.update_layout(title_text="Institute - Downscaling - Portal Relationships with NASA Highlighted", font_size=10)
fig.show()
# Also write the figure to a standalone HTML file
pyo.plot(fig, filename='downscaling.html')