# Creating Curve Ensemble Summaries 

In [1]:
from google.cloud import storage
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import io
import os
import gzip
import plotly.express as px
from scipy.spatial.distance import euclidean
from tqdm import tqdm
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.colors as mcolors
import similaritymeasures
import plotly.graph_objs as go
from time import time

In [2]:
# Import the SQL query builder from the bin_builder module under the shorter alias build_query.
from bin_builder import build_sir_query_for_categories as build_query
# Bare expression: shows the function's repr (module path and signature) as the cell output.
build_query

<function bin_builder.build_sir_query_for_categories(country_ids, run_ids, min_age, max_age, categories, grouped=True)>

In [3]:
# Limit OpenMP to a single thread to avoid the memory leak scikit-learn's KMeans warns about.
# NOTE(review): this cell runs after the imports cell, and the KMeans warning still shows up in
# the output of the clustering cell further down -- presumably the variable must be set before
# numpy/scikit-learn are imported to take effect. Confirm, and move it above the imports if so.
os.environ['OMP_NUM_THREADS'] = '1'

In [4]:
# Service account the key file belongs to (kept for reference; not read by any visible cell).
service_account_id = 'elijahsandler@net-data-viz-handbook.iam.gserviceaccount.com'

# Location of the service-account key file. GOOGLE_APPLICATION_CREDENTIALS takes precedence when
# it is set, so the notebook can run on machines other than the author's; otherwise the original
# local path is used, which keeps the previous behaviour unchanged.
credentials_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\elija\\Documents\\24f-coop\\net-data-viz-handbook-fe2c5531555d.json',
)
credentials = service_account.Credentials.from_service_account_file(credentials_path)

# Google Cloud project that is passed to the BigQuery client.
project = 'net-data-viz-handbook'

## Import data from BigQuery

In [23]:
# Construct the query parameters.
# NOTE: since the rest of this file is about curve ensemble summaries, we import all runs of one
# country for a specific age group, but you can ask for as many countries and runs as you want.
country_ids = [215]
run_ids = list(range(1, 101))  # run ids 1 through 100 inclusive
min_age = 13
max_age = 23
categories = ['Infectious']

# Generate SQL query with grouped=True (default behavior); this is the query executed below
query_grouped = build_query(country_ids, run_ids, min_age, max_age, categories, grouped=True)

# Generate SQL query with grouped=False (return individual bins); only displayed for inspection
query_separate = build_query(country_ids, run_ids, min_age, max_age, categories, grouped=False)

# Show the generated SQL text as the cell output
query_separate

'SELECT         date,         country_id,         run_id,         Infectious_13_17 AS infectious_13_17, Infectious_18_23 AS infectious_18_23     FROM `net-data-viz-handbook.sri_data.SIR_0_countries_incidence_daily`     WHERE country_id IN (215) AND run_id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)     GROUP BY date, country_id, run_id, Infectious_13_17, Infectious_18_23     ORDER BY date;'

In [24]:
# Initialize a BigQuery client authenticated with the service-account credentials
client = bigquery.Client(credentials=credentials, project=project)

# Run the grouped query (the separate-bins query built above is not executed)
query_job = client.query(query_grouped)

# Fetch the results into a pandas DataFrame
results = query_job.to_dataframe()

# Display the first few rows
results.head()



Unnamed: 0,date,country_id,run_id,total_infectious
0,2009-02-17,215,43,0
1,2009-02-17,215,67,0
2,2009-02-17,215,69,0
3,2009-02-17,215,12,0
4,2009-02-17,215,77,0


## Process data

In [25]:
# Reshape the long query result into wide form with pandas: one row per date, one column per
# run_id, cells holding total_infectious. Date/run combinations missing from the result become 0.
""" This is how we create a graphing dataframe for plotly. use df.pivot, select your index, column, and values 
    (which are columns of your original dataframe) """
df_pivot = results.reset_index().pivot(index='date', columns='run_id', values='total_infectious').fillna(0)

# Display the pivoted frame (each column is one simulated curve)
df_pivot

run_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-02-17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-02-13,12994,0,13649,14039,14690,0,14518,13564,14038,11972,...,0,14212,13497,13493,16202,0,0,0,0,15322
2010-02-14,12483,0,13170,13731,13947,0,13623,13137,13325,11363,...,0,13747,12917,13010,15525,0,0,0,0,14676
2010-02-15,19267,0,20866,21565,21944,0,21361,20537,21205,17788,...,0,21541,20579,20459,24187,0,0,0,0,22661
2010-02-16,20027,0,21190,22336,22723,0,21737,21302,21384,18326,...,0,22356,21225,20994,24950,0,0,0,0,24166


## Group and sort curves

In [26]:
def distance_between_curves(df, method=similaritymeasures.area_between_two_curves, downsample=1):
    """Computes the distance between every pair of curves in a dataframe.

    Each column of ``df`` is one curve and the index holds the dates the curves are sampled on.
    The dates are converted to whole days since 1970-01-01 so that every curve can be handed to
    ``method`` as a 2-column array of ``[day_number, value]`` rows.

    Note: all curves must have same domain to use AUC.

    Inputs:
        df (dataframe): dataframe where columns are curve values and the index can be parsed
                        by ``pd.to_datetime``. The caller's dataframe is not modified.
        method (func): a distance function from the https://pypi.org/project/similaritymeasures/
                       library; it is called as ``method(first_array, second_array)``.
        downsample (int): Factor to downsample the curve values.
                          E.g., if set to 2, every second point will be kept.
                          Values of 1 or less keep every point.

    Returns:
        curve_diff (dict): keys are (first_curve, second_curve) tuples of column labels, with
                           first_curve appearing before second_curve in ``df.columns``;
                           values are the distance between the two curves.

    Raises:
        TypeError: if ``method`` is not callable.
    """
    # Assert we are using a valid metric: fail before any work is done rather than mid-loop
    if not callable(method):
        raise TypeError("method must be a callable that takes two curve arrays")

    # Data cleanup
    # reset_index returns a new frame, so the conversions below never touch the caller's data
    df = df.reset_index(names='date')
    df['date'] = pd.to_datetime(df['date'])

    # Convert the datetime to an integer (number of days since epoch)
    df['date'] = (df['date'] - pd.Timestamp("1970-01-01")).dt.days
    df = df.set_index('date')

    # Downsample the curve values if needed
    if downsample > 1:
        df = df.iloc[::downsample]

    # Build each curve's [day_number, value] array once instead of once per pair.
    # Positional access (iloc) keeps this working even if column labels repeat.
    curve_names = list(df.columns)
    curve_arrays = [df.iloc[:, position].reset_index().to_numpy()
                    for position in range(len(curve_names))]

    # Loop through each pair of curves - computationally intensive part: O(n^2)
    curve_diff = dict()
    for first_pos, first_curve in enumerate(tqdm(curve_names)):
        first_array = curve_arrays[first_pos]
        for second_pos in range(first_pos + 1, len(curve_names)):
            curve_diff[(first_curve, curve_names[second_pos])] = \
                method(first_array, curve_arrays[second_pos])

    # Return dictionary
    return curve_diff

In [27]:
# Pairwise distances between all run curves using similaritymeasures.mse; downsample=1 keeps every date.
fr = distance_between_curves(df_pivot, method=similaritymeasures.mse, downsample=1)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.74it/s]


In [47]:
def generate_groups(curve_diff, k, random_state=0):
    """
    Uses k-means clustering to group curves based on their distance scores.

    The pairwise distances are assembled into a symmetric matrix, turned into similarities
    (``max distance - distance``), and each curve is then clustered using its row of
    similarities to all curves as its feature vector.

    Inputs:
        curve_diff (dict): Dictionary where keys are tuples representing pairs of curves, 
                           and values are the distance between them.
        k (int): Desired number of groups.
        random_state (int or None, optional): Seed passed to KMeans so that repeated runs give
                           the same grouping. Pass None for an unseeded (run-to-run varying)
                           clustering. Default is 0.

    Returns:
        labels (array): Integer labels [0, k) for each curve, ordered by the sorted curve names
                        (labels[i] belongs to the i-th curve in sorted order).

    Raises:
        ValueError: if ``curve_diff`` is empty, since there is nothing to cluster.
    """
    # An empty input would otherwise fail later with an unrelated zero-size-array error
    if not curve_diff:
        raise ValueError("curve_diff is empty; at least one pair of curves is required")

    # Get all unique elements
    elements = sorted(set().union(*curve_diff.keys()))
    n = len(elements)

    # Create a mapping from curves to indices
    element_idx = {elem: idx for idx, elem in enumerate(elements)}

    # Create the distance matrix (pairs that are absent, and the diagonal, stay at 0)
    distance_matrix = np.zeros((n, n))
    for (curve1, curve2), score in curve_diff.items():
        idx1, idx2 = element_idx[curve1], element_idx[curve2]
        distance_matrix[idx1, idx2] = score
        distance_matrix[idx2, idx1] = score  # Matrix is symmetric

    # Convert distance matrix to similarity matrix
    similarity_matrix = np.max(distance_matrix) - distance_matrix

    # Fit K-Means clustering; seeding makes the grouping reproducible across kernel restarts
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    labels = kmeans.fit_predict(similarity_matrix)

    return labels

In [92]:
# Split the curves into 2 clusters; labels[i] is the cluster of the i-th curve in sorted-name order.
labels = generate_groups(fr, 2)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [93]:
# def get_centrality(curve_diff, labels):
#     # get unique labels
#     unique_labels = set(labels)

#     df_scores = pd.DataFrame(curve_diff, index=['score']).T.reset_index(names=['curve1', 'curve2'])

#     # dict for storing centrality scores
#     centrality = {c: 0 for c in set(c for pair in curve_diff.keys() for c in pair)}

#     # for each group...
#     for label in unique_labels:

#         # find which curves are in that group
#         curves = np.where(labels == label)[0]

#         # get only scores within the group
#         df_gr = df_scores[df_scores['curve1'].isin(curves) & df_scores['curve2'].isin(curves)]

#         for curve in curves:
#             centrality[curve] = df_gr[(df_gr['curve1'] == curve) | (df_gr['curve2'] == curve)]['score'].sum()
#     return centrality

In [94]:
def get_centrality(curve_diff, labels):
    """
    Computes centrality scores for each curve based on their group assignments.

    A curve's score is the sum of its distances to the other curves in the same group, so a
    lower score means the curve sits closer to the rest of its group. Distances between curves
    in different groups are ignored.

    Inputs:
        curve_diff (dict): Dictionary where keys are tuples representing pairs of curves, 
                           and values are the distance between them.
        labels (array or list): Cluster label for each curve, aligned with the curve names in
                           sorted order (labels[i] belongs to the i-th sorted curve name).

    Returns:
        centrality (dict): Centrality scores for each curve, keyed by curve name in sorted order.
                           Curves with no same-group partner score 0.

    Raises:
        ValueError: if ``labels`` does not contain exactly one label per curve.
    """
    # Get all unique curves; labels are matched to them by position in this sorted order
    unique_curves = sorted(set(c for pair in curve_diff.keys() for c in pair))

    # Accept plain lists as well as arrays (a list compared with == would match nothing)
    labels = np.asarray(labels)
    if labels.shape != (len(unique_curves),):
        raise ValueError(
            f"expected {len(unique_curves)} labels (one per curve), got shape {labels.shape}"
        )

    # Map every curve name to its group label
    group_of = dict(zip(unique_curves, labels.tolist()))

    # Initialize a centrality dictionary with curve names as keys
    centrality = {curve: 0.0 for curve in unique_curves}

    # Single pass over the pairs: a within-group distance counts toward both of its curves
    for (curve1, curve2), score in curve_diff.items():
        if group_of[curve1] != group_of[curve2]:
            continue
        centrality[curve1] += score
        if curve2 != curve1:  # a self-pair must only be counted once
            centrality[curve2] += score

    return centrality

In [95]:
# Within-group centrality score for every curve (the trailing ';' suppresses the cell output).
centrality = get_centrality(fr, labels);

In [96]:
# def get_most_central(labels, centrality, percentile=50):
#     """ gets most central curves for each group based on centrality scores """
#     curves = dict()
    
#     for group in set(labels):
#         # this is the greatest line of code of all time
#         curves[group] = pd.Series(centrality)[np.where(labels == group)[0]]\
#         .sort_values().reset_index(drop=False).loc[:(50/100) * (len(np.where(labels == group)[0]))]['index']
        
#     return curves

In [97]:
def get_most_central(labels, centrality, percentile=50):
    """
    Gets the most central curves for each group based on centrality scores.

    Within each group the curves are ranked by ascending centrality score (lowest score first)
    and the first ``percentile`` percent of them are kept.

    Inputs:
        labels (array or list): Cluster label for each curve, aligned by position with the keys
                           of ``centrality``.
        centrality (dict): Dictionary of centrality scores for each curve.
        percentile (float, optional): Percentile threshold to select the most central curves.
                           Default is 50. Values outside [0, 100] are clamped.

    Returns:
        curves (dict): A dictionary where the keys are group labels and the values are lists of curve names 
                       that are the most central in each group, ordered from lowest to highest
                       score. The list length is floor(percentile% of the group size), so very
                       small groups can yield an empty list.
    """
    # Accept plain lists as well as arrays (a list compared with == would match nothing)
    labels = np.asarray(labels)

    # Get unique curves from the centrality dictionary
    unique_curves = list(centrality.keys())

    # Initialize dictionary to store results
    curves = dict()

    # Loop over each group
    for group in np.unique(labels):
        # Get the curve names corresponding to the current group label
        curves_in_group = [unique_curves[i] for i in np.where(labels == group)[0]]

        # Number of curves to keep; multiplying before dividing avoids float truncation
        # (e.g. 29% of 100 must give 29, not 28), and the result is clamped to the group size
        n_top_curves = int(percentile * len(curves_in_group) / 100)
        n_top_curves = min(max(n_top_curves, 0), len(curves_in_group))

        # Rank by centrality and keep exactly n_top_curves entries (a plain slice excludes its
        # end, unlike label-based .loc slicing, which would return one extra curve)
        ranked = sorted(curves_in_group, key=centrality.__getitem__)

        # Store the most central curves for the group
        curves[group] = ranked[:n_top_curves]

    return curves

In [98]:
# Select the most central curves per group with the default percentile=50; the result is not
# stored, and the trailing ';' hides the returned dict from the cell output.
get_most_central(labels, centrality);

In [99]:
def normalize_column(column, flip=True):
    """Min-max scales ``column`` so its values span 0 to 1, optionally inverted.

    Inputs:
        column: object exposing ``min()``/``max()`` and elementwise arithmetic
                (the notebook passes a pandas Series).
        flip (bool, optional): when truthy the scale is inverted, so the smallest input
                               maps to 1 and the largest to 0. Default is True.

    Returns:
        The rescaled values, same type as the arithmetic on ``column`` produces.

    Note: nothing guards against ``max == min``; in that case the division is by a zero range.
    """
    lowest = column.min()
    value_range = column.max() - lowest
    scaled = (column - lowest) / value_range
    return 1 - scaled if flip else scaled

In [100]:
def color_to_rgba(color_name, value):
    """Builds an ``rgba(...)`` string that fades a named color toward light gray.

    Inputs:
        color_name: any color specification accepted by ``matplotlib.colors.to_rgb``.
        value (float): blend weight and alpha; the notebook passes values between 0 and 1.
                       At 1 the result is the pure color, at 0 it is gray at 0.8 intensity.

    Returns:
        str: ``'rgba(R, G, B, value)'`` with R, G, B truncated to integers on a 0-255 scale.
    """
    base_channels = mcolors.to_rgb(color_name)

    # Gray contribution shared by all three channels; it grows as value shrinks
    gray = (1 - value) * 0.8

    # Mix each channel with the gray level, then scale to 0-255 (int() truncates)
    red, green, blue = (int((channel * value + gray) * 255) for channel in base_channels)

    return f'rgba({red}, {green}, {blue}, {value})'

In [101]:
# Example call: cyan at value 0.4; the returned rgba string is shown as the cell output.
color_to_rgba('cyan', .4)

'rgba(122, 224, 224, 0.4)'

In [102]:
# Per-curve plotting info, indexed by curve name: the normalized centrality score
# (normalize_column flips by default, so the lowest raw score becomes 1) and the group label.
df_info = normalize_column(pd.Series(centrality)).to_frame(name='centrality')
df_info['group'] = labels  # positional assignment: labels must follow the order of centrality's keys
df_info

Unnamed: 0,centrality,group
1,0.813483,0
2,1.000000,1
3,0.884475,0
4,0.824983,0
5,0.835490,0
...,...,...
96,1.000000,1
97,1.000000,1
98,1.000000,1
99,1.000000,1


In [103]:
# Fixed palette: group label i is drawn with colors[i], so up to six groups are supported.
colors = ['red', 'blue', 'green', 'cyan', 'yellow', 'gray']
cmap = dict(enumerate(colors))

In [104]:
# Plot every curve as a line, colored by group and faded/thinned by its normalized centrality.
fig = go.Figure()

# Create a set to track groups already added to the legend
legend_groups = set()

# Unused experiment, kept for reference (7-row rolling median of the curves):
# df_roll = df_pivot.rolling(7).median().dropna()

# Traces are added in ascending order of normalized centrality, so the highest values go last
for curve in df_info.sort_values('centrality', ascending=True).index:
    c_gr = df_info['group'][curve]  # group label of this curve
    c_cen = df_info['centrality'][curve]  # normalized centrality of this curve

    # Check if the group has already been added to the legend
    show_legend = c_gr not in legend_groups
    if show_legend:
        legend_groups.add(c_gr)  # Add group to the set

    fig.add_trace(go.Scatter(
        name=f'Group {c_gr}' if show_legend else "",  # Only add name if it's the first curve in the group
        x=df_pivot.index,
        y=df_pivot[curve],
        # Group color blended toward gray (and made more transparent) as c_cen decreases
        marker=dict(color=color_to_rgba(cmap[c_gr], c_cen)),
        # Width grows quadratically with c_cen; for c_cen in [0, 1] it never exceeds 0.25
        line=dict(width=(c_cen/2)**2),
        mode='lines',
        showlegend=show_legend,  # Show legend only for the first curve of the group
        legendgroup=str(c_gr)  # Assign to legend group
    ))

fig.show()
