# Creating Curve Ensemble Summaries 

In [24]:
from google.cloud import storage
import pandas as pd
import io
import os
import gzip
import plotly.express as px
from scipy.spatial.distance import euclidean # yay sci
from tqdm import tqdm
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.colors as mcolors
import similaritymeasures
import plotly.graph_objs as go

In [25]:
# Limit OpenMP to one thread. scikit-learn warns that KMeans can leak memory
# on Windows with MKL when there are fewer chunks than available threads, and
# suggests OMP_NUM_THREADS=1 as the workaround.
# NOTE(review): this runs after numpy/scikit-learn were imported, which may be
# too late for the setting to take effect -- confirm, and if so set it before
# the imports.
os.environ['OMP_NUM_THREADS'] = '1'

In [26]:
# Presumably the service account the key file below belongs to.
# NOTE(review): this variable is not referenced anywhere else in the visible code.
service_account_id = 'elijahsandler@net-data-viz-handbook.iam.gserviceaccount.com'

# Tell the Google Cloud client where to find the service-account key file.
# NOTE(review): hard-coded, machine-specific absolute path -- anyone else
# running this notebook has to edit it (or set the variable outside the notebook).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\elija\\Documents\\24f-coop\\net-data-viz-handbook-fe2c5531555d.json'

## Import data from Google Cloud

In [27]:
def _read_gzipped_csv(blob):
    """Download a gzip-compressed CSV blob and parse it into a DataFrame.

    Inputs:
        blob: storage blob exposing download_as_bytes(), pointing at a .csv.gz object.

    Returns:
        (dataframe): contents of the decompressed CSV.
    """
    # Pull the whole compressed object into memory, then decompress on the fly
    compressed_content = blob.download_as_bytes()
    with gzip.GzipFile(fileobj=io.BytesIO(compressed_content)) as gz:
        return pd.read_csv(gz)


# Initialize a GCS client
client = storage.Client()

# Specify your bucket name and the specific .csv.gz files you want
bucket_name = 'gs_net-data-viz-handbook'
file_name = 'sample/sample_SIR_0_countries_incidence_daily.csv.gz'  # Update this to the specific file name
meta_file = 'sample/sample_SIR_0_meta.csv.gz'

# Get the bucket and one blob per file
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_name)
metablob = bucket.blob(meta_file)

# Both files share the same download/decompress/parse path
df = _read_gzipped_csv(blob)
df_meta = _read_gzipped_csv(metablob)

## Process data

In [28]:
# Discard the 't' column, then total every remaining column for each
# (date, country_id, run_id) combination.
df_sum = (
    df.drop(columns=['t'])
    .groupby(['date', 'country_id', 'run_id'])
    .sum()
)

In [29]:
# Keep a single country: take its cross-section (which also removes the
# 'country_id' index level), then add up all columns of each row so every
# (date, run_id) pair is left with one number, stored under column 0.
country = 0
df_country = (
    df_sum.xs(country, level='country_id')
    .T.sum()
    .reset_index()
)

# Reshape long -> wide: one row per date, one column per run_id.
# Date/run combinations with no data become 0.
df_pivot = df_country.pivot(index='date', columns='run_id', values=0).fillna(0)

# Shift run_id down by one so the column labels start at 0
df_pivot.columns = df_pivot.columns - 1
df_pivot

run_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-02-17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-02-13,54,0,11,8,12,0,31,33,17,19,...,0,29,99,25,14,0,0,0,0,25
2010-02-14,47,0,8,14,10,0,18,29,16,10,...,0,33,70,9,17,0,0,0,0,11
2010-02-15,49,0,9,17,24,0,27,29,23,14,...,0,33,77,15,14,0,0,0,0,21
2010-02-16,56,0,4,22,13,0,16,31,10,11,...,0,28,60,21,21,0,0,0,0,16


## Group and sort curves

In [30]:
def distance_between_curves(df, method=similaritymeasures.area_between_two_curves, downsample=1):
    """Gets the distance between every pair of curves using a chosen algorithm.

    Note: all curves must have same domain to use AUC.

    Inputs:
        df (dataframe): index holds dates (anything pd.to_datetime can parse),
                        each column holds the values of one curve.
        method (func): a distance function from the https://pypi.org/project/similaritymeasures/ library.
                       It is called as method(curve_a, curve_b), where each curve is an
                       (n_points, 2) array of [days since 1970-01-01, value].
        downsample (int): Factor to downsample the curve values.
                          E.g., if set to 2, every second point will be kept.
                          Values of 1 or less keep every point.

    Returns:
        curve_diff (dict): keys are tuples (first_curve, second_curve) of column labels,
                           with first_curve appearing before second_curve in df.columns;
                           values are the distance between the two curves.

    Raises:
        TypeError: if method is not callable.
    """
    # Make sure we were handed something usable as a metric
    if not callable(method):
        raise TypeError('method must be a callable distance function')

    # Data cleanup
    # Convert the index to datetime (reset_index returns a copy, the caller's frame is untouched)
    df = df.reset_index(names='date')
    df['date'] = pd.to_datetime(df['date'])

    # Convert the datetime to an integer (number of days since epoch)
    df['date'] = (df['date'] - pd.Timestamp("1970-01-01")).dt.days
    df = df.set_index('date')

    # Downsample the curve values if needed
    if downsample > 1:
        df = df.iloc[::downsample]

    # Build each curve's (x, y) point array once instead of once per pair.
    # NOTE(review): assumes `method` does not modify its inputs -- confirm for custom metrics.
    columns = df.columns
    points = {name: df[name].reset_index().to_numpy() for name in columns}

    # Initialize dictionary to store distances
    curve_diff = dict()

    # Loop through each pair of curves - computationally intensive part: O(n^2)
    for position, first_curve in enumerate(tqdm(columns)):
        first_points = points[first_curve]
        # Only pair with later columns so each unordered pair is computed once
        for second_curve in columns[position + 1:]:
            curve_diff[(first_curve, second_curve)] = method(first_points, points[second_curve])

    # Return dictionary
    return curve_diff

In [31]:
# Distance between every pair of columns in df_pivot, using the library's
# `mse` metric and no downsampling.
fr = distance_between_curves(df_pivot, method=similaritymeasures.mse, downsample=1)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.75it/s]


In [41]:
def generate_groups(curve_diff, k, random_state=None):
    """ Uses k-means to sort curves into groups based on difference scores

    Each curve is described by its row of the similarity matrix (its similarity
    to every curve), and those rows are what k-means clusters.

    Inputs:
        curve_diff (dict): output from distance_between_curves, keys are tuples of curves, values are distance between them
        k (int): desired number of groups
        random_state (int or None): seed passed to KMeans; set it to get the same
                                    grouping on every run. Defaults to None (unseeded).

    Returns:
        labels (array): integer labels [0, k) for each curve, ordered by sorted curve id

    Raises:
        ValueError: if curve_diff contains no pairs.
    """

    # Get all unique elements
    elements = sorted(set(i for pair in curve_diff.keys() for i in pair))
    n = len(elements)
    if n == 0:
        raise ValueError('curve_diff is empty; at least one pair of curves is required')

    # Map each curve id to its row/column so ids need not be exactly 0..n-1
    position = {element: index for index, element in enumerate(elements)}

    # Create the distance matrix
    distance_matrix = np.zeros((n, n))
    for (first, second), score in curve_diff.items():
        i, j = position[first], position[second]
        distance_matrix[i, j] = score
        distance_matrix[j, i] = score  # Because the matrix is symmetric

    # Convert distance matrix to similarity matrix
    similarity_matrix = np.max(distance_matrix) - distance_matrix

    # Fit K-Means clustering
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    labels = kmeans.fit_predict(similarity_matrix)

    return labels

In [36]:
# Split the curves into 3 groups based on their pairwise distances.
labels = generate_groups(fr, 3)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [11]:
# Alias for the pairwise-distance dictionary computed above.
curve_diff = fr

In [12]:
def get_centrality(curve_diff, labels):
    """ Scores how central each curve is within its own group

    A curve's score is the sum of its distances to every other curve that
    shares its label, so a LOWER score means a MORE central curve.

    Inputs:
        curve_diff (dict): output from distance_between_curves, keys are tuples of curves, values are distance between them
        labels (array or list): group label for each curve, ordered by sorted curve id
                                (the order generate_groups returns them in)

    Returns:
        centrality (dict): keys are curve ids in sorted order, values are the summed
                           within-group distances (0.0 for a curve alone in its group)

    Raises:
        ValueError: if labels does not hold exactly one entry per curve.
    """
    # Accept plain lists as well as arrays
    labels = np.asarray(labels)

    # Curve ids in the same order the labels are given in
    curve_ids = sorted(set(c for pair in curve_diff.keys() for c in pair))
    if len(labels) != len(curve_ids):
        raise ValueError(
            f'expected one label per curve ({len(curve_ids)}), got {len(labels)}'
        )

    # Look up each curve's group by id rather than assuming id == position
    group_of = dict(zip(curve_ids, labels.tolist()))

    # dict for storing centrality scores
    centrality = {c: 0.0 for c in curve_ids}

    # Single pass over the pairs: a distance only counts when both curves share a group
    for (first, second), score in curve_diff.items():
        if group_of[first] != group_of[second]:
            continue
        centrality[first] += score
        if second != first:
            centrality[second] += score

    return centrality

In [13]:
# Within-group centrality score for every curve (trailing ';' suppresses the cell output).
centrality = get_centrality(fr, labels);

In [14]:
def get_most_central(labels, centrality, percentile=50):
    """ gets most central curves for each group based on centrality scores

    Within each group the curves are ranked by ascending centrality score
    (lowest score = most central) and the top of that ranking is kept.

    Inputs:
        labels (array or list): group label for each curve; position i is the label of curve i
        centrality (dict): output from get_centrality, keys are curve ids, values are scores
        percentile (number): share of each group to keep, from 0 to 100. The cut-off is
                             inclusive: a group of size s keeps floor(percentile/100 * s) + 1
                             curves (capped at s), so every group keeps at least one curve.

    Returns:
        curves (dict): keys are group labels, values are Series of curve ids,
                       most central first

    Raises:
        ValueError: if percentile is outside [0, 100].
    """
    if not 0 <= percentile <= 100:
        raise ValueError('percentile must be between 0 and 100')

    # Accept plain lists as well as arrays
    labels = np.asarray(labels)
    scores = pd.Series(centrality)

    curves = dict()

    for group in set(labels):
        # positions of this group's curves, which double as their ids
        members = np.where(labels == group)[0]

        # rank the group's curves from most to least central; the curve ids
        # end up in the 'index' column after reset_index
        ranked = scores.loc[members].sort_values().reset_index(drop=False)

        # inclusive cut-off at the requested share of the group
        n_keep = int(np.floor((percentile / 100) * len(members))) + 1
        curves[group] = ranked['index'].iloc[:n_keep]

    return curves

In [15]:
# Smoke-test the helper; the result is discarded and ';' suppresses the cell output.
get_most_central(labels, centrality);

In [16]:
def normalize_column(column, flip=True):
    """ Min-max scales a column onto [0, 1]

    Inputs:
        column (series or array): numeric values exposing .min() and .max()
        flip (bool): if True (default) the scale is inverted, so the smallest
                     input maps to 1 and the largest to 0

    Returns:
        the rescaled values, same shape as the input. Note that a column whose
        values are all equal has zero range, so the division is by zero.
    """
    lowest = column.min()
    spread = column.max() - lowest
    scaled = (column - lowest) / spread
    return 1 - scaled if flip else scaled

In [17]:
def color_to_rgba(color_name, value):
    """ Fades a named color toward light gray and returns it as an rgba() string

    Inputs:
        color_name: any color matplotlib's to_rgb understands
        value (float): blend weight, also used verbatim as the alpha component.
                       1 gives the pure color; lower values mix in more gray.

    Returns:
        (str): 'rgba(r, g, b, value)' with r, g, b truncated to integers
    """
    base_rgb = mcolors.to_rgb(color_name)

    # Gray contribution grows as value shrinks; 0.8 caps how light the gray gets
    gray = (1 - value) * 0.8

    # Blend every channel with the gray, then scale to 0-255 (truncating, not rounding)
    red, green, blue = (int((channel * value + gray) * 255) for channel in base_rgb)

    return f'rgba({red}, {green}, {blue}, {value})'

In [18]:
# Quick check of the helper: cyan blended at weight 0.4.
color_to_rgba('cyan', .4)

'rgba(122, 224, 224, 0.4)'

In [19]:
# One row per curve: flipped, min-max scaled centrality (1 = lowest raw score,
# i.e. most central) alongside the group the curve was assigned to.
scaled_centrality = normalize_column(pd.Series(centrality))
df_info = scaled_centrality.to_frame(name='centrality')
df_info['group'] = labels
df_info

Unnamed: 0,centrality,group
0,0.947589,2
1,1.000000,1
2,0.644595,0
3,0.685069,0
4,0.802057,0
...,...,...
95,1.000000,1
96,1.000000,1
97,1.000000,1
98,1.000000,1


In [20]:
# Palette for the groups; cmap maps group number -> color name by list position
colors = ['red', 'blue', 'green', 'cyan', 'yellow', 'gray']
cmap = dict(enumerate(colors))

In [22]:
fig = go.Figure()

# Groups that already own a legend entry
legend_groups = set()

# Add curves in order of ascending scaled centrality, so the most central
# curves are added to the figure last
for curve in df_info.sort_values('centrality', ascending=True).index:
    c_gr = df_info.at[curve, 'group']
    c_cen = df_info.at[curve, 'centrality']

    # Only the first curve seen for a group gets a legend entry
    show_legend = c_gr not in legend_groups
    legend_groups.add(c_gr)

    trace = go.Scatter(
        x=df_pivot.index,
        y=df_pivot[curve],
        mode='lines',
        # color fades toward gray and the line gets thinner as centrality drops
        marker=dict(color=color_to_rgba(cmap[c_gr], c_cen)),
        line=dict(width=(c_cen/2)**2),
        # name is left blank for every curve after the group's first
        name=f'Group {c_gr}' if show_legend else "",
        showlegend=show_legend,
        legendgroup=str(c_gr),  # ties all of a group's curves to one legend entry
    )
    fig.add_trace(trace)

fig.show()
