In [13]:
from google.cloud import storage
import pandas as pd
import io
import os
import gzip
import plotly.express as px
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from tqdm import tqdm
import numpy as np
from multiprocessing import Pool, cpu_count

In [3]:
# Service account associated with the key file below. It is not read by any of
# the visible cells; kept for reference.
service_account_id = 'elijahsandler@net-data-viz-handbook.iam.gserviceaccount.com'

# Point the Google Cloud client at the service-account key file.
# setdefault() keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already set in
# the environment, so the notebook is not tied to this author-specific Windows
# path; the hard-coded path is only the fallback.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    'C:\\Users\\elija\\Documents\\24f-coop\\net-data-viz-handbook-fe2c5531555d.json',
)

In [4]:
def _read_gzipped_csv(gcs_blob):
    """Download a gzip-compressed CSV blob and parse it into a DataFrame.

    Inputs:
        gcs_blob: blob handle as returned by ``bucket.blob(name)``; only its
                  ``download_as_bytes()`` method is used

    Returns:
        dataframe parsed by ``pd.read_csv`` from the decompressed bytes
    """
    compressed = gcs_blob.download_as_bytes()
    with gzip.GzipFile(fileobj=io.BytesIO(compressed)) as gz_stream:
        return pd.read_csv(gz_stream)


# Initialize a GCS client
client = storage.Client()

# Bucket name and the specific .csv.gz files to load
bucket_name = 'gs_net-data-viz-handbook'
file_name = 'sample/sample_SIR_0_countries_incidence_daily.csv.gz'  # Update this to the specific file name
meta_file = 'sample/sample_SIR_0_meta.csv.gz'

# Get the bucket and one blob handle per file
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_name)
metablob = bucket.blob(meta_file)

# Both files go through the same download -> gunzip -> read_csv path
df = _read_gzipped_csv(blob)
df_meta = _read_gzipped_csv(metablob)

In [5]:
# Sum every remaining column (everything except 't') within each
# (date, country_id, run_id) group.
df_sum = df.drop(columns=['t']).groupby(['date', 'country_id', 'run_id']).sum()

# Keep a single country: select it on the second index level, then drop that
# level so only (date, run_id) remains.
country = 0
country_rows = df_sum.loc[pd.IndexSlice[:, country], :].droplevel('country_id')

# Collapse all value columns into one total per (date, run_id); after
# reset_index() the unnamed total ends up in the column labelled 0.
df_country = country_rows.T.sum().reset_index()

# Wide layout: one row per date, one column per run; missing cells become 0.
df_pivot = (
    df_country
    .pivot(index='date', columns='run_id', values=0)
    .fillna(0)
)

# Shift the run_id labels down by one so they start at zero.
df_pivot = df_pivot.set_axis(df_pivot.columns - 1, axis=1)
df_pivot

run_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-02-17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2009-02-21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-02-13,54,0,11,8,12,0,31,33,17,19,...,0,29,99,25,14,0,0,0,0,25
2010-02-14,47,0,8,14,10,0,18,29,16,10,...,0,33,70,9,17,0,0,0,0,11
2010-02-15,49,0,9,17,24,0,27,29,23,14,...,0,33,77,15,14,0,0,0,0,21
2010-02-16,56,0,4,22,13,0,16,31,10,11,...,0,28,60,21,21,0,0,0,0,16


In [11]:
def distance_between_curves(df, method='AUC'):
    """ gets the distance between every pair of curves using various algorithms

    Note: curves must have the same domain (share one index) to use AUC

    Inputs:
        df (dataframe or dictionary): dataframe where columns are curve values
            (a dictionary is converted with pd.DataFrame). For 'dtw' the index
            must be convertible with pd.to_datetime.
        method (str): 'auc' for Area Under the Curve difference or
                      'dtw' for Dynamic Time Warping (case-insensitive)

    Returns:
        curve_diff (dict): keys are (first_curve, second_curve) tuples of column
            labels, one per unordered pair; values are the distance between them

    Raises:
        AssertionError: if method is not one of the valid methods
    """
    # normalise case so the documented default 'AUC' passes the check below
    method = method.lower()

    # assert we are using a valid metric
    valid_methods = ['auc', 'dtw']
    assert method in valid_methods, f"must select a valid method: {valid_methods}"

    # accept the dictionary input promised by the docstring
    df = pd.DataFrame(df)

    # initialize dictionary to store distances
    curve_diff = dict()

    # data cleanup for things that use x coordinates
    if method != 'auc':
        # express the index as integer days since the epoch; set_axis returns a
        # new frame, so the caller's dataframe is left untouched
        days = (pd.to_datetime(df.index) - pd.Timestamp("1970-01-01")).days
        df = df.set_axis(pd.Index(days, name='date'), axis=0)

    # loop through each pair of curves - computationally intensive part: O(n^2)
    # pair by column position (not by label) and read from the df argument, so
    # this works for any column labels and any dataframe passed in
    for position, first_curve in enumerate(tqdm(df.columns)):
        for second_curve in df.columns[position + 1:]:

            # calculate total area between curves
            if method == 'auc':
                curve_diff[(first_curve, second_curve)] = \
                    (df[first_curve] - df[second_curve]).abs().sum()

            # calculate dtw distance over (day, value) points
            elif method == 'dtw':
                # fastdtw returns (distance, path); keep only the distance
                distance, _path = fastdtw(
                    df[first_curve].reset_index().to_numpy(),
                    df[second_curve].reset_index().to_numpy(),
                )
                curve_diff[(first_curve, second_curve)] = distance

    # return dictionary
    return curve_diff

In [None]:
import pandas as pd
from tqdm import tqdm
from fastdtw import fastdtw
from multiprocessing import Pool, cpu_count

def calculate_distance(args):
    """ Computes the distance between one pair of curves.

    Takes a single tuple so it can be mapped over a list of tasks
    (e.g. with Pool.imap).

    Inputs:
        args (tuple): (df, first_curve, second_curve, method) where df is a
            dataframe containing both curve columns, first_curve and
            second_curve are column labels, and method is 'auc' or 'dtw'.

    Returns:
        tuple: ((first_curve, second_curve), distance)

    Raises:
        ValueError: if method is neither 'auc' nor 'dtw'.
    """
    df, first_curve, second_curve, method = args
    pair = (first_curve, second_curve)

    if method == 'auc':
        # Sum of absolute point-wise differences between the two curves
        return pair, (df[first_curve] - df[second_curve]).abs().sum()

    if method == 'dtw':
        # fastdtw returns (distance, path); only the distance is wanted
        distance, _path = fastdtw(
            df[first_curve].reset_index().to_numpy(),
            df[second_curve].reset_index().to_numpy(),
        )
        return pair, distance

    # Fail loudly instead of silently returning None for an unknown method
    raise ValueError(f"Must select a valid method: ['auc', 'dtw'] (got {method!r})")

def distance_between_curves(df, method='auc'):
    """ Gets the distance between every pair of curves, computed in parallel.

    Note: must have same domain to use AUC.

    Inputs:
        df (dataframe): dataframe where columns are curve values. For 'dtw'
            the index must be convertible with pd.to_datetime.
        method (str): 'auc' for Area Under the Curve difference or
                      'dtw' for Dynamic Time Warping (case-insensitive).

    Returns:
        curve_diff (dict): keys are (first_curve, second_curve) tuples of column
            labels, one per unordered pair; values are whatever
            calculate_distance returns for that pair.

    Raises:
        AssertionError: if method is not one of the valid methods.
    """
    method = method.lower()

    # Assert we are using a valid metric
    valid_methods = ['auc', 'dtw']
    assert method in valid_methods, f"Must select a valid method: {valid_methods}"

    # Data cleanup for things that use x coordinates
    if method != 'auc':
        # Express the index as integer days since the epoch; set_axis returns a
        # new frame, so the caller's dataframe is left untouched
        days = (pd.to_datetime(df.index) - pd.Timestamp("1970-01-01")).days
        df = df.set_axis(pd.Index(days, name='date'), axis=0)

    # Prepare one task per unordered pair of curves. Each task carries only the
    # two columns it needs, so the full dataframe is not pickled for every pair.
    curve_pairs = [
        (df[[first_curve, second_curve]], first_curve, second_curve, method)
        for i, first_curve in enumerate(df.columns)
        for second_curve in df.columns[i + 1:]
    ]

    # Nothing to compare (fewer than two curves): skip starting worker processes
    if not curve_pairs:
        return dict()

    # Use multiprocessing to compute distances.
    # NOTE(review): with the 'spawn' start method (the Windows default) worker
    # processes must be able to import calculate_distance; a function defined in
    # a notebook cell may not be importable there - confirm, or move
    # calculate_distance into a .py module.
    with Pool(cpu_count()) as pool:
        results = list(tqdm(pool.imap(calculate_distance, curve_pairs), total=len(curve_pairs)))

    # Collect the (key, value) results into the dictionary
    return dict(results)


In [None]:
# Compute the pairwise Dynamic Time Warping distances between the columns of
# df_pivot. The call is the cell's last expression, so the returned dict is
# what the cell displays.
distance_between_curves(df_pivot, method='dtw')