In [1]:
import itertools
import os
import time
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as la
import networkx as nx
from scipy.linalg import logm, expm
from scipy.sparse.linalg import eigsh, svds
from scipy.stats import median_abs_deviation
from pandas.plotting._matplotlib.style import get_standard_colors
colors = get_standard_colors(num_colors=13)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import ray
ray.init()

2024-01-16 14:00:16,213	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.11.5
Ray version:,2.9.0


## NSM algorithm

In [2]:
def cusum(x):
    n, T = x.shape
    leftsums = np.cumsum(x, axis=1)
    rightsums = leftsums[:, -1].reshape(-1, 1) - leftsums
    leftmeans = leftsums[:, :-1] / np.arange(1, T, 1).reshape(1, -1)
    rightmeans = rightsums[:, :-1] / np.arange(T-1, 0, -1).reshape(1, -1)
    result = (rightmeans - leftmeans) * np.sqrt(np.arange(1, T, 1) * np.arange(T-1, 0, -1) / T).reshape(1, -1) 
    return result

In [3]:
def soft_thresholding(x, lam):
    return np.sign(x) * np.maximum(0, np.abs(x)-lam)

def sparse_svd(Z, lam):
    Mhat = soft_thresholding(Z, lam)
    try:
        U = np.linalg.svd(Mhat)[0]
        proj = U[:, 0]
    except:
        proj = np.random.randn(Z.shape[0])
        proj = proj / np.linalg.norm(proj)
        
    return proj

def sin_loss(U, V):
    U = U.reshape(-1, 1)
    V = V.reshape(-1, 1)
#     return np.sqrt(1 - (U.T@V)**2)
    return np.sqrt(np.sum((U @ U.T - V @ V.T)**2) / 2)

In [4]:
def NCM(X, lam_scale=1, alpha=0.01, b=None):
    n, T = X.shape
    if b == None:
        b = T//4
#         b = min(T//4, int((2*np.log(T)+np.log(2/alpha))/n*8))
    vhat = np.zeros_like(X)
    for t in range(2, T):
        lam = np.sqrt(2*(np.log(t**2*n) - np.log(alpha))) * lam_scale
        Y = X[:, :t]
        Y = Y / median_abs_deviation(Y[:, -1] - Y[:, 0]) * np.sqrt(2)
        Y_cusum = cusum(Y)
        if abs(Y_cusum).max() > lam:
            vhat[:, t] = sparse_svd(Y_cusum, lam)
        else:
            v = np.random.randn(n)
            v = v / np.linalg.norm(v)
            vhat[:, t] = v

    A = np.zeros(T)
    for t in range(3, T):
        A[t] = sin_loss(vhat[:, t-1], vhat[:, t])

    cp = -1
    change = False
    for t in range(T//2, T):
        if np.all(A[t-b:t] < 0.5):
            cp = t
            change = True
            break

    return change, cp

## functions to generate dynamic SBM based on given changepoints 

In [5]:
def create_SBM_parameters(n=50, p11=0.25, p12=0.05, change_type=1, minimum_average_node_per_block=5):
    if change_type == 1:
        num_blocks = 2 ** np.arange(np.log(n // minimum_average_node_per_block) / np.log(2))
        num_blocks = num_blocks.astype(int)
        size_prob_parameters = []
        for n_block in num_blocks:
            sizes = [n // n_block] * (n_block - 1)
            sizes.append(n - sum(sizes))
            p = (p11 - p12) * np.eye(n_block) + p12 * np.ones((n_block, n_block))
            size_prob_parameters.append([sizes, p])
    else:
        p11 = p11 * 50 / n
        p12 = p12 * (50 / n) ** (1/2)
        n_block = 4
        size_prob_parameters = []
        sizes = [n // n_block] * (n_block - 1)
        sizes.append(n - sum(sizes))
        for q11 in np.linspace(p11*0.5, p11*2, 3):
            for q12 in np.linspace(p12*0.5, p12*2, 3):
                p = (q11 - q12) * np.eye(n_block) + q12 * np.ones((n_block, n_block))
                size_prob_parameters.append([sizes, p])
                
    return size_prob_parameters

In [6]:
def generate_snapshot_on_consecutive_networks(G_old, G_new, alpha=0):
    if alpha == 0:
        return G_old
    if alpha == 1:
        return G_new
    num_nodes = G_old.number_of_nodes()
    G_interp = nx.from_edgelist([edge for u, edge in zip(np.random.rand(G_old.number_of_edges()), nx.to_edgelist(G_old)) if u < alpha] + \
                            [edge for u, edge in zip(np.random.rand(G_new.number_of_edges()), nx.to_edgelist(G_new)) if u > alpha])
    connected = nx.is_connected(G_interp)
    while G_interp.number_of_nodes() < num_nodes or connected == False:
        G_interp = nx.from_edgelist([edge for u, edge in zip(np.random.rand(G_old.number_of_edges()), nx.to_edgelist(G_old)) if u < alpha] + \
                            [edge for u, edge in zip(np.random.rand(G_new.number_of_edges()), nx.to_edgelist(G_new)) if u > alpha])
        connected = nx.is_connected(G_interp)
    return G_interp

In [7]:
def generate_dynamic_SBM(T=100, cps=[], n=50, p11=0.25, p12=0.05, alpha=0.001, change_type=1, directed=False):
    cp_counter = 0
    G_prev = None
    G_curr = None
    G_list = []
    size_prob_parameters = create_SBM_parameters(n, p11, p12, change_type=change_type)
    size_prob_index_prev = None
    size_prob_index_curr = np.random.choice(len(size_prob_parameters))
    for t in range(T):
        cp = False
        connected = False
        if cp_counter < len(cps) and t == cps[cp_counter]:
            cp = True
            if size_prob_index_prev is None:
                size_prob_index_prev = size_prob_index_curr
            while size_prob_index_curr == size_prob_index_prev:
                size_prob_index_curr = np.random.choice(len(size_prob_parameters))
            cp_counter += 1
        sizes, p = size_prob_parameters[size_prob_index_curr]
        while connected == False:
            G_curr = nx.stochastic_block_model(sizes, p, directed=directed)
            connected = nx.is_connected(G_curr) # this is to ensure that each graph snapshot is connected
        if G_prev is not None and cp == False:
            G_curr = generate_snapshot_on_consecutive_networks(G_prev, G_curr, alpha)
        G_list.append(G_curr)
        G_prev = G_curr
    
    return G_list, cps

In [8]:
def simulation(change=False, lam_scale=1, alpha=0.01, T=100, change_type=1):
    T0 = T//2 # the earliest time of a possible change point
    if change == False:
        cps = []
    else:
        cps = [np.random.randint(2*T0//3+T//3, T0//3+2*T//3)]
        
    n = 50
    directed = False
    G_list, cps = generate_dynamic_SBM(T, cps, n, directed=directed, change_type=change_type)
    degree_mat = np.stack([np.array([val for (node, val) in g.degree()]) for g in G_list]).T # n*T
    exist, loc = NCM(degree_mat, lam_scale=lam_scale, alpha=alpha)
    
    return exist, loc, cps[0] if len(cps)==1 else float('inf')

In [9]:
ALPHA = [0.01, 0.05, 0.1]
LAM = [1]
TIME = [100, 200]
num_runs = 1000

sizes = {}
powers = {}
accuracys = {}
delays = {}

for alpha in ALPHA:
    for lam_scale in LAM:
        for T in TIME:
            print(f"Experiment with alpha={alpha}, lam_scale={lam_scale}, T={T}")
            change = False
            exist_array = np.zeros(num_runs)
            loc_detect_array = np.zeros(num_runs)
            loc_true_array = np.zeros(num_runs)
            for run_id in range(num_runs):
                exist, loc_detect, loc_true = simulation(change=change, alpha=alpha, lam_scale=lam_scale, T=T, change_type=1)
                exist_array[run_id] = exist
                loc_detect_array[run_id] = loc_detect
                loc_true_array[run_id] = loc_true    
            print(f"size: {exist_array.mean()}, detected changes: {loc_detect_array[exist_array==True]}")
            sizes[(alpha, lam_scale, T)] = exist_array.mean(), exist_array.std()

            change = True
            exist_array = np.zeros(num_runs)
            loc_detect_array = np.zeros(num_runs)
            loc_true_array = np.zeros(num_runs)
            for run_id in range(num_runs):
                exist, loc_detect, loc_true = simulation(change=change, alpha=alpha, lam_scale=lam_scale, T=T)
                exist_array[run_id] = exist
                loc_detect_array[run_id] = loc_detect
                loc_true_array[run_id] = loc_true

            print(f"power: {exist_array.mean()}, delay: {np.where(loc_detect_array >= loc_true_array, loc_detect_array - loc_true_array, 0)[loc_detect_array >= loc_true_array].mean()}")                        
            powers[(alpha, lam_scale, T)] = exist_array.mean(), exist_array.std()
            accuracys[(alpha, lam_scale, T)] = (loc_detect_array[exist_array==True]==loc_true_array[exist_array==True]).mean(), (loc_detect_array[exist_array==True]==loc_true_array[exist_array==True]).std()
            delays[(alpha, lam_scale, T)] = np.where(loc_detect_array >= loc_true_array, loc_detect_array - loc_true_array, 0)[loc_detect_array >= loc_true_array].mean(), np.where(loc_detect_array >= loc_true_array, loc_detect_array - loc_true_array, 0)[loc_detect_array >= loc_true_array].std()

Experiment with alpha=0.01, lam_scale=1, T=100
size: 0.319, detected changes: [93. 85. 50. 75. 69. 56. 71. 80. 66. 53. 50. 50. 80. 67. 50. 91. 54. 61.
 84. 69. 56. 88. 50. 64. 50. 68. 75. 55. 93. 98. 50. 76. 75. 90. 78. 55.
 50. 74. 79. 50. 62. 70. 69. 69. 65. 70. 92. 55. 50. 55. 56. 50. 74. 65.
 88. 76. 54. 97. 80. 81. 50. 77. 60. 82. 50. 55. 50. 50. 98. 50. 50. 97.
 56. 88. 50. 80. 51. 81. 50. 56. 50. 71. 51. 90. 63. 74. 94. 58. 79. 50.
 50. 83. 50. 94. 87. 73. 51. 50. 50. 78. 50. 74. 66. 92. 74. 57. 70. 63.
 50. 72. 50. 75. 72. 71. 50. 50. 50. 84. 74. 60. 85. 89. 99. 87. 55. 85.
 64. 89. 70. 81. 55. 50. 63. 84. 63. 71. 55. 60. 50. 80. 70. 62. 70. 64.
 75. 75. 84. 90. 76. 99. 65. 55. 50. 72. 60. 89. 83. 67. 73. 50. 64. 65.
 51. 50. 50. 76. 82. 53. 94. 92. 93. 72. 50. 93. 64. 52. 91. 77. 92. 61.
 50. 68. 77. 81. 50. 50. 77. 50. 76. 55. 97. 65. 97. 53. 87. 72. 82. 87.
 91. 61. 79. 59. 50. 74. 50. 54. 57. 86. 89. 62. 74. 76. 77. 62. 95. 98.
 67. 86. 62. 98. 95. 59. 50. 92. 60. 50. 78. 6