# Modified Definition Test
This notebook tests the modified definition of the disruptivity that I defined using chains of events. I am worried this definition will perform poorly because of the variance of the Poisson distribution, but who knows! The new method is outlined in this blog post:

https://cfsenergy.atlassian.net/wiki/spaces/~6318cff19794410874c7744f/blog/2023/05/05/2788819001/Lecture+2+fr+Froude+3+Probabilities+3+7+05+2023

We start with imports.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy.optimize import minimize, LinearConstraint

# Move into the source directory for this notebook to work properly.
# The directory the kernel started in is remembered the first time this cell
# runs, and the target is resolved against it, so re-running the cell (e.g. to
# trigger the module reloads at the end of the cell) does not apply the
# relative '../src/' hop a second time.
import os
import importlib
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../src/'))

# Import whatever we need
import disruptivity as dis
import indexing as ind
import vis.disruptivity_vis as dis_vis
import vis.probability_vis as prob_vis
from vis.plot_helpers import plot_subplot as plot
import data_loader

# Import tokamak configurations
from tokamaks.cmod import CONFIG as CMOD
from tokamaks.d3d import CONFIG as D3D

# Reload the local modules (same order as before) so that re-running this
# cell re-executes their source.
for _module in (ind, dis, dis_vis):
    importlib.reload(_module)

# Short alias for the database loader used in the next cell.
load_disruptions_mat = data_loader.load_disruptions_mat

Loading is the same as before; we use the premade functions for the disruptivity computations.

In [None]:
# Load the C-Mod disruption warning database together with its index sets.
cmod_df, cmod_indices = load_disruptions_mat('../data/CMod_disruption_warning_db.mat')

# Count the distinct shots overall, and within the non-disrupted and
# disrupted index sets.
n_shots = len(np.unique(cmod_df.shot))
n_shots_no_disrupt = len(np.unique(cmod_df.shot[cmod_indices['indices_no_disrupt']]))
n_shots_disrupt = len(np.unique(cmod_df.shot[cmod_indices['indices_disrupt']]))

# Sanity check: the two classes must account for every shot.
assert n_shots == n_shots_disrupt+n_shots_no_disrupt, \
    'Number of disrupts plus number of non disruptions does not equal the total shot number'
print(f'Total Shot Number: {n_shots}, Non-Disrupted Shots: {n_shots_no_disrupt}, Disrupted Shots: {n_shots_disrupt}')

'''
So my goal with this block of code is to find all the portions of flat top disrupted shots 
that are in flat tops. Should be simple enough.
'''

# Compute the index sets. The return values are not used; presumably these
# calls add the 'indices_n_*' entries to cmod_indices in place (TODO confirm).
ind.get_indices_disruptivity(CMOD, cmod_df, cmod_indices)
ind.get_indices_detectable_disruptivity(CMOD, cmod_df, cmod_indices)


# Pull the four index sets out of the dictionary under short names.
(
    indices_n_detectable_disrupt,
    indices_n_detectable_total,
    indices_n_disrupt,
    indices_n_total,
) = (
    cmod_indices['indices_n_detectable_disrupt'],
    cmod_indices['indices_n_detectable_total'],
    cmod_indices['indices_n_disrupt'],
    cmod_indices['indices_n_total'],
)

# Entry dictionary: one key per histogram axis, giving the value range that is
# binned and the (LaTeX) label used for that axis.
entry_dict_1D = {
    'kappa':{
        'range':[0.8, 2.0],
        # Raw string: "\k" is not a valid escape sequence, so a plain string
        # literal triggers an invalid-escape warning. The value is unchanged.
        'axis_name': r"$\kappa$",
    },
}

Now, we can reuse the histogram binning code for variable timesteps that returns the data indices of data points for each bin. Since this new method essentially tries to compute the dt of subsequent data points, it is mechanically the same as the dt calculation for variable timestep as well! 

In [None]:
# Step 1: Build the histogram that carries, for every data entry, the bin it
# falls into. Separate numerator and denominator histograms are not needed
# here; a single histogram over all the data is enough.
hist = dis.indices_to_histogram(
    cmod_df,
    entry_dict_1D,
    indices_n_total,
    25,
)

In [None]:
# Create a modified variable dt function that stores the sequence of dts and marks them as disrupted or not.
def compute_dt_bin(
    dataframe: pd.core.frame.DataFrame,
    denom_dd: scipy.stats._binned_statistic.BinnedStatisticddResult,
    denom_indices: np.ndarray,
    tau = 0,
    window = 1,
    nbins=25,
) -> pd.core.frame.DataFrame:
    """Splits the binned data into contiguous sequences and measures each one.

    A sequence is a run of samples that have consecutive dataframe indices,
    belong to the same shot and fall into the same histogram bin. For every
    sequence the accumulated time and a disruption flag are recorded.

    Args:
        dataframe (pd.core.frame.DataFrame): The tokamak dataframe. The columns
            "shot", "time" and "time_until_disrupt" are read, using the values
            of denom_indices as row labels. Row labels are assumed to be
            consecutive integers so that ``index - 1`` is the previous sample
            (TODO confirm against the loader).
        denom_dd (scipy.stats._binned_statistic.BinnedStatisticddResult): The
            histogram of the data. Only ``binnumber`` is read; it may have
            shape (N,) for a 1D histogram or (D, N) for D dimensions.
        denom_indices (np.ndarray): Dataframe index of each histogram sample,
            in the same order as ``denom_dd.binnumber``.
        tau (float, optional): Offset, in ms, that ``time_until_disrupt``
            (converted to ms) is compared against. Defaults to 0.
        window (float, optional): Half-width, in ms, of the interval around
            tau inside which a sample counts as disrupted. Defaults to 1.
        nbins (int, optional): Number of histogram bins. Defaults to 25.

    Returns:
        pd.DataFrame: One row per sequence with the columns
            'shot' (shot of the sequence),
            'ix' (the bin, as the tuple produced by ``np.ix_``),
            'dt' (accumulated time, in the units of the "time" column) and
            'is_disrupt' (True if any sample of the sequence lies within
            ``window`` of ``tau``).
    """
    n_points = denom_dd.binnumber.shape[-1]
    if n_points == 0:
        # Nothing to split; return an empty frame with the usual columns.
        return pd.DataFrame.from_dict(
            {'shot': [], 'ix': [], 'dt': [], 'is_disrupt': []}
        )

    # Normalise the bin numbers to shape (D, N, 1): atleast_2d turns the 1D
    # (N,) layout into (1, N), and the trailing axis prepares each per-sample
    # entry for np.ix_().
    binnumber = np.atleast_2d(denom_dd.binnumber)[:, :, np.newaxis]

    # Column lookups are loop invariant.
    shot_column = dataframe["shot"]
    time_column = dataframe["time"]
    time_until_disrupt = dataframe["time_until_disrupt"]

    # Lists that contain the for loop outputs.
    shot_list = []
    ix_list = []
    dt_list = []
    disrupt_list = []

    # State of the sequence currently being accumulated. last_entry stays None
    # until the first in-range sample, so no record is emitted for a sequence
    # that never received a sample.
    last_entry = None
    last_shot = None
    last_dataframe_index = None
    dt_integrator = 0
    sequence_is_disrupt = False

    for i in range(n_points):
        # Bin 0 and bin nbins+1 are the padded bins binned_statistic_dd uses
        # for samples outside the boundaries, hence the shift by one.
        entry = binnumber[:, i] - 1

        # Check if the data is out of bounds
        if (entry < 0).any() or (entry > nbins - 1).any():
            continue

        # Get the dataframe entry for this histogram entry
        dataframe_index = denom_indices[i]
        shot = shot_column[dataframe_index]

        # A sample is disrupted when it lies within `window` ms of `tau`.
        time_until_disrupt_ms = time_until_disrupt[dataframe_index] * 1000
        is_disrupt = bool(np.abs(time_until_disrupt_ms - tau) <= window)

        if last_entry is None:
            # First in-range sample opens the first sequence.
            last_entry = entry
            last_shot = shot
        elif (
            (entry != last_entry).any()  # any coordinate differs -> other bin
            or dataframe_index != last_dataframe_index + 1
            or shot != last_shot
        ):
            # The previous sequence has ended: store it with ITS OWN flag,
            # then start a fresh sequence at the current sample.
            shot_list.append(last_shot)
            ix_list.append(np.ix_(*last_entry))
            dt_list.append(dt_integrator)
            disrupt_list.append(sequence_is_disrupt)

            last_entry = entry
            last_shot = shot
            dt_integrator = 0
            sequence_is_disrupt = False

        last_dataframe_index = dataframe_index
        # The current sample belongs to the (possibly new) current sequence.
        sequence_is_disrupt = sequence_is_disrupt or is_disrupt

        # Integrate the time since the previous sample, unless this is the
        # first frame of a shot (or the first row of the dataframe), where
        # there is no previous sample to measure from.
        if dataframe_index != 0 and shot == shot_column[dataframe_index - 1]:
            dt_integrator += (
                time_column[dataframe_index]
                - time_column[dataframe_index - 1]
            )

    # Store the sequence that was still open when the data ran out.
    if last_entry is not None:
        shot_list.append(last_shot)
        ix_list.append(np.ix_(*last_entry))
        dt_list.append(dt_integrator)
        disrupt_list.append(sequence_is_disrupt)

    # After the loop, convert the lists into a pandas dataframe
    pd_dict = {
        'shot':shot_list,
        'ix':ix_list,
        'dt':dt_list,
        'is_disrupt':disrupt_list,
    }

    return pd.DataFrame.from_dict(pd_dict)

def entry_list_to_arrays(entry_df, nbins=25):
    """Groups the sequence durations by histogram bin and disruption flag.

    Only 1D histograms are supported: a row is assigned to bin ``k`` when its
    'ix' value is a one-element tuple holding the single index ``k``. Rows
    with any other 'ix' layout are left out of every bin.

    Args:
        entry_df (pd.DataFrame): Table with the columns 'ix', 'dt' and
            'is_disrupt', one row per sequence.
        nbins (int, optional): Number of histogram bins. Defaults to 25.

    Returns:
        dis_dt_list (list): For each bin, the array of 'dt' values of the
            rows flagged as disrupted.
        non_dis_dt_list (list): For each bin, the array of 'dt' values of
            the rows not flagged as disrupted.
    """

    def _bin_index(ix_entry):
        # Decode a 1D np.ix_-style entry such as (array([k]),) into k.
        if len(ix_entry) != 1:
            return -1
        return int(np.asarray(ix_entry[0]).ravel()[0])

    # Columns are read with brackets: 'ix' and 'dt' are also the names of
    # pandas attributes/accessors, so attribute access is fragile.
    bin_index = np.array([_bin_index(ix) for ix in entry_df['ix']], dtype=int)
    dt_values = entry_df['dt'].to_numpy()
    disrupted = entry_df['is_disrupt'].to_numpy(dtype=bool)

    dis_dt_list = [
        dt_values[(bin_index == i) & disrupted] for i in range(nbins)
    ]
    non_dis_dt_list = [
        dt_values[(bin_index == i) & ~disrupted] for i in range(nbins)
    ]

    return dis_dt_list, non_dis_dt_list

In [None]:
# Split the binned data into contiguous same-bin sequences (default tau/window).
entry_list = compute_dt_bin(
    dataframe=cmod_df,
    denom_dd=hist,
    denom_indices=indices_n_total,
    nbins=25,
)

In [None]:
# Regroup the sequence table into per-bin arrays of durations.
dis_dt_list, non_dis_dt_list = entry_list_to_arrays(entry_df=entry_list, nbins=25)

## Optimization Loss Functions
Here we investigate the optimization and the corresponding loss functions on synthetic data before we pipe the above data structure into the loss. We do this to make sure they behave correctly.

In [None]:
# Now we define the function to minimize
# This is the log likelihood defined in the document.
def z_disruptivity(d: float, dt: np.ndarray):
    """Evaluates the normalization term (1/d) * (1 - exp(-dt * d)).

    Args:
        d (float): The disruptivity.
        dt (np.ndarray): Sequence duration(s); a scalar works as well.

    Returns:
        The normalization term, with the same shape as dt.
    """
    decay = np.exp(-dt*d)
    return (1/d)*(1-decay)
    
def p_data(d:float, dt:np.ndarray, is_disrupt:bool):
    """Evaluates the model term for one kind of data point.

    Args:
        d (float): The disruptivity.
        dt (np.ndarray): Sequence duration(s); a scalar works as well.
        is_disrupt (bool): Selects which of the two terms is returned.

    Returns:
        exp(-dt*d) / z for non-disrupted data and one minus that value for
        disrupted data, where z is z_disruptivity(d, dt).
    """
    non_disrupt_term = np.exp(-dt*d)/z_disruptivity(d, dt)
    return (1 - non_disrupt_term) if is_disrupt else non_disrupt_term

def disruptivity_neg_log_likelihood(d:float, dis_dt:np.ndarray, non_dis_dt:np.ndarray,)->float:
    """Negative log likelihood of the sequence durations for disruptivity d.

    Args:
        d (float): The disruptivity; must be greater than 0.
        dis_dt (np.ndarray): Durations of the disrupted sequences.
        non_dis_dt (np.ndarray): Durations of the non-disrupted sequences.

    Returns:
        float: Minus the sum, over all sequences, of the log of the term that
            p_data evaluates for that sequence.

    Raises:
        AssertionError: If d is not greater than 0.
    """
    # Safety?
    assert d>0, "Invalid disruptivity, must be greater than 0."

    # Disrupted sequences: log(z - exp(-dt*d)) - log(z) = log(1 - exp(-dt*d)/z)
    dis_z = z_disruptivity(d, dis_dt)
    dis_log_terms = np.log(dis_z-np.exp(-dis_dt*d))-np.log(dis_z)

    # Non-disrupted sequences: -dt*d - log(z) = log(exp(-dt*d)/z)
    non_dis_z = z_disruptivity(d, non_dis_dt)
    non_dis_log_terms = (-non_dis_dt*d)-np.log(non_dis_z)

    return - np.sum(dis_log_terms) - np.sum(non_dis_log_terms)
    

In [None]:
# Synthetic check: ten disrupted and ten non-disrupted sequences, all with
# dt = 1, evaluated over a sweep of candidate disruptivities.
d_space = np.linspace(0.01,5,100)
dis_dt = np.ones(10, dtype=int)
non_dis_dt = np.ones(10, dtype=int)

# Loss of the whole synthetic dataset, plus the two single-point terms.
p_list = [
    disruptivity_neg_log_likelihood(d_value, dis_dt, non_dis_dt)
    for d_value in d_space
]
p_dis_list = [p_data(d_value, dis_dt[0], True) for d_value in d_space]
p_non_dis_list = [p_data(d_value, non_dis_dt[0], False) for d_value in d_space]

for curve, curve_label in (
    (p_dis_list, 'Disrupted Data'),
    (p_non_dis_list, 'Non Disrupted Data'),
):
    plt.plot(d_space, curve, label=curve_label)
plt.grid()
plt.legend()
plt.ylabel("Probability")
plt.xlabel("$d$ [1/s]")

In [None]:
# Plot the negative log likelihood of the synthetic dataset against the
# candidate disruptivity. The y axis is labelled so the figure can be read on
# its own.
plt.plot(d_space,p_list, label='Negative Log Likelihood')
plt.legend()
plt.grid()
plt.ylabel("Negative Log Likelihood")
plt.xlabel("$d$ [1/s]")

In [None]:
# Synthetic sweep: keep the dataset size fixed and vary how many of the
# (dt = 1) sequences are disrupted, fitting the disruptivity each time.
n_data = 1000
constraint = LinearConstraint([1], lb=1e-4)  # keeps d >= 1e-4
ratio_list = []
optimal_d = []
for n_disrupted in range(n_data):
    # Create the dataset
    dis_dt = np.array([1]*n_disrupted)
    non_dis_dt = np.array([1]*(n_data-n_disrupted))

    # Fraction of the data that is disrupted
    ratio_list.append(n_disrupted/n_data)

    # Minimize the negative log likelihood, starting from d = 1
    res = minimize(
        disruptivity_neg_log_likelihood,
        x0=1,
        args=(dis_dt, non_dis_dt),
        constraints=(constraint),
    )
    optimal_d.append(res.x[0])

In [None]:
# Fitted disruptivity as a function of the disrupted fraction of the data.
plt.plot(ratio_list, optimal_d, label='Optimal $d$')
plt.grid()
plt.legend()
plt.ylabel("$d$ [1/s]")
plt.xlabel("Data Ratio")

## Making the connection

In [None]:
# Let's pipe this together: fit one disruptivity per histogram bin from that
# bin's sequence durations. Something first, efficiency later.
# NOTE(review): for a bin without any sequences both sums in the loss are
# empty, so the objective looks constant and the result is presumably just the
# initial guess x0 = 1 — confirm before reading such bins off the plot.
nbins = 25
constraint = LinearConstraint([1], lb=1e-4)  # keeps d >= 1e-4
optimal_d = []
for bin_number in range(nbins):
    # Optimize
    res = minimize(
        disruptivity_neg_log_likelihood,
        x0=1,
        args=(dis_dt_list[bin_number], non_dis_dt_list[bin_number]),
        constraints=(constraint),
    )
    optimal_d.append(res.x[0])

# The second tuple entry is a list of zeros, one per bin.
plot('hello.png', dis_vis.subplot_disruptivity1d, (optimal_d, [0]*nbins, hist.bin_edges, entry_dict_1D))