# Notebook for identifying and removing bottlenecks from ICET implementation

In [1]:
from vedo import *
import os
from ipyvtklink.viewer import ViewInteractiveWidget
import pykitti
import numpy as np
import tensorflow as tf
import time

#limit GPU memory ------------------------------------------------
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
if gpus:
  try:
    memlim = 4*1024
    tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=memlim)])
  except RuntimeError as e:
    print(e)
#-----------------------------------------------------------------
# tf.config.set_visible_devices([], 'GPU') #run on CPU only -- seems to actually execute main parts of code faster here...

from tensorflow.math import sin, cos, tan
import tensorflow_probability as tfp
from ICET_spherical import ICET
from utils import R_tf
from metpy.calc import lat_lon_grid_deltas

%load_ext autoreload
%autoreload 2
%autosave 180
# %matplotlib notebook

# %%bash
# # python -m cProfile scan_match.py
# python scan_match.py

2022-11-27 16:45:02.509489: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-27 16:45:03.129978: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-27 16:45:04.120045: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/derm/anaconda3/envs/py39/lib/python3.9/site-packages/cv2/../../lib64:
2022-11-27 16:45:04.120139: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_p

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2022-11-27 16:45:05.878743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-27 16:45:05.880226: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-27 16:45:05.880422: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-27 16:45:05.880561: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

Autosaving every 180 seconds


In [2]:
basepath = '/media/derm/06EF-127D1/KITTI'
# sequence = '03' #forest
sequence = '09' #trees and small town
dataset = pykitti.odometry(basepath, sequence)
velo1 = dataset.get_velo(400)
c1 = velo1[:,:3]
velo2 = dataset.get_velo(401)
c2 = velo2[:,:3]

# fn1 = "/home/derm/ASAR/v3/spherical_paper/MC_trajectories/scene1_scan13.txt"
# c1 = np.loadtxt(fn1)
# fn2 = "/home/derm/ASAR/v3/spherical_paper/MC_trajectories/scene1_scan14.txt"
# c2 = np.loadtxt(fn2)

it = ICET(cloud1 = c1, cloud2 = c2, fid = 50, niter = 9, 
           draw = False, group = 2, RM = True, DNN_filter = False)

Ground truth poses are not avaialble for sequence 09.

 loading model took 4.76837158203125e-07 
 total:  5.0067901611328125e-06

 shuffling and converting to tensor took  0.012696027755737305 
 total:  0.012722015380859375

 converting to spherical took 0.1445903778076172 
 total:  0.15732693672180176

 getting cluster took 0.4924807548522949 seconds !!!

 fit_gaussian for scan 1 0.045018672943115234 
 total:  0.846733808517456


2022-11-27 16:45:10.450968: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-11-27 16:45:10.520179: I tensorflow/core/util/cuda_solvers.cc:179] Creating GpuSolver handles for stream 0x84a4f60



 ~~~~~~~~~~~~~~ 
 fit_gaussian for scan 2 0.05652356147766113 
 total:  2.3468685150146484 
 ~~~~~~~~~~~~~~

 estimated solution vector X: 
 tf.Tensor([ 0.2882643   0.02340015  0.00240992 -0.00082739  0.00107237 -0.0050481 ], shape=(6,), dtype=float32)

 ~~~~~~~~~~~~~~ 
 correcting solution estimate 0.6005465984344482 
 total:  2.9474680423736572 
 ~~~~~~~~~~~~~~

 ~~~~~~~~~~~~~~ 
 fit_gaussian for scan 2 0.04670572280883789 
 total:  2.9946579933166504 
 ~~~~~~~~~~~~~~

 estimated solution vector X: 
 tf.Tensor(
[ 5.7045066e-01  1.6899558e-02  6.7746909e-03 -4.9210084e-04
  1.4997894e-03 -2.4524932e-03], shape=(6,), dtype=float32)

 ~~~~~~~~~~~~~~ 
 correcting solution estimate 0.01548910140991211 
 total:  3.0101685523986816 
 ~~~~~~~~~~~~~~

 ~~~~~~~~~~~~~~ 
 fit_gaussian for scan 2 0.041318416595458984 
 total:  3.051738739013672 
 ~~~~~~~~~~~~~~

 estimated solution vector X: 
 tf.Tensor(
[ 7.54447639e-01  1.06849279e-02  1.01552475e-02 -5.68593270e-04
  1.74943893e-03  1.2166099

# get_cluster()

In [3]:
def gt2(rads, thresh = 0.5, mnp = 100):
    """testing new method of finding radial bins for spherical voxels"""
    
    before = time.time()

    max_buffer = 0.2 

    if len(tf.shape(rads)) < 2:
        rads = rads[:,None]

    OG_rads = rads #hold on to OG rads
    #replace all zeros in rads (result of converting ragged -> standard tensor) with some arbitrarily large value
    mask = tf.cast(tf.math.equal(rads, 0), tf.float32)*1000
    rads = rads + mask
    # print(rads)

    #sort in ascending order for each column in tensor
    top_k = tf.math.top_k(tf.transpose(rads), k = tf.shape(rads)[0])
#     print("\n top_k \n", top_k[1])
    rads = tf.transpose(tf.gather(tf.transpose(rads), top_k[1], batch_dims = 1))
    rads = tf.reverse(rads, axis = tf.constant([0]))
#     print("rads \n", rads)

    # calculate the forward difference between neighboring points
    z = tf.zeros([1, tf.shape(rads)[1].numpy()])
    shifted = tf.concat((rads[1:], z), axis = 0)
    diff = shifted - rads

    # #find where difference jumps
    jumps = tf.where(diff > thresh)
#     print("\n jumps \n", jumps) #[idx of jump, which spike is jumping]

    #----------------------------------------------------------------------
    #not sure if actually needed...
    #get indexes of all used spikes
    used = jumps[:,1][None,:]
    # print("used", used)
    biggest = tf.math.reduce_max(used, axis = 1).numpy()[0]
    # print("biggest", biggest)
    all_spikes = tf.cast(tf.linspace(0,biggest,biggest+1), tf.int64)[None,:] #list all spikes total
    # print("all_spikes", all_spikes)

    #find differnce
    missing = tf.sets.difference(all_spikes, used).values[None,:]
    # print("\n missing", missing)
    # z = tf.zeros(tf.shape(missing), dtype = tf.int64) #wrong...
    # z = 51*tf.ones(tf.shape(missing), dtype = tf.int64) #wrong...
    # print("z", z)

    #z should be this...
    # print("\n OG_rads", OG_rads)
    # ends = tf.math.argmax(OG_rads, axis = 0) #wrong -> not max arg, last nonzero argument!!
    zero = tf.constant(0, dtype = tf.float32)
    ends = tf.math.reduce_sum(tf.cast(tf.not_equal(OG_rads, zero), tf.int64), axis = 0) #correct
    # print("\n ends", ends)

    test = tf.gather(ends, missing[0])  #get index of last element of missing jump section
    # print("\n test", test)
    z = test[None,:]
    z -= 2 #fixes indexing bug
    # print("z", z)

    missing = tf.transpose(tf.concat((z, missing), axis = 0))
    # print(missing)

    jumps = tf.concat((jumps, missing), axis = 0) #concat missing stuff back at the end of jumps
#     print("\n jumps after fix", jumps)
    #----------------------------------------------------------------------
    
    print("\n jumps: \n", jumps)
    
    #find where the first large cluster occurs in each spike
   

    
    bounds = None

    return(bounds, jumps)

In [122]:
from utils import get_cluster, get_cluster_fast
# print("rads: \n", it.rads)

s = time.time()
bounds_old = get_cluster(it.rads, mnp = it.min_num_pts)
print("\n took", time.time() - s, " s with old method \n")
print("\n bounds_old: \n", bounds_old[:10])
print(np.shape(bounds_old))

s = time.time()
# bounds_new, jumps = gt2(it.rads, mnp = it.min_num_pts)
bounds_new = get_cluster_fast(it.rads, mnp = it.min_num_pts)
print("\n bounds_new: \n", bounds_new[:10])
print(" \n took", time.time() - s, " s with new method")


 getting cluster took 0.4227633476257324 seconds !!!

 took 0.42305421829223633  s with old method 


 bounds_old: 
 tf.Tensor(
[[ 8.5336237  14.82997036]
 [ 6.87181711 11.42481709]
 [29.31493568 42.68794632]
 [ 9.48455429 15.15051174]
 [ 7.33260059 11.61510181]
 [ 3.22240448  3.43098164]
 [ 6.44483805  9.70490646]
 [ 4.55999422  5.19567156]
 [10.68612194 12.32955456]
 [ 8.53601074 10.99322033]], shape=(10, 2), dtype=float64)
(247, 2)

 getting cluster took 0.019646406173706055 seconds !!!

 bounds_new: 
 tf.Tensor(
[[ 8.522524  14.541453 ]
 [ 6.7979364 11.412506 ]
 [27.810429  42.660294 ]
 [ 9.476366  14.949856 ]
 [ 7.323119  11.605854 ]
 [ 3.2152867  3.4199846]
 [ 6.439029   9.676741 ]
 [ 4.553308   5.183438 ]
 [10.583586  12.2757635]
 [ 8.534277  10.990534 ]], shape=(10, 2), dtype=float32)
 
 took 0.02012801170349121  s with new method


In [99]:
#identifying location of jumps without looping
print("old slow soln shape: \n", tf.shape(bounds_old)) #want to produce this same shape!!!
# print("\n bounds_old: \n", bounds_old[:10])

#get all radial measurements
#(temp-- already done inside function)-----------------------------
mask = tf.cast(tf.math.equal(it.rads, 0), tf.float32)*1000
rads = it.rads + mask
#sort in ascending order for each column in tensor
top_k = tf.math.top_k(tf.transpose(rads), k = tf.shape(rads)[0])
rads = tf.transpose(tf.gather(tf.transpose(rads), top_k[1], batch_dims = 1))
rads = tf.reverse(rads, axis = tf.constant([0]))
# print("\n rads: \n", rads[:10])
# print("\n rads: \n", np.shape(rads))
# print("\n it.rads \n", it.rads)
#------------------------------------------------------------------

# print("\n jumps: \n", tf.shape(jumps))
# print("\n jumps: \n", jumps) #[idx of jump, which spike is jumping]

# y, idx = tf.unique(jumps[:,0]) #was this
jumps_temp = tf.gather(jumps, tf.argsort(jumps[:,1]), axis=0) #reorder based on index
y, idx = tf.unique(jumps_temp[:,1]) #test
print("\n y \n", y[:15], "\n", tf.shape(y), "\n \n idx \n", idx[:15], "\n", tf.shape(idx))
# print("\n jumps_temp \n", jumps_temp[:15])
# print("\n jumps[:,_]: \n", jumps[:,0])

# get ragged tensor containing indices where jumps occur inside each wedge shaped voxel
# jumps_rag = tf.RaggedTensor.from_value_rowids(jumps[:,1], idx) #WAS THIS --wrong!!

jumps_rag = tf.RaggedTensor.from_value_rowids(jumps_temp[:,0], jumps_temp[:,1]) #TEST - should be this??
# jumps_rag = tf.RaggedTensor.from_value_rowids(jumps[:,1], jumps[:,0]) #TEST
# print("\n jumps_rag \n", jumps_rag[:15])
print("\n rads[0,_] \n", rads[:30,0])


# append 0 to beginning of each ragged elemet of jumps_rag
zeros = tf.zeros(tf.shape(jumps_rag)[0])[:,None]
zeros = tf.cast(tf.RaggedTensor.from_tensor(zeros), tf.int64)
jumps_rag = tf.concat([zeros.with_row_splits_dtype(tf.int64), jumps_rag.with_row_splits_dtype(tf.int64)], axis = 1)
print("\n jumps_rag \n", jumps_rag[:15])
# print("\n jumps_rag \n", jumps_rag.to_tensor())

#get num points between each jump 
npts_between_jumps = tf.experimental.numpy.diff(jumps_rag.to_tensor())
# print("\n npts_between_jumps:\n ",npts_between_jumps[:10,:10])
# print("\n npts_between_jumps:\n ",npts_between_jumps)

#flag spikes where all npts_between_jumps are less than mnp
biggest_jump = tf.math.reduce_max(npts_between_jumps, axis = 1)
# print("\n biggest_jump \n", biggest_jump)
mnp = 100 #minumum number of points per cluster (defined in ICET class)
good_clusters = tf.cast(tf.math.greater(biggest_jump, mnp), tf.int32)
# good_clusters = tf.RaggedTensor.from_value_rowids(good_clusters, y).to_tensor()[:,0]  #fill in skipped indices
print("\n good_clusters (hold on to this for later) \n", good_clusters)

#get idx within jumps_rag corresponding to first sufficiently large jump
big_enough = tf.cast(tf.math.greater(npts_between_jumps, 100), tf.int32)
# print(big_enough[:10])
first_big_enough = tf.math.argmax(big_enough, axis = 1)
print("\n first_big_enough: \n", first_big_enough)
# print("\n first_big_enough: \n", first_big_enough)

print("\n everything looks good up to this point :)")

old slow soln shape: 
 tf.Tensor([247   2], shape=(2,), dtype=int32)

 y 
 tf.Tensor([ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14], shape=(15,), dtype=int64) 
 tf.Tensor([247], shape=(1,), dtype=int32) 
 
 idx 
 tf.Tensor([0 0 1 1 1 1 1 1 2 2 2 2 2 2 2], shape=(15,), dtype=int32) 
 tf.Tensor([793], shape=(1,), dtype=int32)

 rads[0,_] 
 tf.Tensor(
[8.522524  8.533624  8.546218  8.55991   8.569108  8.572579  8.577195
 8.584386  8.585438  8.611754  8.617272  8.625081  8.626165  8.634203
 8.636234  8.636838  8.637456  8.651796  8.652797  8.655872  8.65624
 8.663728  8.6686325 8.670218  8.671909  8.680213  8.680515  8.685126
 8.693569  8.69501  ], shape=(30,), dtype=float32)

 jumps_rag 
 <tf.RaggedTensor [[0, 602, 603], [0, 498, 718, 719, 725, 806, 812],
 [0, 43, 87, 131, 175, 220, 264, 308, 710, 712, 798, 813, 818, 819],
 [0, 586, 587, 588], [0, 476, 519], [0, 537, 582], [0, 502], [0, 213],
 [0, 101, 102, 105, 107, 490], [0, 322, 367, 457, 502, 547, 591, 636],
 [0, 744, 745, 746, 748], 

In [111]:
#get inner and outer (temp simple way-- just use radial measurements of inner and outermost points in cluster)
# print("jumps_rag.to_tensor(): \n", jumps_rag.to_tensor(), "\n")

#get index of radial measurements that defines inner bounds of voxel 
inner_idx = tf.gather(jumps_rag.to_tensor(), first_big_enough, batch_dims=1)
# print("\n inner_idx: \n", inner_idx, "\n")
inner  = tf.gather(tf.transpose(rads), inner_idx, batch_dims=1)
# print("\n inner: \n", inner)

outer_idx = tf.gather(jumps_rag.to_tensor(), first_big_enough + 1, batch_dims=1) - 1 #DEBUG: figure out if I need -1
# print("\n outer_idx: \n", outer_idx, "\n")
outer  = tf.gather(tf.transpose(rads), outer_idx, batch_dims=1)
# print("\n outer: \n", outer)

bounds = np.array([inner, outer]).T
print(bounds)

[[ 8.522524  14.541453 ]
 [ 6.7979364 11.412506 ]
 [27.810429  42.660294 ]
 [ 9.476366  14.949856 ]
 [ 7.323119  11.605854 ]
 [ 3.2152867  3.4199846]
 [ 6.439029   9.676741 ]
 [ 4.553308   5.183438 ]
 [10.583586  12.2757635]
 [ 8.534277  10.990534 ]
 [ 7.3641324  9.163473 ]
 [ 9.827092  13.412587 ]
 [ 4.4736133  6.4647164]
 [ 5.4322524  8.0986805]
 [ 7.2821875  9.111587 ]
 [ 6.429257   8.14609  ]
 [ 5.6875544  8.22032  ]
 [ 7.3912487 11.784734 ]
 [ 3.184801   3.3724418]
 [20.301628  26.280018 ]
 [ 6.316912   7.9331627]
 [12.843592  13.716571 ]
 [ 4.45389    5.0881677]
 [ 4.2877073  4.974927 ]
 [ 8.358009  15.102823 ]
 [ 7.557136  13.214449 ]
 [ 6.534025   8.523699 ]
 [26.140818  31.868149 ]
 [ 3.570882   4.049336 ]
 [ 3.3989444  4.90052  ]
 [ 6.9524817 13.892774 ]
 [16.547949  31.744976 ]
 [ 6.164683   7.63967  ]
 [ 5.5995975  7.8524923]
 [12.339181  13.730462 ]
 [ 4.3467436  5.264814 ]
 [ 4.879744   6.309086 ]
 [ 5.2533135  7.2804375]
 [ 4.4016423  5.292855 ]
 [ 6.5445576 10.054061 ]


In [73]:
#infill zero elements throughout (use tf.ragged.from_value_rowids keyed by y)
inner = tf.RaggedTensor.from_value_rowids(inner, y).to_tensor()[:,0]
#add zeros to end of bounds to get to same number of total voxels as OG_rads 
print(tf.shape(rads)[0])
print(tf.shape(inner))
# inner = tf.pad(inner, [[0,tf.shape(rads)[1]-len(inner)]]) #old
inner = tf.pad(inner, [[0,tf.shape(rads)[0]-len(inner)-1]]) #new
# print("\n inner \n", inner)
# print("\n inner \n", tf.shape(inner))

#concat idx and y, use gather_nd instead of converting to ragged and back?? 
idx1 = tf.concat((tf.cast(tf.range(len(inner))[:,None], tf.int64), inner[:,None]), axis = 1) #wrong?
# idx1 = tf.concat((inner[:,None], tf.cast(tf.range(len(inner))[:,None], tf.int64)), axis = 1) #test
print("\n idx \n", idx1[:7])
inside_bound = tf.gather_nd(rads, idx1)
print("\n inside_bound \n", inside_bound[:15])

#repeat for outside bound
outer = tf.gather(jumps_rag.to_tensor(), first_big_enough +1, batch_dims = 1)
outer = tf.RaggedTensor.from_value_rowids(outer, y).to_tensor()[:,0]
outer = tf.pad(outer, [[0,tf.shape(rads)[0]-len(outer)-1]])
idx2 = tf.concat((outer[:,None], tf.cast(tf.range(len(outer))[:,None], tf.int64)), axis = 1) #test
outside_bound = tf.gather_nd(rads, idx2)
print("\n outside_bound \n", outside_bound[:15])


#TODO add voxel length padding 
#  (max half distance betweeen last in cluster and first point outside cluster)

tf.Tensor(871, shape=(), dtype=int32)
tf.Tensor([247], shape=(1,), dtype=int32)

 idx 
 tf.Tensor(
[[  0   0]
 [  1   0]
 [  2 308]
 [  3   0]
 [  4   0]
 [  5   0]
 [  6   0]], shape=(7, 2), dtype=int64)

 inside_bound 
 tf.Tensor(
[8.522524 8.533624 0.       8.55991  8.569108 8.572579 8.577195 8.584386
 8.585438 8.611754 8.617272 8.625081 8.626165 8.634203 8.636234], shape=(15,), dtype=float32)

 outside_bound 
 tf.Tensor(
[14.82997   11.424817  42.687946  15.150512  11.615102   3.4309816
  9.704906   5.1956716 12.329555  10.99322    9.232235  13.532985
  6.536972   8.101664   9.171636 ], shape=(15,), dtype=float32)


In [39]:
#debug

#init output vector
# bounds = np.zeros([tf.shape(first_big_enough)[0], 2]) #DEBUG: why is first_big_enough longer than total cells?
bounds = np.zeros([tf.shape(rads)[1].numpy(), 2]) #what I should have

spike_id = tf.constant([1])
print(spike_id)

start_pt_idx  = tf.gather(jumps_rag, tf.gather(first_big_enough, spike_id)).to_tensor()
print(start_pt_idx)

idx = tf.constant([spike_id, start_pt_idx])
test = tf.gather_nd(rads, idx)
print(test)

# for i in range(tf.shape(first_big_enough)[0]):

#     inner = tf.gather(rads, )
    
#     bounds[i, 0] = inner


tf.Tensor([1], shape=(1,), dtype=int32)
tf.Tensor([[  0 602 603]], shape=(1, 3), dtype=int64)


ValueError: TypeError: Scalar tensor has no `len()`
Traceback (most recent call last):

  File "/home/derm/anaconda3/envs/py39/lib/python3.9/site-packages/tensorflow/python/framework/ops.py", line 1104, in __len__
    raise TypeError("Scalar tensor has no `len()`")

TypeError: Scalar tensor has no `len()`



In [52]:
#multi-dimensional indexing test
a = tf.random.uniform([3,3])
print(a)
idx = tf.constant([[1,1],[2,2]])
b = tf.gather_nd(a, idx)
print(b)

#test adding zeros to start of each ragged tensor
# # print(tf.shape(jumps_rag))
# zeros = tf.zeros(tf.shape(jumps_rag)[0])[:,None]
# zeros = tf.cast(tf.RaggedTensor.from_tensor(zeros), tf.int64)
# # print(tf.shape(zeros))
# # print(tf.shape(jumps_rag))
# test = tf.concat([zeros.with_row_splits_dtype(tf.int64), jumps_rag.with_row_splits_dtype(tf.int64)], axis = 1)
# print(test)

tf.Tensor(
[[0.10179257 0.6037365  0.48156595]
 [0.53692627 0.8813385  0.3952589 ]
 [0.6917515  0.13837957 0.05244076]], shape=(3, 3), dtype=float32)
tf.Tensor([0.8813385  0.05244076], shape=(2,), dtype=float32)


# fit_gaussian()

In [None]:
def fg2(cloud, rag, npts):
    """new method of fitting gaussian to better handle ragged input data"""
    numSamples = 3
    
    coords = tf.gather(cloud, rag)
    mu = tf.math.reduce_mean(coords, axis = 1)[:,None]
#     mu = tf.math.reduce_mean(coords, axis = 1) #old
#     print(mu)

#   TODO: try randomly sampling 30 points from each ragged cell, use reduced num pts to calculate covariance
#     subsampled = tf.map_fn(sample, it.inside2) #works but SLOW
#     subsampled = tf.map_fn(sample, it.inside2, parallel_iterations=True)
#     subsampled = tf.gather(rag,tf.range(tf.shape(rag)[0]))[:numSamples] #wrong
#     print(subsampled)

    xpos = tf.gather(cloud[:,0], rag)
    ypos = tf.gather(cloud[:,1], rag)
    zpos = tf.gather(cloud[:,2], rag)
#     c = tfp.stats.covariance(xpos.to_tensor(), ypos.to_tensor())

#     print(xpos)
    idx = tf.range(30)
    xpos = tf.gather(xpos, idx, axis = 1)
    ypos = tf.gather(ypos, idx, axis = 1)
    zpos = tf.gather(zpos, idx, axis = 1)
    print(xpos)

    xx = tf.math.reduce_sum(tf.math.square(xpos - mu[:,:,0] ), axis = 1)/npts
    yy = tf.math.reduce_sum(tf.math.square(ypos - mu[:,:,1] ), axis = 1)/npts
    zz = tf.math.reduce_sum(tf.math.square(zpos - mu[:,:,2] ), axis = 1)/npts
    xy = tf.math.reduce_sum( (xpos - mu[:,:,0])*(ypos - mu[:,:,1]), axis = 1)/npts  #+
    xz = tf.math.reduce_sum( (xpos - mu[:,:,0])*(zpos - mu[:,:,2]), axis = 1)/npts #-
    yz = tf.math.reduce_sum( (ypos - mu[:,:,1])*(zpos - mu[:,:,2]), axis = 1)/npts #-

    sigma = tf.Variable([xx, xy, xz,
                        xy, yy, yz,
                        xz, yz, zz]) 
    sigma = tf.reshape(tf.transpose(sigma), (tf.shape(sigma)[1] ,3,3))
        
#     mu = None
    return(mu, sigma)

@tf.function
def sample(x, samples=3):
  """https://stackoverflow.com/questions/71073873/sample-from-ragged-tensor"""  
  length = tf.shape(x)[0]
#   was this
#   x = tf.cond(tf.less_equal(length, samples), lambda: x, lambda: tf.gather(x, tf.random.shuffle(tf.range(length))[:samples]))
 
#   test
#   x = tf.cond(tf.less_equal(length, samples), lambda: x, lambda: tf.gather(x, tf.range(length))[:samples])
  x = tf.gather(x,tf.range(length))[:samples]

    
  return x

In [None]:
s = time.time()
mu2, sigma2 = it.fit_gaussian(it.cloud2_tensor, it.inside2, tf.cast(it.npts2, tf.float32))
print("\n took", time.time() - s, " s with old method")

s = time.time()
mu2, sigma2 = fg2(it.cloud2_tensor, it.inside2, tf.cast(it.npts2, tf.float32))
print(" \n took", time.time() - s, " s with new method")

# print(it.npts2)
# print(it.inside2)

In [None]:
# vect = it.inside2
vect = tf.ragged.constant([[],[1,2,3,4],[5,4,3,2,1],[6],[99],[7,8,9,10,11,12,13]])
# print(tf.shape(vect)[0])
print("vect", vect)
c = tf.map_fn(sample, vect)
# print(c)

#wrong
# test = tf.gather(vect,tf.range(tf.shape(vect)[0]))[:3]
idx = tf.range(3)
print("\n idx", idx)
test = tf.gather(vect, idx , axis = 1)
print("\n test", test) #NOTE: indices with too few elements produce unexpected behavior
                        #that doesn't matter since they get suppressed anyways
    
vec2 = tf.random.categorical(vect, 2)
    