In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# This will reload imports before executing code, allowing you to easily change contents of custom scripts
%load_ext autoreload
%autoreload 2

# Stay classification: batch of synthetic training data

In [3]:
import numpy as np
import pandas as pd

In [4]:
import os, sys
sys.path.append('/home/sandm/Notebooks/stay_classification/src/')

In [5]:
# #############################################################################
%matplotlib inline
import matplotlib.pyplot as plt

In [6]:
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, 10)]

from random import shuffle

In [7]:
shuffle(colors)

# Clustering

## Split the clusters which have a temporal gap

### IQR-plotting

For each sub-cluster, plot the quantile boxes with whistkers.

**Notes**
* the boxes usually capture the baseline of the underlying stay
* the forward and backward clusters
    * usually the same clusters in the stays with similar IQRs
    * usually different in the 

### From here

At this point, it seems that the basic clusters are formed. 

Next, use the IQRs of these clusters as the new bounds for extending the cluster: essentially using the extensible box method.

Note that the IQR can be larger than the allow distance threshold; the box would therefore need to be the smaller of the two but with the same mean and/or median

### Summary

1. `get_clusters_x` finds the clusters based on nearness of events in space and time
2. `merge_clusters` merges neraby clusters based on ...
3. `merge_clusters_2`  merges neraby clusters based on ...
4. `extend_clusters` extend the clusters
5. `separate_clusters` break the overlapping clusters and then re-merge
6. `merge_clusters_2` merge the separated clusters
7. `intersect` the forward and backward clusters

### Notes

Mostly, the results are good: more than 80% of the trajectories have prec and/or recall above 0.8

The main issue
* loss of precision due to non-stays being identified
    * this (loss) is increased with higher event density but compensated by an increase in recall
        * since many clusters are identified and the stay events have a higher probability of being classified as stay events.
<br/>

**TODO** since stay events are classified and not the accuracy of the stays, it would be useful to have a measure of the stay accuracy: _ie_ once a stay is identified, how much of that stay is correctly classified.

In [8]:
from helper__3stays_v3_scripts import switch_indices

In [9]:
from synthetic_data.plotting import plot_trajectory, add_plot_trajectory, add_plot_seg_boxes

In [10]:
from helper__3stays_v3_scripts__plotting import plot_cluster_boxplots, add_plot_cluster_boxplots

In [11]:
from helper__3stays_v3_scripts__eval import get_segments_scores

In [12]:
print_clusts = lambda cluster_list : [print(f"[{c[0]:4d},{c[-1]:4d}]") for c in cluster_list]
#print_ctimes = lambda cluster_list : [print(f"[{time_arr[c[0]]:6.3f},{time_arr[c[-1]]:6.3f}]") for c in cluster_list]
#print_ctdiff = lambda cluster_list : [print(f"{time_arr[c[-1]] - time_arr[c[0]]:6.3f}") for c in cluster_list]
#print_times = lambda l: list(map(lambda x: f"{x:6.3f}",l))

# Start here

In [13]:
from synthetic_data.trajectory import get_stay_segs, get_adjusted_stays
#from synthetic_data.trajectory_class import get_rand_traj
from synthetic_data.plotting import plot_trajectory, add_plot_seg_boxes

from synthetic_data.trajectory_class import get_trajectory
#from synthetic_data.trajectory import get_stay

dsec = 1/3600.0
t_total = np.arange(0,24,dsec)

In [14]:
from synthetic_data.canonical_stays import get3e, get3

In [15]:
from helper__get_clusters import get_clusters_1, get_clusters_2, get_clusters_3, get_clusters_4

In [16]:
time_thresh = 1/6
dist_thresh=0.25

In [17]:
rand_range = lambda min_, max_, size: (max_-min_)*np.random.random_sample(size=size) + min_

event_frac = rand_range(0.01,0.001, 1)[0]
duplicate_frac = 0.30 #rand_range(1,0.3,0.05)[0]

print(f"Event frac. = {event_frac:6.3f}\nDupli. frac. = {duplicate_frac:6.3f}")

configs = {
    'time_thresh':1/6,
    'dist_thresh':0.5,
    'event_frac':event_frac,
    'duplicate_frac':duplicate_frac,    
    'noise_min':0.02,
    'noise_max':0.15
}

Event frac. =  0.007
Dupli. frac. =  0.300


#### Another way

In [18]:
def get_rand_stay():
    
    rand_range = lambda min_, max_, size: (max_-min_)*np.random.random_sample(size=size) + min_

    event_frac = rand_range(0.05,0.001, 1)[0]
    duplicate_frac = 0.30 #rand_range(1,0.3,0.05)[0]

    configs = {
        'time_thresh':1/6,
        'dist_thresh':0.5,
        'event_frac':event_frac,
        'duplicate_frac':duplicate_frac,    
        'noise_min':0.02,
        'noise_max':0.15
    }

    x_dist = rand_range(0.52,5.0, 10)[0]    
    x_dist = (-1)**np.random.randint(0,2,1)*x_dist
    
    mid_len = rand_range(0.2, 8, 10)[0]
    shift = rand_range(-5, 5, 21)[0]
    
    return configs, x_dist, mid_len, shift

In [19]:
from synthetic_data.trajectory_class import pickle_trajectory
from datetime import datetime

import os

date_tag = datetime.today().strftime('%Y%m%d')
#data_dir = f"./testdata_{date_tag}_4_no_iqr_and_wi_ranges_and_wi_shift/"

data_dir = f"./testdata_training_set__canonical_3stays/"

try:
    os.makedirs(data_dir)
except OSError as e:
    print(e)

[Errno 17] File exists: './testdata_training_set__canonical_3stays/'


In [20]:
total = 11
ii = 0
while ii < total:
            
    configs, x_dist, mid_len, shift = get_rand_stay()          

    #if verbose: print(f"{x_dist:6.3f}, {mid_len:6.3f}, {shift:6.3f}")

    val = np.random.randint(0,2,1)
    if val:
        stays = get3(x_dist, mid_len, shift)
    else:
        stays = get3e(x_dist, mid_len, shift)            

    continuation = True
    m = 0
    while continuation:
        n = 0
        try:
            time_arr, raw_arr, noise_arr, segments = get_trajectory(stays, t_total, configs)
            t_segs, x_segs = get_stay_segs(get_adjusted_stays(segments, time_arr))
            stays_tag = int((x_segs.size)/3)
            continuation = False
        except:
            print("Failed at",m,n)
            if n > 10: 
                continuation = False
            else:
                n+=1
            pass

    try:
        trajectory_tag = f"trajectory{ii}_{stays_tag}stays"
        path_to_file =  data_dir + trajectory_tag
        pickle_trajectory(time_arr, raw_arr, noise_arr, segments, path_to_file + ".pkl")
        ii+=1
    except:
        break
        
    if ii % 10 == 0:
        print(f"{ii:4d} of {total:5d}")


  10 of    11




In [23]:
import pickle

for ii in range(10, 11):

    stays_tag = 3
    trajectory_tag = f"trajectory{ii}_{stays_tag}stays"    
    path_to_file =  data_dir + trajectory_tag
    
    trajectory = pickle.load( open(path_to_file + ".pkl", "rb") )    
    
    segments = trajectory['segments']
    t_arr = trajectory['time_arr']
    r_arr = trajectory['raw_locs_arr']
    x_arr = trajectory['nse_locs_arr']
    t_segs, x_segs = get_stay_segs(get_adjusted_stays(segments, t_arr))

    ax = plot_trajectory(t_arr, r_arr, x_arr, t_segs, x_segs, dist_thresh);
    add_plot_seg_boxes(t_segs, x_segs, dist_thresh, ax)
    ax.set_xlim([5.75,18.25]);

    ylim = [x_arr.min()-2*dist_thresh, x_arr.max()+2*dist_thresh]

    plt.savefig(path_to_file + ".png")
    plt.close()
    
    if ii % 50 == 0:
        print(f"{ii:5d} of 1000")
#'''

In [22]:
data_dir

'./testdata_training_set__canonical_3stays/'

In [None]:
!ls 