In [1]:
import os
import sys
import time

import numpy as np
import pandas as pd
import math
from tqdm import tqdm

from trackml.dataset import load_event
from trackml.randomize import shuffle_hits
from trackml.score import score_event

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import collections as coll
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree

sys.path.append("../..")
import merge as merge
import extension as ext
import zroutlier as zro
import free_hits as free
import track_score as score2
import straight_tracks as strt
import eda_utils as eda
import r0outlier as r0o

%matplotlib inline

In [2]:
# Directory (relative to this notebook) that load_event reads the event files from.
TRAIN_PATH = '../../../input/train_1'
# Numeric id of the single event used by the cells below.
event_id = 1000

In [3]:

# Event file names carry a zero-padded 9-digit id (e.g. 'event000001000').
# Padding with zfill keeps the prefix correct for ids that are not 4 digits long,
# which the previous hard-coded 'event00000' prefix did not.
event_prefix = 'event' + str(event_id).zfill(9)
hits, cells, particles, truth = load_event(os.path.join(TRAIN_PATH, event_prefix))

# Total in-memory footprint of the four frames, index included.
mem_bytes = sum(frame.memory_usage(index=True).sum()
                for frame in (hits, cells, particles, truth))
print('{} memory usage {:.2f} MB'.format(event_prefix, mem_bytes / 2**20))

event000001000 memory usage 18.46 MB


In [4]:
# Common path prefix of the per-model label files; the model number and '.csv'
# are appended by the reader below.
helix_root_path = '../../1000_r0_exp3/event_' + str(event_id) + '_labels_train_helix'


def _read_helix_labels(model_no):
    """Return the `label` column of the saved CSV for helix model `model_no` as an array."""
    return pd.read_csv(helix_root_path + str(model_no) + '.csv').label.values


# One array per model; the variable names are referenced by later cells.
labels_helix1 = _read_helix_labels(1)
labels_helix2 = _read_helix_labels(2)
labels_helix3 = _read_helix_labels(3)
labels_helix4 = _read_helix_labels(4)
labels_helix5 = _read_helix_labels(5)
labels_helix6 = _read_helix_labels(6)
labels_helix7 = _read_helix_labels(7)
labels_helix8 = _read_helix_labels(8)
labels_helix9 = _read_helix_labels(9)
labels_helix10 = _read_helix_labels(10)
labels_helix11 = _read_helix_labels(11)
labels_helix12 = _read_helix_labels(12)
labels_helix13 = _read_helix_labels(13)
labels_helix14 = _read_helix_labels(14)
labels_helix15 = _read_helix_labels(15)
labels_helix16 = _read_helix_labels(16)
labels_helix17 = _read_helix_labels(17)
labels_helix18 = _read_helix_labels(18)
labels_helix19 = _read_helix_labels(19)
labels_helix20 = _read_helix_labels(20)
labels_helix21 = _read_helix_labels(21)
labels_helix22 = _read_helix_labels(22)
labels_helix23 = _read_helix_labels(23)
labels_helix24 = _read_helix_labels(24)
labels_helix25 = _read_helix_labels(25)
labels_helix26 = _read_helix_labels(26)
labels_helix27 = _read_helix_labels(27)
labels_helix28 = _read_helix_labels(28)
labels_helix29 = _read_helix_labels(29)
labels_helix42 = _read_helix_labels(42)

In [5]:
def create_one_event_submission(event_id, hits, labels):
    """Build the submission frame for a single event.

    Parameters:
     - event_id: value written into every row of the `event_id` column
     - hits: DataFrame with a `hit_id` column, one row per hit
     - labels: per-hit track ids, same length and order as `hits`

    Returns: DataFrame with integer columns `event_id`, `hit_id`, `track_id`.
    """
    n_hits = len(hits)
    column_names = ["event_id", "hit_id", "track_id"]
    stacked = np.column_stack(([event_id] * n_hits, hits.hit_id.values, labels))
    # Cast after stacking so every column ends up as an integer column.
    return pd.DataFrame(data=stacked, columns=column_names).astype(int)

def score_one_submission(event_id, hits, labels, truth):
    """Score one labelling of an event against the truth and print the result.

    Builds the submission with create_one_event_submission, scores it with
    score_event, and prints the score with 8 decimals. Returns nothing.
    """
    event_submission = create_one_event_submission(event_id, hits, labels)
    event_score = score_event(truth, event_submission)
    message = "Score for event %d: %.8f" % (event_id, event_score)
    print(message)

In [None]:
# Optional diagnostic cell: compare the tracks of one model with the ground truth
# and bucket them as perfect / short / imperfect / horrible.
# NOTE: despite its name, `helix6` currently holds a copy of labels_helix9.
helix6 = np.copy(labels_helix9)
#helix6 = merge.remove_outliers(helix6, hits, smallest_track_size=6, aggressive=False, print_counts=False)
helix6, small_count = merge.remove_small_tracks(helix6, smallest_track_size=6)
#helix6 = r0o.remove_badr0_tracks(helix6, hits)
helix6 = merge.renumber_labels(helix6)
tracks = np.unique(helix6)
perfect_tracks, short_tracks, imperfect_tracks, horrible_tracks = [], [], [], []
for track in tracks:
    # Label 0 is skipped, as are tracks with fewer than 6 hits.
    if track == 0:
        continue
    tix = np.where(helix6 == track)[0]
    if len(tix) < 6:
        continue
    is_match, correct, incorrect = eda.track_distance_from_truth(track, helix6, hits, truth)
    if is_match:
        bucket = perfect_tracks
    elif incorrect == 0:
        bucket = short_tracks
    elif incorrect <= 4 and correct >= incorrect:
        bucket = imperfect_tracks
    else:
        bucket = horrible_tracks
    bucket.append(track)

# The total counts every distinct label, including 0 and tracks skipped above.
print('Total tracks:     ' + str(len(tracks)))
for caption, members in (('Perfect tracks:   ', perfect_tracks),
                         ('Short tracks:     ', short_tracks),
                         ('Imperfect tracks: ', imperfect_tracks),
                         ('Horrible tracks:  ', horrible_tracks)):
    print(caption + str(len(members)))
# Helix1: 8143, perfect 1972, short 1364, imperfect 3547, horrible 1259
# Helix2: 7666, perfect 2020, short 1170, imperfect 3522, horrible 953
# Helix3: 6717, perfect 1783, short 1589, imperfect 2696, horrible 648
# Helix4: 6688, perfect 1547, short 1649, imperfect 2780, horrible 711
# Helix5: 6645, perfect 1600, short 1763, imperfect 2781, horrible 500
# Helix6: 7436, perfect 1755, short 1716, imperfect 3361, horrible 603
# Helix7: 7451, perfect 1795, short 1795, imperfect 3299, horrible 589
# Helix8: 7421, perfect 1734, short 1638, imperfect 3302, horrible 746
# Helix9: 7414, perfect 1807, short 1699, imperfect 3276, horrible 631

In [None]:
def display_track_quality(labels, hits, truth):
    """Print how many tracks in `labels` are perfect, short, imperfect or horrible.

    Label 0 and tracks with fewer than 6 hits are not classified. Every other
    track is classified from the (is_match, correct, incorrect) triple returned
    by eda.track_distance_from_truth:
     - perfect:   is_match is true
     - short:     no incorrect hits
     - imperfect: at most 4 incorrect hits and at least as many correct ones
     - horrible:  everything else

    The printed total counts every distinct label, including 0 and the tracks
    that were skipped. Returns nothing.
    """
    track_ids, hit_counts = np.unique(labels, return_counts=True)
    tally = {'perfect': 0, 'short': 0, 'imperfect': 0, 'horrible': 0}
    for track_id, n_hits in zip(track_ids, hit_counts):
        if track_id == 0 or n_hits < 6:
            continue

        is_match, correct, incorrect = eda.track_distance_from_truth(track_id, labels, hits, truth)
        if is_match:
            category = 'perfect'
        elif incorrect == 0:
            category = 'short'
        elif incorrect <= 4 and correct >= incorrect:
            category = 'imperfect'
        else:
            category = 'horrible'
        tally[category] += 1

    print('Total tracks:     ' + str(len(track_ids)))
    print('Perfect tracks:   ' + str(tally['perfect']))
    print('Short tracks:     ' + str(tally['short']))
    print('Imperfect tracks: ' + str(tally['imperfect']))
    print('Horrible tracks:  ' + str(tally['horrible']))

In [None]:
# Score model 6 as a whole, then split it into quality tiers and score each tier alone.
labels = np.copy(labels_helix6)
score_one_submission(event_id, hits, labels, truth) # 0.63580502
strong, medium, weak = r0o.split_tracks_based_on_quality(labels, hits)
for tier_labels in (strong, medium, weak):
    score_one_submission(event_id, hits, tier_labels, truth)
# (0.5 weak, 0.1 strong): 0.28793004 (1776/3737), 0.31400817 (1582/5075), 0.03201160 (28/646)
# (0.5 weak, 0.15 strong): 0.35599925 (2247/4484), 0.24593896 (1111/4328), 0.03201160 (28/646)
# (0.5 weak, 0.2 strong): 0.40582813 (2604/5035), 0.19611008 (754/3777), 0.03201160 (28/646)

In [None]:
# Track-quality breakdown for each tier, in the order strong, medium, weak.
for tier_labels in (strong, medium, weak):
    display_track_quality(tier_labels, hits, truth)

In [6]:
# Clean the labels of models 1, 2, 4 and 5 before merging: first
# merge.remove_outliers in aggressive mode, then r0o.remove_badr0_tracks.
# Model 3 is skipped (its lines are commented out).
# NOTE: every statement overwrites its own input, so re-running this cell cleans
# the already-cleaned labels again; re-run the CSV loading cell to start from
# the raw labels.
labels_helix1 = merge.remove_outliers(labels_helix1, hits, cells, aggressive=True, print_counts=False)
labels_helix2 = merge.remove_outliers(labels_helix2, hits, cells, aggressive=True, print_counts=False)
#labels_helix3 = merge.remove_outliers(labels_helix3, hits, cells, aggressive=True, print_counts=False)
labels_helix4 = merge.remove_outliers(labels_helix4, hits, cells, aggressive=True, print_counts=False)
labels_helix5 = merge.remove_outliers(labels_helix5, hits, cells, aggressive=True, print_counts=False)
labels_helix1 = r0o.remove_badr0_tracks(labels_helix1, hits)
labels_helix2 = r0o.remove_badr0_tracks(labels_helix2, hits)
#labels_helix3 = r0o.remove_badr0_tracks(labels_helix3, hits)
labels_helix4 = r0o.remove_badr0_tracks(labels_helix4, hits)
labels_helix5 = r0o.remove_badr0_tracks(labels_helix5, hits)

In [7]:
# Label sets to merge, in merge order. The first entry seeds the merge; every
# later entry is merged into the running result. Disabled models are kept as
# comments together with the notes recorded when they were tried.
all_labels = [
    labels_helix1,
    labels_helix2,
    labels_helix6,
    labels_helix7,
    labels_helix8,
    labels_helix9,
    labels_helix10,
    labels_helix11,
    labels_helix12,
    labels_helix13,
    labels_helix14,
    # labels_helix15,
    labels_helix16,
    # labels_helix17,
    # labels_helix18,
    # labels_helix19,
    labels_helix20,
    # labels_helix21,  # 21 helps slightly, 22 hurts
    labels_helix23,  # 23 helped 0.001, but combined with 26 had only slight impr. over 26 alone
    labels_helix26,  # 26 helped almost 0.002
    labels_helix42,
    labels_helix5,
    # labels_helix1,  # merging with 1 again at the end helps
    # labels_helix3,
    labels_helix4,
]

# Split every label set into strong / medium / weak tiers, keeping the three
# lists aligned with `all_labels`.
strong_labels, medium_labels, weak_labels = [], [], []
for label in all_labels:
    strong, medium, weak = r0o.split_tracks_based_on_quality(label, hits)
    strong_labels.append(strong)
    medium_labels.append(medium)
    weak_labels.append(weak)

In [8]:
def _merge_label_sequence(all_labels, hits, truth, cleanup_every=None, **merge_options):
    """Merge a sequence of label arrays left to right into a single labelling.

    Shared implementation of the merge_all_*labels functions below.

    Parameters:
     - all_labels: non-empty sequence of per-hit label arrays; the first one seeds
       the result (it is copied, not modified), each later one is merged in
       with merge.heuristic_merge_tracks
     - hits: hits DataFrame, passed through to the merge and scoring helpers
     - truth: ground truth, used only to print the running score
     - cleanup_every: if set, merge.remove_small_tracks(smallest_track_size=3) is
       applied to the running result after every `cleanup_every`-th merge
     - merge_options: extra keyword arguments forwarded unchanged to
       merge.heuristic_merge_tracks (print_summary=False is always passed)

    Reads the notebook-level `event_id` for the score printout.

    Returns: the merged label array.
    """
    labels_merged = np.copy(all_labels[0])
    for merge_count, next_labels in enumerate(all_labels[1:], start=1):
        labels_merged = merge.heuristic_merge_tracks(
            labels_merged, next_labels, hits, print_summary=False, **merge_options)
        # Periodically remove small tracks/noise to help merge performance.
        if cleanup_every is not None and merge_count % cleanup_every == 0:
            (labels_merged, _) = merge.remove_small_tracks(labels_merged, smallest_track_size=3)
        # Print the running score so the contribution of each label set is visible.
        score_one_submission(event_id, hits, labels_merged, truth)
    return labels_merged


def merge_all_labels(all_labels, hits, truth):
    """Merge all label sets with overwrite_limit=6 and no small-track cleanup.

    Prints the score after every merge step and returns the merged labels.
    """
    return _merge_label_sequence(all_labels, hits, truth, overwrite_limit=6)


def merge_all_strong_labels(all_labels, hits, truth):
    """Merge the strong-tier label sets (overwrite_limit=6).

    Tracks smaller than 3 hits are removed after every 4th merge. Prints the
    score after every merge step and returns the merged labels.
    """
    return _merge_label_sequence(all_labels, hits, truth, cleanup_every=4,
                                 overwrite_limit=6)


def merge_all_medium_labels(all_labels, hits, truth):
    """Merge the medium-tier label sets (overwrite_limit=6, weak_tracks=True).

    Tracks smaller than 3 hits are removed after every 4th merge. Prints the
    score after every merge step and returns the merged labels.
    """
    return _merge_label_sequence(all_labels, hits, truth, cleanup_every=4,
                                 overwrite_limit=6, weak_tracks=True)


def merge_all_weak_labels(all_labels, hits, truth):
    """Merge the weak-tier label sets (overwrite_limit=3, weak_tracks=True).

    Tracks smaller than 3 hits are removed after every 4th merge. Prints the
    score after every merge step and returns the merged labels.
    """
    return _merge_label_sequence(all_labels, hits, truth, cleanup_every=4,
                                 overwrite_limit=3, weak_tracks=True)

In [None]:
# Baseline: merge every entry of `all_labels` directly, without the
# strong/medium/weak split. A score is printed after each merge step.
all_merged = merge_all_labels(all_labels, hits, truth)
# No outlier removal, order 1-9: 0.70948133
# outlier removal, 1-2,6-9,5,3,4: 0.71563907
# + r0 outlier rem: 0.71652579
# + more models(10-20):

In [9]:
# Merge the strong-tier slices of all models; a score is printed after each merge step.
strong_merged = merge_all_strong_labels(strong_labels, hits, truth)
# 0.1 cutoff: 0.47028638
# 0.15 cutoff: 0.54866460
# 0.2 cutoff: 0.59883933
# outlier rem, 0.2 cutoff: 0.60684391
# +r0 out. rem: 0.60669428
# + models 10-20: 0.62883099
# NEW: non-aggressive outlier, models 1-20: 0.62464468
# aggr. removal: 0.62883099
# over. limit 8: 0.62338910
# over. limit 3: 0.62573651
# lim 6, periodically clean <3 tracks: 0.62919876
# ++ remove 17,19: 0.62821628
# ++ remove 17,18,19,3,4: 0.62257023
# ordered 1-20: 0.63031451
# ordered 1-20, rem 3,4: 0.62557659
# --> with 3-trk cleanse: 0.62573526
# --> with 4-trk cleanse: 0.62589822
# --> with 5-trk cleanse: 0.62569532
# ordered 1-20, rem 3,4,15: 0.62405548
# remove 15,17,18,19,3,4: 0.62262955
# --> weak: 0.61891542

Score for event 1000: 0.51293564
Score for event 1000: 0.55977605
Score for event 1000: 0.57273489
Score for event 1000: 0.58420738
Score for event 1000: 0.58765860
Score for event 1000: 0.59718572
Score for event 1000: 0.60141234
Score for event 1000: 0.60528105
Score for event 1000: 0.60770404
Score for event 1000: 0.61064412
Score for event 1000: 0.61160560
Score for event 1000: 0.61303118
Score for event 1000: 0.61403313
Score for event 1000: 0.61691591
Score for event 1000: 0.61985516
Score for event 1000: 0.62612279
Score for event 1000: 0.62895606


In [None]:
# Optional post-processing of the strong merge with strt.extend_straight_tracks.
# NOTE: this overwrites `strong_merged`, so re-running the cell applies the
# extension on top of its own previous result.
#track_extension_limits = [0.02, 0.04, 0.06, 0.08, 0.10]
#sm2 = np.copy(strong_merged)
#sm2 = ext.do_all_track_extensions(sm2, hits, track_extension_limits, num_neighbours=15, use_scoring=True)
strong_merged = strt.extend_straight_tracks(strong_merged, hits)
score_one_submission(event_id, hits, strong_merged, truth)

In [10]:
# Merge the medium-tier slices of all models; a score is printed after each merge step.
# The commented lines are an earlier variant that merged only a subset of the
# medium label sets with merge_all_labels.
#m2_lbl = medium_labels[0:13]
#m2_lbl.append(medium_labels[-3])
#m2_lbl.append(medium_labels[-2])
#m2_lbl.append(medium_labels[-1])
#medium_merged = merge_all_labels(m2_lbl, hits, truth)
medium_merged = merge_all_medium_labels(medium_labels, hits, truth)
# 0.1 cutoff: 0.50709876
# 0.15 cutoff: 0.43838218
# 0.2 cutoff: 0.38796599
# outlier rem, 0.2 cutoff: 0.36414178
# +r0 out. rem: 0.36216201
# + models 10-20: 0.45574288
# + models 1,2,6,7,8,9,10,5,3,4: 0.42359094
# + models 1,2,6-15,5,3,4: 0.44986680
# + models 1-20: 0.45574288
# overwrite limit 4: 0.45255829
# overwrite limit 8: 0.45429087
# overwrite limit 6: 0.45574288
# lim 6, periodically clean <3 tracks: 0.45601010
# ++ remove 17,19: 0.44087093
# ++ remove 17,18,19,3,4: 0.44612346
# ordered: 0.46101420
# weak: 0.45405540
# ordered 1-20, rem 3,4: 0.45618824
# --> weak: 0.45612816
# ordered 1-20, rem 3,4,5, weak: 0.45192894
# remove 15,17,18,19,3,4: 0.44360005

Score for event 1000: 0.22744578
Score for event 1000: 0.28685333
Score for event 1000: 0.31105647
Score for event 1000: 0.33316696
Score for event 1000: 0.34087531
Score for event 1000: 0.38542046
Score for event 1000: 0.40943704
Score for event 1000: 0.41715433
Score for event 1000: 0.42547329
Score for event 1000: 0.43005594
Score for event 1000: 0.43347721
Score for event 1000: 0.43586714
Score for event 1000: 0.43778796
Score for event 1000: 0.44390561
Score for event 1000: 0.44616844
Score for event 1000: 0.45111056
Score for event 1000: 0.45512096


In [None]:
#track_extension_limits = [0.02, 0.04, 0.06, 0.08, 0.10]
#sm2 = np.copy(medium_merged)
# Optional post-processing of the medium merge with strt.extend_straight_tracks.
# NOTE: this overwrites `medium_merged`, so re-running the cell applies the
# extension on top of its own previous result.
medium_merged = strt.extend_straight_tracks(medium_merged, hits)
score_one_submission(event_id, hits, medium_merged, truth)
#medium_merged = ext.do_all_track_extensions(medium_merged, hits, track_extension_limits, num_neighbours=15, use_scoring=True)
# 0.44986680->0.45397096
#score_one_submission(event_id, hits, sm2, truth)
#sm2 = strt.extend_straight_tracks(sm2, hits)
# 0.45397096->0.45556476
#score_one_submission(event_id, hits, sm2, truth)

In [11]:
#w2_lbl = weak_labels[0:8]
#w2_lbl.append(weak_labels[-3])
#w2_lbl.append(weak_labels[-2])
#w2_lbl.append(weak_labels[-1])
#weak_merged = merge_all_labels(w2_lbl, hits, truth)
# Merge the weak-tier slices of all models; a score is printed after each merge step.
weak_merged = merge_all_weak_labels(weak_labels, hits, truth)
# 0.5 cutoff: 0.09220529
# 0.5 cutoff + outlier rem: 0.08541034
# + r0 out. rem: 0.08497520
# + models 10-20: 0.11515280
# + models 1,2,6,7,8,9,10,5,3,4: 0.10249580
# + models 1-20: 0.11515280
# overwrite limit 2: 0.11508325
# overwrite limit 8: 0.11279941
# overwrite limit 3, weak: 0.11976585
# lim 3, weak, periodically clean <3 tracks: 0.11968905
# ++ remove 17,19: 0.11787688
# ++ remove 17,18,19,3,4: 0.11552246
# ordered: 0.11851233
# weak: 0.12264231
# ordered 1-20, rem 3,4: 0.12094372
# ordered 1-20, rem 3,4,5, weak: 0.11752672
# remove 15,17,18,19,3,4: 0.11401350

Score for event 1000: 0.04506415
Score for event 1000: 0.06575363
Score for event 1000: 0.07006864
Score for event 1000: 0.07853070
Score for event 1000: 0.08255572
Score for event 1000: 0.09500363
Score for event 1000: 0.10233660
Score for event 1000: 0.10574703
Score for event 1000: 0.10763563
Score for event 1000: 0.10868704
Score for event 1000: 0.10904090
Score for event 1000: 0.11027754
Score for event 1000: 0.11112603
Score for event 1000: 0.11526440
Score for event 1000: 0.11686153
Score for event 1000: 0.11891198
Score for event 1000: 0.11970010


In [12]:
#labels_merged = merge.heuristic_merge_tracks(strong_merged, medium_merged, hits, overwrite_limit=3, print_summary=False)
#mm1 = np.copy(medium_merged)
#mm1 = merge.remove_outliers(mm1, hits, cells, aggressive=False, print_counts=False)
#score_one_submission(event_id, hits, mm1, truth)

#labels_merged = merge.heuristic_merge_tracks(strong_merged, mm1, hits, weak_tracks=True, overwrite_limit=3)
# Combine the strong and medium merges (strong passed first, medium merged into
# it with weak_tracks=True and overwrite_limit=3), then score the result.
labels_merged = merge.heuristic_merge_tracks(strong_merged, medium_merged, hits, weak_tracks=True, overwrite_limit=3)
score_one_submission(event_id, hits, labels_merged, truth)
# strong=0.1 cutoff: 0.7186 (overwrite=3)
# strong=0.15 cutoff: 0.7192 (overwrite=3)
# strong=0.2 cutoff: 0.7196
# outlier rem., 0.2 cutoff: 0.7246
# + r0 out. rem: 0.7260
# + models 10-20: 0.73364939
# only 10 medium models: 0.73336881
# only 15 medium models: 0.73148404
# 15 med, 10 weak: 0.73329531
# last: 0.73367862 ((2) - 0.73377616)
# models 1-20: 0.73361873
# chg overwrite: 0.73196386
# overwrite limit 8: 0.72996687
# strong 3, med 6, weak 3: 0.73041769
# ++ remove 17,19: 0.73443782
# --> + med. out. rem: 0.73407980
# ++ remove 17,18,19,3,4: 0.73393060
# --> + med. out. rem: 0.73396006
# ordered: 0.73258941
# --> out rem: 0.73207804
# weak: 0.73214341
# --> out rem: 0.73189557
# ordered 1-20, rem 3,4: 0.73250120
# --> medium-weak: 0.73288691
# --> strong-cleanse: 0.73237699
# --> strong-cleanse5: 0.73258272
# --> strong-cleanse4: 0.73267125
# ordered 1-20, rem 3,4,15: 0.73255419
# remove 15,17,18,19,3,4: 0.73534231

Score for event 1000: 0.73651231


In [13]:
#labels_merged2 = merge.heuristic_merge_tracks(labels_merged, weak_merged, hits, overwrite_limit=1, print_summary=False)

# Clean a copy of the weak merge with aggressive outlier removal and score it on its own.
wm1 = np.copy(weak_merged)
wm1 = merge.remove_outliers(wm1, hits, cells, aggressive=True, print_counts=False)
score_one_submission(event_id, hits, wm1, truth)

# Merge the cleaned weak tracks into the strong+medium result (overwrite_limit=1)
# and score the combination. The commented line is the variant without cleaning.
labels_merged2 = merge.heuristic_merge_tracks(labels_merged, wm1, hits, weak_tracks=True, overwrite_limit=1)
#labels_merged2 = merge.heuristic_merge_tracks(labels_merged, weak_merged, hits, weak_tracks=True, overwrite_limit=1)
score_one_submission(event_id, hits, labels_merged2, truth)
# strong=0.1 cutoff: 0.7228 (overwrite=1)
# strong=0.15 cutoff: 0.7237 (overwrite=1)
# strong=0.2 cutoff: 0.7241 (overwrite=1)
# outlier rem. strong=0.2: 0.7293
# +r0 out. rem: 0.7311
# +models 10-14: 0.735?
# + models 10-20: 0.73648855
# only 10 weak models: 0.73707780
# only 15 medium models: 0.73500946
# 15 med, 10 weak: 0.73671680
# last: 0.73678249
# new weak merge: 0.73742467, 0.73759009 (4), 0.73786259 (3)
# models 1-20: 0.73793692
# chg overwrite: 0.73704419
# overwrite limit 8: 0.73361786
# str 3, med 6, weak 3: 0.73516573
# ++ remove 17,19: 0.73852283
# --> + weak aggr. out. rem: 0.73800277
# ++ remove 17,18,19,3,4: 0.73831033
# --> + med. out. rem: 0.73851074
# ordered: 0.73730571
# --> + out. rem: 0.73672076
# weak: 0.73665424
# --> out rem: 0.73640193
# ordered 1-20, rem 3,4: 0.73745883
# --> out rem: 0.73743842
# --> med-weak: 0.73756486
# --> strong-cleanse: 0.73713546
# --> strong-cleanse5: 0.73726067
# --> strong-cleanse4: 0.73710238
# ordered 1-20, rem 3,4,15: 0.73693037

Score for event 1000: 0.11790143
Score for event 1000: 0.74107931


In [14]:
# Final post-processing of the three-tier merge: strt.extend_straight_tracks,
# then free.assign_free_hits. The score is printed after each of the two steps.
labels = strt.extend_straight_tracks(labels_merged2, hits)
score_one_submission(event_id, hits, labels, truth)
labels = free.assign_free_hits(labels, hits)
score_one_submission(event_id, hits, labels, truth)
# scores: 0.73736782, 0.73874326
# only 15 medium models: 0.73845154
# 15 med, 10 weak: 0.73724001, 0.73845154
# last: 0.73673186, 0.73799254
# new weak merge: 0.73737404, 0.73856768
# models 1-20: 0.73847834, 0.73981423
# chg overwrite: 0.73763503, 0.73873021
# overwrite limit 8: 0.73436074, 0.73554492
# str 3, med 6, weak 3: 0.73581388, 0.73718725
# ++ remove 17,19: 0.73914233, 0.74031220
# --> + med/weak out rem: 0.73861231, 0.73985150
# ++ remove 17,18,19,3,4: 0.73915103, 0.74047960
# --> + med. out. rem: 0.73934148, 0.74065658
# end score: 0.74030116
# ordered: 0.73786483, 0.73927505
# --> + out. rem.: 0.73730187, 0.73858468
# weak: 0.73770705, 0.73915284
# --> out rem: 0.73752564, 0.73913394
# ordered 1-20, rem 3,4: 0.73809852, 0.73946622
# --> out rem: 0.73810009, 0.73950646
# --> med-weak: 0.73840189, 0.73963008
# --> strong-cleanse: 0.73797404, 0.73912490
# --> strong-cleanse5: 0.73809770, 0.73925513
# --> strong-cleanse4: 0.73793941, 0.73908986
# ordered 1-20, rem 3,4,15: 0.73780211, 0.73895136

Score for event 1000: 0.74183549
Score for event 1000: 0.74334149


In [None]:
# Experiment: score model 6 before and after the notebook-local remove_outliers2.
# NOTE(review): remove_outliers2 is defined in a later cell of this notebook, so
# this cell raises NameError on a fresh kernel unless that cell has been run first.
labels_helix6f = remove_outliers2(labels_helix6, hits, cells, print_counts=True)
score_one_submission(event_id, hits, labels_helix6, truth)
score_one_submission(event_id, hits, labels_helix6f, truth)

In [None]:
def heuristic_merge_weak_tracks(labels1, labels2, hits, overwrite_limit=4, print_summary=False):
    """ Merge tracks from two arrays of track labels.

    Merges are handled as follows:
     - tracks from labels2 are identified and searched (tracks with fewer than
       2 hits are ignored)
     - for each track from labels2:
       - use track directly if no conflict with any tracks from labels1
       - skip if labels1 already contains the same track of equal (or longer) length
       - otherwise, if there are potentially multiple conflicting tracks from labels1
         - decide whether the overlapping hits should extend an existing labels1
           track or start a new track (using the labels2 track ID), based on how
           much the tracks overlap
         - if labels1 only contains a single non-trivial track ID, as well as
           un-classified (0) hits, re-assign '0' hits to the chosen track ID
         - otherwise, labels1 contains multiple non-trivial track IDs
           - replace any track ID 0 occurrences with the chosen track ID
           - replace hits of small labels1 tracks (at most overwrite_limit hits in
             total) with the chosen track ID, except hits that are z/r outliers
             of the labels2 track

    Neither input array is modified. labels1 is copied and renumbered, and the
    labels2 track IDs are shifted above the largest labels1 ID on a private
    copy so that IDs from the two inputs cannot collide.

    Parameters:
     - labels1: np array of labels, each entry represents a hit, the value represents the
       track ID that hit is assigned to. This should be considered the 'higher-quality' of
       the two input labels
     - labels2: np array of secondary labels, whose tracks should be merged into labels1
     - hits: hits DataFrame, passed through to zro.find_track_outliers_zr
     - overwrite_limit: labels1 tracks with at most this many hits may have their
       overlapping hits re-assigned
     - print_summary: if True, print the counters collected for each merge rule

    Returns: The merged array of labeled tracks.
    """
    labels_merged = np.copy(labels1)
    labels_merged = merge.renumber_labels(labels_merged)
    max_track = np.amax(labels_merged)
    # Shift the labels2 IDs on a copy. Doing this in place would change the
    # caller's array, so calling the function again with the same labels2 would
    # shift the IDs a second time.
    labels2 = np.copy(labels2)
    labels2[labels2 != 0] = labels2[labels2 != 0] + max_track
    trks2 = np.unique(labels2)
    # Per-rule counters, only reported when print_summary is set.
    # count11 and count12 are never incremented; they are kept so the summary
    # output stays unchanged.
    count1 = 0
    count2 = 0
    count3 = 0
    count4 = 0
    count4_len = []
    count5 = 0
    count6 = 0
    count7 = 0
    count8 = 0
    count9 = 0
    count10 = 0
    count11 = 0
    count12 = 0
    count13 = 0
    count14 = 0
    count15 = 0
    count16 = 0
    for trk2 in trks2:
        if trk2 == 0:
            continue
        trk2_ix = np.where(labels2 == trk2)[0]
        trk2_length = len(trk2_ix)
        if trk2_length < 2:
            continue
        trk1_val = labels_merged[trk2_ix]
        #print('trk2: ' + str(trk2) + ', label1: ' + str(trk1_val))
        trk1_uniq = np.unique(trk1_val)
        # Now we know which tracks from the 1st label overlap with the tracks from the 2nd label
        if len(trk1_uniq) == 1:
            if trk1_uniq[0] == 0:
                #print('Good candidate to replace!')
                # This track was not found by labels1, just directly use the
                # track from labels2.
                count1 = count1 + 1
                labels_merged[trk2_ix] = trk2
            else:
                # We found a track that is at least as long as the current
                # track in labels1. Nothing more needed, at least for now.
                # We could consider scenarios where the labels1 track contains
                # hits from 2 different tracks, where labels2 only has a
                # shorter single track. In this case, it may be good to split
                # the labels1 track into two pieces. However, this condition
                # would be very hard to detect, for now we want to favour
                # longer tracks whenever possible.
                #print('Same track found, skipping...')
                count2 = count2 + 1
        else:
            found_tracks = 0
            # Get counts for all identified tracks from labels1 that match trk2.
            # At least two distinct IDs exist here, so index 1 is always valid.
            trk1_counts = coll.Counter(trk1_val).most_common(len(trk1_uniq))
            longest_track_id = trk1_counts[0][0]
            longest_track_count = trk1_counts[0][1]
            second_track_id = trk1_counts[1][0]
            second_track_count = trk1_counts[1][1]
            # If longest track in labels1 was 0, create a new track, but only
            # from free hits, or from small tracks. Also, if there is not
            # enough overlap (less than half the hits overlap), also create
            # a new track.
            if longest_track_id == 0:
                count5 = count5 + 1
                longest_track_id = trk2
                # See if we should instead lengthen the longest non-zero track
                if len(trk1_uniq) == 2:
                    test_track_ix = np.where(labels_merged == second_track_id)[0]
                    if len(test_track_ix) <= (second_track_count + 3):
                        # Only extend the existing track if doing so does not
                        # produce more z/r outliers than it had before.
                        outliers1 = zro.find_track_outliers_zr(second_track_id, labels_merged, hits, find_all=True)
                        labelx = np.copy(labels_merged)
                        labelx[trk2_ix] = second_track_id
                        outliers2 = zro.find_track_outliers_zr(second_track_id, labelx, hits, find_all=True)
                        if len(outliers2) <= len(outliers1):
                            longest_track_id = second_track_id
                            longest_track_count = second_track_count
            elif (trk2_length > 20) or (longest_track_count > 20):
                count9 = count9 + 1
                longest_track_id = trk2
            elif (trk2_length > 6) and (longest_track_count < int(trk2_length/2)) and second_track_id != 0:
                # Try to avoid creating crossed tracks, do not lengthen existing track if not
                # enough overlap.
                count10 = count10 + 1
                trk1a = np.where(labels_merged == longest_track_id)[0]
                if longest_track_count + 3 >= len(trk1a):
                    # Lengthen the longest track, it's fully contained by our new/proposed track.
                    # Reset 2nd longest track if mostly contained in new/proposed track.
                    count14 = count14 + 1
                    trk2a = np.where(labels_merged == second_track_id)[0]
                    if second_track_count + 1 >= len(trk2a):
                        count15 = count15 + 1
                        labels_merged[trk2a] = longest_track_id
                else:
                    # Not much overlap, start a new track to avoid hurting existing tracks.
                    longest_track_id = trk2
            else:
                # If the old track had too many hits not part of the new/proposed track, do
                # not lengthen it - that may lose majority. Better to start a new track.
                trk1a = np.where(labels_merged == longest_track_id)[0]
                if longest_track_count + 3 < len(trk1a): # change from 6
                    count16 = count16 + 1
                    longest_track_id = trk2

            # Count the overlapping labels1 tracks that have more than one hit in total.
            for trk1 in trk1_uniq:
                if trk1 == 0:
                    continue
                trk1_ix = np.where(labels_merged == trk1)[0]
                if len(trk1_ix) > 1:
                    found_tracks = found_tracks + 1
            if found_tracks > 1:
                #print('Found ' + str(found_tracks) + ' non-trivial tracks.')
                count3 = count3 + 1
                # If there are un-classified hits, assign those to the chosen
                # track ID.
                for label_ix in trk2_ix:
                    if labels_merged[label_ix] == 0:
                        labels_merged[label_ix] = longest_track_id
                        count6 = count6 + 1

                # If an overlapping labels1 track has at most overwrite_limit
                # hits in total, re-assign its hits that belong to the labels2
                # track to the chosen track ID, unless the hit is a z/r outlier
                # of the labels2 track.
                for trk1_count in trk1_counts:
                    trk1_count_ix = np.where(labels_merged == trk1_count[0])[0]
                    if len(trk1_count_ix) <= overwrite_limit:
                        outliers = zro.find_track_outliers_zr(trk2, labels2, hits, find_all=True)
                        for label_ix in trk2_ix:
                            if labels_merged[label_ix] == trk1_count[0] and label_ix in outliers:
                                count13 = count13 + 1
                            elif labels_merged[label_ix] == trk1_count[0]:# and label_ix not in outliers:
                                labels_merged[label_ix] = longest_track_id
                                count7 = count7 + 1
                    #else:
                    #    outliers = zro.find_track_outliers_zr(trk1_count[0], labels_merged, hits, find_all=True)
                    #    for label_ix in trk2_ix:
                    #        if labels_merged[label_ix] == trk1_count[0] and label_ix in outliers:
                    #            labels_merged[label_ix] = longest_track_id
                    #            count12 = count12 + 1

            else:
                # Only the track ID, as well as track ID 0, were found in labels1.
                # Replace any occurrences of ID 0 with the chosen track ID.
                count4 = count4 + 1
                count4_len.append(len(trk2_ix))
                # If there are un-classified hits, assign those to the chosen
                # track ID (lengthens the track).
                for label_ix in trk2_ix:
                    if labels_merged[label_ix] == 0:
                        labels_merged[label_ix] = longest_track_id
                        count8 = count8 + 1

    if print_summary:
        print('Simple replacement of unclassified hits: ' + str(count1))
        print('Similar tracks (no-op): ' + str(count2))
        print('New track creations from little overlap(0): ' + str(count5))
        print('New track creations from huge tracks(>20): ' + str(count9))
        print('Test for new track creations from little overlap(non-0): ' + str(count10))
        print('--> Lengthen longest overlap instead: ' + str(count14))
        print('  --> And clear 2nd longest track: ' + str(count15))
        print('Skip extension due to too little overlap: ' + str(count16))
        print('Multiple non-trivial tracks: ' + str(count3))
        print('--> of which partial track ID 0 hits were updated: ' + str(count6))
        print('--> of which partial track ID non-0 hits were updated: ' + str(count7))
        print('--> of which partial track ID non-0 hits were skipped: ' + str(count13))
        print('--> of which outliers were overwritten: ' + str(count12))
        print('Tracks to be lengthened: ' + str(count4))
        print('--> of which track ID 0 hits were updated: ' + str(count8))
        print('--> from which new tracks were created instead: ' + str(count11))
        noises = np.unique(np.asarray(count4_len))
        print('--> of which labels2 unique track lengths were: ' + str(noises))

    return labels_merged


In [None]:
def remove_track_outliers2(track, labels, hits, cells, aggressive):
    """Reset the hits of one track that the outlier checks flag to track ID 0.

    Works on a copy of ``labels``; the array passed in is left untouched.
    Two checks are active and run in this order, each one seeing the result
    of the previous one:

    1. ``zro.find_track_outliers_zr``
    2. ``merge.find_duplicate_z_using_zr``

    A third check, ``merge.remove_track_outliers_slope``, is present but
    switched off.

    Args:
        track: ID of the track to inspect.
        labels: Array of per-hit track IDs.
        hits: Hits data, forwarded unchanged to the helper checks.
        cells: Not used by this function.
        aggressive: Not used by this function.

    Returns:
        Tuple ``(labels, bad_volume, bad_dimension, bad_z, bad_slope,
        bad_zr, bad_cell)``: the updated copy of the labels followed by the
        number of hits flagged per category. The volume, dimension and cell
        counts are always 0, and the slope count is 0 while that check is
        disabled.
    """
    cleaned = np.copy(labels)

    def _unassign(hit_indices):
        # Count the flagged hits, then move each of them to track ID 0.
        flagged = len(hit_indices)
        for hit_ix in hit_indices:
            cleaned[hit_ix] = 0
        return flagged

    # z/r outliers are cleared first so the duplicate-z check below already
    # works on the reduced track.
    n_bad_zr = _unassign(zro.find_track_outliers_zr(track, cleaned, hits))

    # Check if the sorted hits (on z-axis) go through the volumes
    # and layers in the expected order.
    n_bad_z = _unassign(merge.find_duplicate_z_using_zr(track, cleaned, hits))

    # Helix-slope check: discards hits that do not match. Currently disabled.
    slope_check_enabled = False
    n_bad_slope = 0
    if slope_check_enabled:
        n_bad_slope = _unassign(merge.remove_track_outliers_slope(track, cleaned, hits))

    # No check in this function contributes to these categories.
    n_bad_volume = 0
    n_bad_dimension = 0
    n_bad_cell = 0

    return (cleaned, n_bad_volume, n_bad_dimension, n_bad_z, n_bad_slope, n_bad_zr, n_bad_cell)


def remove_outliers2(labels, hits, cells, smallest_track_size=2, aggressive=False, print_counts=True):
    """Run the per-track outlier removal over every track, then drop small tracks.

    Side effect: the columns ``z_abs``, ``r``, ``a0`` and ``zr`` are written
    onto the ``hits`` DataFrame that was passed in (it is not copied).

    Args:
        labels: Array of per-hit track IDs; ID 0 is never processed as a track.
        hits: DataFrame with at least ``x``, ``y`` and ``z`` columns.
        cells: Forwarded unchanged to ``remove_track_outliers2``.
        smallest_track_size: Forwarded to ``merge.remove_small_tracks``.
        aggressive: Forwarded unchanged to ``remove_track_outliers2``.
        print_counts: When true, print a summary of the removal counters.

    Returns:
        The labels as returned by ``merge.remove_small_tracks``.
    """
    track_ids = np.unique(labels)

    # Derived geometry columns, stored on the caller's DataFrame.
    hits['z_abs'] = hits.z.abs()
    hits['r'] = np.sqrt(hits.x**2+hits.y**2)
    hits['a0'] = np.arctan2(hits.y,hits.x)
    hits['zr'] = hits['z'] / hits['r']

    total_volume = 0
    total_dimension = 0
    total_duplicate_z = 0
    total_slope = 0
    total_zr = 0
    total_cell = 0

    for track_id in track_ids:
        # Skip the unassigned bucket and any track with 3 hits or fewer;
        # the size is re-evaluated against the labels as updated so far.
        if track_id == 0 or len(np.where(labels == track_id)[0]) <= 3:
            continue

        (labels, n_volume, n_dimension, n_dup_z, n_slope, n_zr, n_cell) = remove_track_outliers2(
            track_id, labels, hits, cells, aggressive)
        total_volume += n_volume
        total_dimension += n_dimension
        total_duplicate_z += n_dup_z
        total_slope += n_slope
        total_zr += n_zr
        total_cell += n_cell

    # Remove small tracks, we do not get any score for those. This is done
    # last, in case removing the outliers (above) removed enough hits
    # from a track to make them smaller than the threshold.
    (labels, total_small_tracks) = merge.remove_small_tracks(labels, smallest_track_size=smallest_track_size)

    if print_counts:
        summary = (
            ('Total removed due to bad cells: ', total_cell),
            ('Total removed due to bad volumes: ', total_volume),
            ('Total removed due to bad zr values: ', total_zr),
            ('Total removed due to bad dimensions: ', total_dimension),
            ('Total removed due to duplicate zs: ', total_duplicate_z),
            ('Total removed due to bad slopes: ', total_slope),
            ('Total removed small tracks (<' + str(smallest_track_size) + ') hits: ', total_small_tracks),
        )
        for caption, amount in summary:
            print(caption + str(amount))

    return labels