In [3]:
import numpy as np
import pywt
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import os
import time
import cv2
from sklearn.ensemble import IsolationForest

from sklearn.manifold import TSNE
from scipy.interpolate import griddata
from mpl_toolkits.axes_grid1 import make_axes_locatable

from astronomaly.utils.utils import ImageCycler
from astronomaly.feature_extraction.power_spectrum import psd_2d
from astronomaly.data_management.image_reader import ImageThumbnailsDataset
from astronomaly.preprocessing.image_preprocessing import image_transform_scale
from astronomaly.postprocessing import scaling
from astronomaly.anomaly_detection.human_loop_learning import NeighbourScore, ScoreConverter
from astronomaly.anomaly_detection import isolation_forest, lof
from astronomaly.feature_extraction import shape_features
from astronomaly.dimensionality_reduction import pca

%matplotlib inline

In [6]:
presentation_plots = False

latex_dpi = 72.27
textwidth = 469.75502/latex_dpi
textheight = 650.43001/latex_dpi

figsize_square = (textwidth/2, textwidth/2)
figsize_fullwidth = (textwidth, textwidth/2)
figsize_panels = (textwidth, 0.75*textheight/3)


plt.rcParams.update({'font.family':'serif', 'font.size':10, 'figure.dpi':latex_dpi, 'text.usetex':True,
                    'savefig.format':'pdf'})
figure_suffix = '.pdf'

fig_dir = '../paper/figures/'

if presentation_plots:
    fig_dir = '../presentation_figures/'
    plt.rcParams.update({'font.family':'sans-serif', 'font.size':10, 'figure.dpi':300, 'text.usetex':True,
                    'savefig.format':'png'})
    presentation_plot_size = (textwidth, 2*textwidth/3)
    figure_suffix = '.png'

In [7]:
def rank_weighted_score(y_true, scores, N):
    w_i = (N + 1 - np.arange(1, N+1))
    S_0 = N/2*(N+1)
    
    inds = np.argsort(scores)[::-1]
#     scores = scores[inds][:N]
    y_true = y_true[inds][:N]
#     print(y_true, scores)
    
    I = y_true
    
    return np.sum(I*w_i)/S_0

In [8]:
def make_rws_curve(labels, anomalies, scores, plot=True, N_vals=[]):
    
#     if plot:
#         plt.figure()
    
    clean_labels = labels.loc[anomalies.index, 'label'].values
    print(clean_labels.shape)
    print(scores.shape)
    if len(N_vals) == 0:
        N_vals = np.arange(10, 510, 10)
    rws = []
    for N in N_vals:
        rws.append(rank_weighted_score(clean_labels, scores, N))

    rws_150 = rank_weighted_score(clean_labels, scores, 150)

    if plot:
#         plt.plot(N_vals, rws)
        return N_vals, rws
    return rws_150

In [9]:
def get_rws(labels, anomalies, scores, N):
    clean_labels = labels.loc[anomalies.index, 'label'].values
    return rank_weighted_score(clean_labels, scores, N)

In [10]:
def compute_ind_sum(found_inds, all_inds):
    this_found_inds = found_inds.copy()
    this_found_inds.sort()
    out = np.zeros(len(all_inds))
    for i in this_found_inds:
        out[i:] += 1
    return out

In [11]:
def cumulative_sum(anomalies, sort_by='score'):
    sorted_inds = anomalies.sort_values(sort_by, ascending=False).index
    labs = labels.loc[anomalies.index]
    anom_inds = labs[labs['label']==1].index

    found_inds= []

    for i in anom_inds:
        found_inds.append(np.where(sorted_inds==i)[0][0])
            
    ind_sum = compute_ind_sum(found_inds, sorted_inds)
    
    return ind_sum

## Switch to Scattering features

In [13]:
data_dir = os.path.join('/Users/mrgr/Documents/GitHub/KiDS_astronomaly/example_data/')
image_dir = os.path.join(data_dir, 'KiDS_cutouts', 'ugri_images')

# Where output should be stored
out_dir = os.path.join(
    data_dir, 'astronomaly_output', 'kids_ugri', '')

data_dir = image_dir

scale the features with feature scaler

In [41]:
# from sklearn.preprocessing import StandardScaler
# scl = StandardScaler()
# output = scl.fit_transform(features)
# features = pd.DataFrame(data=output, index=features.index, 
#                             columns=features.columns)

get features with Iforest

In [18]:
pipeline_anomaly = isolation_forest.IforestAlgorithm(output_dir = out_dir, force_rerun=True)
anomalies = pipeline_anomaly.run(features)


# pipeline_score_converter = ScoreConverter(force_rerun=True, output_dir = out_dir)
# anomalies = pipeline_score_converter.run(anomalies)
# anomalies = anomalies.sort_values('score', ascending=False)

Running IforestAlgorithm ...
Done! Time taken: 1.1689479351043701 s


In [19]:
anomalies

Unnamed: 0,score
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024015.245-284053.94,0.082015
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024015.289-284106.07,0.049280
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024015.325-283927.71,0.095958
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024016.534-284002.35,0.073862
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024017.408-283943.43,0.072461
...,...
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024442.804-283959.75,0.089504
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024443.486-284010.61,0.096462
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024444.198-284032.84,-0.065023
tile_KIDS_40.6_-28.2_ID_KiDSDR4 J024445.092-284001.48,0.067909


load features

In [14]:
features = pd.read_parquet(out_dir + 'FeatureScaler_output.parquet')
anomalies = pd.read_parquet(out_dir + 'ScoreConverter_output.parquet') 

In [17]:
#here i import the dataframe with the labels
df = pd.read_csv('/home/michelle/BigData/Anomaly/GalaxyZoo/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.csv')
df=df.set_index('GalaxyID')
df.index = df.index.astype('str')

FileNotFoundError: [Errno 2] No such file or directory: '/home/michelle/BigData/Anomaly/GalaxyZoo/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.csv'

In [14]:
kaggle_key = pd.read_csv('/home/michelle/BigData/Anomaly/GalaxyZoo/KAGGLE_allgals_randomgalaxyid.csv', index_col=0)
kaggle_key.index = kaggle_key.index.astype('str')

In [16]:
features.columns

Index(['feat0', 'feat1', 'feat2', 'feat3', 'feat4', 'feat5', 'feat6', 'feat7',
       'feat8', 'feat9',
       ...
       'feat1270', 'feat1271', 'feat1272', 'feat1273', 'feat1274', 'feat1275',
       'feat1276', 'feat1277', 'feat1278', 'feat1279'],
      dtype='object', length=1280)

In [16]:
labels = pd.DataFrame(data=(df['Class6.1']>0.9).astype('int'))
labels.columns = ['label']
#labels['human_label'] = np.round(df['Class6.1']*5).astype('int')

In [17]:
len(labels)

61578

In [18]:
## ONLY label a set once
anomalies['human_label'] = [-1]*len(anomalies)
inds = anomalies.sort_values('score', ascending=False).index[:200]
anomalies.loc[inds, 'human_label'] = labels.loc[inds, 'human_label']

ns = NeighbourScore(alpha=0.1, force_rerun=True, output_dir = out_dir)
features_with_labels = ns.combine_data_frames(features, anomalies)
final_score = ns.run(features_with_labels)
anomalies['final_score'] = final_score.trained_score

Running NeighbourScore ...
Done! Time taken: 2.541348695755005 s


In [35]:
# other_df = anomalies[anomalies.human_label==-1]
# kaggle_key.loc[other_df.sort_values('final_score', ascending=False).index].head(20)

In [19]:
# msk = [True]*len(anomalies)
msk = anomalies.human_label==-1
ind_sum_ml = cumulative_sum(anomalies[msk], sort_by='score')
ind_sum_hitl = cumulative_sum(anomalies[msk], sort_by='final_score')

In [20]:
ind_sum_ml[100]

4.0

In [21]:
cols = plt.rcParams['axes.prop_cycle'].by_key()['color']
if presentation_plots:
    figsize = presentation_plot_size
else:
    figsize = figsize_square
    
plt.figure(figsize=figsize)
x = np.arange(len(anomalies[msk]))
# plt.plot(x,x,'k', alpha=0.5)

plt.plot(x, ind_sum_hitl, '-', color=cols[0], label='Active learning')
plt.plot(x, ind_sum_ml, '-', color=cols[1], label='No active learning')
rand_inds = np.random.choice(x, replace=False, size=labels.label.sum())
plt.plot(x, compute_ind_sum(rand_inds, x), '-k', alpha=0.2, label='Random')


plt.legend(loc='lower right')
plt.xlim([0,2000])
plt.ylim([0,300])
plt.xlabel('Index in ranked list')
plt.ylabel('Number of anomalies detected')

plt.tight_layout()


plt.savefig(fig_dir + 'galaxy_cumulative' + figure_suffix)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [24]:
# plt.figure()
# plt.plot(x, ind_sum_hitl/ind_sum_ml, '-', color=cols[0])
# plt.xlim([0,2000])
# plt.ylabel('Ratio active learning to machine learning')
# plt.xlabel('Index in ranked list')

# plt.tight_layout()

# plt.savefig(fig_dir + 'galaxy_ratio_with_labels')

# Compare RWS

In [49]:
# Before any active learning
scores = anomalies.score.values
N_vals = np.arange(10, 510, 10)
N_vals, rws_before = make_rws_curve(labels, anomalies, scores, plot=True, N_vals=N_vals)
N_vals, rws_after = make_rws_curve(labels, anomalies, anomalies.final_score, plot=True, N_vals=N_vals)
plt.figure(figsize=figsize_square)
plt.plot(N_vals, rws_after, label='Active learning')
plt.plot(N_vals, rws_before, label='No active learning')
plt.xlim([10,500])
plt.ylim([0,1.05])
plt.xlabel('N')
plt.ylabel('Rank Weighted Score')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig(fig_dir+'galaxy_rws')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [52]:
image_dataset = ImageThumbnailsDataset(directory=data_dir, transform_function=image_transform_scale, 
                                       output_dir=out_dir)

## Random examples

In [53]:
N=12
nrows = 2
ncols = 6

examples_figure_size = figsize_panels

if presentation_plots:
    nrows = 3
    ncols = 4
    examples_figure_size = presentation_plot_size

left = 0.005
right = 0.99
bottom = 0.05
top = 0.95

In [54]:
fig = plt.figure(figsize=examples_figure_size)

inds = np.random.choice(features.index, size=N, replace=False)
for i in range(N):
    plt.subplot(nrows, ncols, i+1)
    plt.imshow(image_dataset.get_sample(inds[i]))
    plt.gca().axis('off')
plt.tight_layout()
plt.subplots_adjust(left=left, right=right, bottom=bottom, top=top)

plt.savefig(fig_dir + 'galaxy_examples_0')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Before HITL

In [55]:
plt.close('all')
fig = plt.figure(figsize=examples_figure_size)

wd = 20 # Thickness of box line in pixels
col = [1,0,0]

inds = anomalies.sort_values('score', ascending=False).index[:N]
for i in range(N):
    plt.subplot(nrows, ncols, i+1)
    img = image_dataset.get_sample(inds[i])
    if i in [1,4,6]:
        img[0:wd, :] = col
        img[img.shape[0]-wd:img.shape[0], :] = col
        img[:, 0:wd] = col
        img[:, img.shape[1]-wd:img.shape[1]] = col
    plt.imshow(img)
    plt.gca().axis('off')
plt.tight_layout()
plt.subplots_adjust(left=left, right=right, bottom=bottom, top=top)

plt.savefig(fig_dir + 'galaxy_examples_1')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## After HITL

In [56]:
plt.figure(figsize=examples_figure_size)

anomalies = anomalies.sort_values('final_score', ascending=False)
#inds = anomalies.loc[anomalies.human_label == -1].index[:N]
inds = anomalies.index[:N]
for i in range(N):
    plt.subplot(nrows, ncols, i+1)
    plt.imshow(image_dataset.get_sample(inds[i]))
    plt.gca().axis('off')
plt.tight_layout()
plt.subplots_adjust(left=left, right=right, bottom=bottom, top=top)

plt.savefig(fig_dir + 'galaxy_examples_2')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# TSNE anomaly score plot

In [31]:
inds = anomalies.sort_values('score', ascending=False).index[:2000]

ts = TSNE(perplexity=30)
ts.fit(features.loc[inds])

TSNE(angle=0.5, early_exaggeration=12.0, init='random', learning_rate=200.0,
     method='barnes_hut', metric='euclidean', min_grad_norm=1e-07,
     n_components=2, n_iter=1000, n_iter_without_progress=300, n_jobs=None,
     perplexity=30, random_state=None, verbose=0)

In [32]:
X = ts.embedding_
X[:,0] = (X[:,0] - X[:,0].min())/(X[:,0].max()-X[:,0].min())
X[:,1] = (X[:,1] - X[:,1].min())/(X[:,1].max()-X[:,1].min())

In [33]:
x = np.linspace(X[:,0].min(), X[:,0].max(), 200)
y = np.linspace(X[:,1].min(), X[:,1].max(), 200)
xgrid, ygrid = np.meshgrid(x, y)
zgrid = griddata(X, anomalies.loc[inds,'score'].values, (xgrid, ygrid), method='nearest')
zgrid2 = griddata(X, anomalies.loc[inds,'final_score'].values, (xgrid, ygrid), method='nearest')

In [34]:
plt.close('all')

In [35]:
fig = plt.figure(figsize=figsize_fullwidth)

ax1 = plt.subplot(1,2,1)
im1=ax1.imshow(zgrid, origin='lower', extent=(x.min(), x.max(), y.min(),y.max()), cmap='magma')
# plt.scatter(X[:,0], X[:,1], c=anomalies.loc[inds,'score'], s=10, cmap='magma')



for lab in np.unique(anomalies.human_label)[1:]:
    msk = np.where(anomalies.loc[inds, 'human_label'] == lab)
#     plt.scatter(X[msk,0], X[msk,1], facecolors='none', edgecolors=cols[lab], s=15, marker='s',alpha=0.5)
    plt.scatter(X[msk,0], X[msk,1], c='k', s=25, marker='$%d$' %lab, linewidths=0.5)
    plt.scatter(X[msk,0], X[msk,1], c='w', s=20, marker='$%d$' %lab, linewidths=0.5)

plt.xlabel('Raw anomaly score')

divider = make_axes_locatable(ax1)
cax = divider.append_axes("right", size="5%", pad=0.1)
plt.tight_layout()

fig.colorbar(im1, cax=cax)

ax2 = plt.subplot(1,2,2)
im2=ax2.imshow(zgrid2, origin='lower', cmap='magma', extent=(x.min(), x.max(), y.min(),y.max()))

# plt.scatter(X[:,0], X[:,1], c=anomalies.loc[inds,'score'], s=10, cmap='magma')


for lab in np.unique(anomalies.human_label)[1:]:
    msk = np.where(anomalies.loc[inds, 'human_label'] == lab)
    plt.scatter(X[msk,0], X[msk,1], c='k', s=25, marker='$%d$' %lab, linewidths=0.5)
    plt.scatter(X[msk,0], X[msk,1], c='w', s=20, marker='$%d$' %lab, linewidths=0.5)
    
plt.xlabel('Trained anomaly score')



divider = make_axes_locatable(ax2)
cax = divider.append_axes("right", size="5%", pad=0.1)

fig.colorbar(im2, cax=cax)

plt.subplots_adjust(wspace=10, top=1)

plt.tight_layout()

plt.savefig(fig_dir+'galaxy_feature_space')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [36]:
# xvals = np.arange(0, N_outliers, n_recompute)+n_recompute
# perc_no_hitl = []
# for x in xvals:
#     n = labels.loc[anomalies.sort_values('score', ascending=False).index[:x]].label.sum()
#     perc_no_hitl.append(n/labels.label.sum())

In [37]:
# plt.figure()
# plt.plot(np.arange(0, N_outliers, n_recompute)+n_recompute, np.array(perc)*labels.label.sum())
# plt.plot(xvals, np.array(perc_no_hitl)*labels.label.sum())
# plt.legend(('HITL', 'ML'))
# plt.xlabel('Number of Labelled Examples')
# plt.ylabel('Cumulative Total')

In [38]:
# plt.figure()
# plt.plot(np.arange(0, N_outliers, n_recompute)+n_recompute, RWS)
# plt.legend()
# plt.xlabel('Number of Labelled Examples')
# plt.ylabel('Rank Weighted Score N=150')

In [39]:
# Before any active learning
# scores = anomalies.score.values
# msk = anomalies.human_label==-1
# N_vals, rws_before = make_rws_curve(labels.loc[msk], anomalies.loc[msk], anomalies.loc[msk, 'score'], plot=True)
# N_vals, rws_after = make_rws_curve(labels.loc[msk], anomalies.loc[msk], anomalies.loc[msk].final_score, plot=True)

# plt.figure()
# plt.plot([100, 100], [0,0.4], '--k', alpha=0.5)
# plt.plot(N_vals, rws_before, label='Without HITL')
# plt.plot(N_vals, rws_after, label='With HITL')


# plt.legend()
# plt.xlim([0,250])
# plt.ylim([0,1])

## Demonstrating features

In [45]:
from astronomaly.preprocessing import image_preprocessing

In [131]:
f_spiral = '949415'
f_elliptical = '824608'
f_merger = '516751'
f_skinny = '565692'
f_star = '732929'

In [129]:
plt.close('all')
plt.figure(figsize=(1,1))
plt.imshow(image_dataset.get_sample(anomalies.index[13]))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.image.AxesImage at 0x7fa18f422df0>

In [132]:
def plot_image(img, flname, figsize):
    plt.figure(figsize=figsize)
    plt.imshow(img)
    plt.gca().axis('off')
    plt.tight_layout()
    plt.savefig(fig_dir+flname)

In [133]:
gal_type = 'star'
if gal_type == 'normal':
    ind = f_spiral
elif gal_type == 'star':
    ind = f_star
else:
    ind = f_merger
    
img = image_dataset.get_sample(ind)

plot_image(img, 'galaxy_'+gal_type+'_1', figsize_square)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [134]:
img_clipped = image_preprocessing.image_transform_sigma_clipping(img)
plot_image(img_clipped, 'galaxy_'+gal_type+'_2', figsize_square)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [135]:
percentiles=[90, 80, 70, 60, 50, 0]
chosen_percentiles = [90, 50, 0]
this_image = img_clipped.copy()[:,:,0]
image_contours = img_clipped.copy()

contours_dict = {}

x0 = y0 = -1
x_cent = this_image.shape[0] // 2
y_cent = this_image.shape[1] // 2

for p in percentiles:
    thresh = np.percentile(this_image[this_image > 0], p)
    contours, hierarchy = shape_features.find_contours(this_image, thresh)

    x_contours = np.zeros(len(contours))
    y_contours = np.zeros(len(contours))

    # First attempt to find the central point of the inner most contour
    if len(contours) != 0:
        for k in range(len(contours)):
            M = cv2.moments(contours[k])
            try:
                x_contours[k] = int(M["m10"] / M["m00"])
                y_contours[k] = int(M["m01"] / M["m00"])
            except ZeroDivisionError:
                pass
        if x0 == -1:
            x_diff = x_contours - x_cent
            y_diff = y_contours - y_cent
        else:
            x_diff = x_contours - x0
            y_diff = y_contours - y0

        # Will try to find the CLOSEST contour to the central one
        r_diff = np.sqrt(x_diff**2 + y_diff**2)

        ind = np.argmin(r_diff)

        if x0 == -1:
            x0 = x_contours[ind]
            y0 = y_contours[ind]

        c = contours[ind]
        
        contours_dict[p] = c
    if p in chosen_percentiles:
        image_contours = shape_features.draw_contour(c, image_contours)


In [136]:
plot_image(image_contours, 'galaxy_'+gal_type+'_3', figsize_square)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [137]:
ellipse_image = img_clipped.copy()
for p in percentiles:
    if p in chosen_percentiles:
        ellipse_image = shape_features.fit_ellipse(contours_dict[p], ellipse_image, filled=False)
plot_image(ellipse_image, 'galaxy_'+gal_type+'_4', figsize_square)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [113]:
plt.close('all')

## Comparison with Darg merger catalogue and other groups catalogue

In [64]:
anomalies = anomalies.merge(kaggle_key, left_index=True, right_index=True)

In [58]:
groups_cat = pd.read_csv('/home/michelle/BigData/Anomaly/GalaxyZoo/groups_catalogue.tsv', delimiter=';', comment='#')
groups_cat = groups_cat[2:]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [73]:
anomalies['group'] = np.zeros(len(anomalies))
group_obj_ids = anomalies.merge(groups_cat, left_on='dr7objid', right_on='objID', how='inner', left_index=True).objID
anomalies.loc[np.in1d(anomalies.dr7objid, group_obj_ids), 'group'] = 1

In [86]:
mergers = pd.read_csv('/home/michelle/BigData/Anomaly/GalaxyZoo/darg_mergers.csv')
merger_objs = list(mergers.object1)+list(mergers.object2)
merger_objs = np.unique(merger_objs)

In [91]:
anomalies['merger'] = np.zeros(len(anomalies))
anomalies.loc[np.in1d(anomalies.dr7objid, merger_objs), 'merger'] = 1

In [94]:
anomalies = anomalies.sort_values('final_score', ascending=False)

merger_cum_sum = np.zeros(len(anomalies))
group_cum_sum = np.zeros(len(anomalies))
total_cum_sum = np.zeros(len(anomalies))
rand_cum_sum = np.zeros(len(anomalies))

total_anoms = (anomalies.group==1)|(anomalies.merger==1)

rand_list = np.random.choice(total_anoms, size=len(total_anoms), replace=False)

for i in range(len(anomalies)):
    merger_cum_sum[i] = anomalies['merger'][:i].sum()
    group_cum_sum[i] = anomalies['group'][:i].sum()
    total_cum_sum[i] = merger_cum_sum[i] + group_cum_sum[i]
    rand_cum_sum[i] = rand_list[:i].sum()
    

In [100]:
N_mergers = anomalies.merger.sum()
N_groups = anomalies.group.sum()
N_anoms = N_mergers + N_groups

In [102]:
plt.figure(figsize=(9,9), dpi=100)
plt.plot([len(anomalies)*0.1, len(anomalies)*0.1], [0,1], 'k--', alpha=0.5)
plt.plot(total_cum_sum/N_anoms)
plt.plot(merger_cum_sum/N_mergers)
plt.plot(group_cum_sum/N_groups)
plt.plot(rand_cum_sum/N_anoms, 'k')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7f668a2a3940>]

In [117]:
inds = []
for idx in anomalies.index[:200]:
    if not anomalies.loc[idx, 'group'] and not anomalies.loc[idx, 'merger'] and not idx in df[df['Class8.6']>0.8].index:
        inds.append(idx)

In [119]:
imgs = []
for idx in inds:
    imgs.append(image_dataset.get_sample(idx))

In [120]:
plt.rcParams.update({'figure.dpi':100})
cycler = ImageCycler(imgs, xlabels=inds)
cycler.cycle()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [110]:
kaggle_key.loc['373505']

dr7objid    588013383816904793
Usage                 training
Name: 373505, dtype: object

Index(['100520', '108742', '127231', '133499', '155564', '176576', '180253',
       '236236', '271943', '316412', '330517', '333669', '350139', '373505',
       '395411', '398159', '407825', '430220', '433488', '446716', '487641',
       '489626', '499023', '501768', '516751', '524354', '553727', '577678',
       '626793', '637115', '647440', '654297', '670070', '674805', '696497',
       '700256', '714674', '718275', '726532', '735007', '769081', '790978',
       '803787', '807226', '813394', '826038', '833171', '855799', '912722',
       '925544', '938173', '938264', '968553'],
      dtype='object', name='GalaxyID')

## Repeated training plot

In [125]:
anomalies2 = anomalies[['score', 'human_label', 'final_score']].copy()

In [130]:
n_recompute = 100
N_outliers = 1100
rerun = True
    
if rerun:

    anomalies2['human_label'] = np.array([-1]*len(anomalies2), dtype='int')
    anomalies2['final_score'] = anomalies2.score

    RWS = np.zeros(N_outliers//n_recompute)
    perc = np.zeros(N_outliers//n_recompute)
    total = np.zeros(N_outliers//n_recompute)


    t1 = time.time()

    for n in range(N_outliers//n_recompute):
        print((anomalies2.human_label!=-1).sum())

        anomalies2 = anomalies2.sort_values('final_score', ascending=False)
        inds = anomalies2.loc[anomalies2.human_label==-1].index[:n_recompute]
        anomalies2.loc[inds, 'human_label'] = labels.loc[inds, 'human_label']

        ns = NeighbourScore(alpha=0.1, force_rerun=True, output_dir = out_dir)
        features_with_labels = ns.combine_data_frames(features, anomalies2)
        final_score = ns.run(features_with_labels)
        anomalies2['final_score'] = final_score.trained_score
        
        anomalies2 = anomalies2.sort_values('final_score', ascending=False)

        this_rws = make_rws_curve(labels, anomalies2, anomalies2.final_score, plot=False)
        RWS[n] = this_rws

        tot = (labels.loc[anomalies2.index[anomalies2.human_label!=-1]].label==1).sum()
        perc[n] = tot/((labels.label==1).sum())
        total[n] = tot

        print()


    print('Time', time.time()-t1)

    anomalies2.to_parquet('anomalies_gz.parquet')
    np.save('percentages_gz.npy', perc)
    np.save('rws_gz.npy', RWS)
    np.save('total_gz.npy', total)
    
else:
    anomalies = pd.read_parquet('anomalies_gz.parquet')
    perc = np.load('percentages_gz.npy')
    RWS = np.load('rws_gz.npy')
    total = np.load('total_gz.npy')

0
Running NeighbourScore ...
Done! Time taken: 2.522036552429199 s

100
Running NeighbourScore ...
Done! Time taken: 3.2990734577178955 s

200
Running NeighbourScore ...
Done! Time taken: 2.68585467338562 s

300
Running NeighbourScore ...
Done! Time taken: 3.035444736480713 s

400
Running NeighbourScore ...
Done! Time taken: 3.2974531650543213 s

500
Running NeighbourScore ...
Done! Time taken: 3.4656612873077393 s

600
Running NeighbourScore ...
Done! Time taken: 4.0316572189331055 s

700
Running NeighbourScore ...
Done! Time taken: 3.86476469039917 s

800
Running NeighbourScore ...
Done! Time taken: 4.517053127288818 s

900
Running NeighbourScore ...
Done! Time taken: 4.416330575942993 s

1000
Running NeighbourScore ...
Done! Time taken: 4.593501091003418 s

Time 121.45626759529114


In [131]:
plt.figure(figsize=figsize_square)
plt.plot(np.arange(0, N_outliers, n_recompute)+n_recompute, total)
plt.xlabel('Number of Labelled Examples')
plt.ylabel('Number of Anomalies Found')
plt.tight_layout()

# plt.savefig(fig_dir+'sim_cumulative')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [132]:
plt.figure(figsize=figsize_square)

plt.plot(np.arange(0, N_outliers, n_recompute)+n_recompute, RWS)
# plt.legend(loc='best')
plt.xlabel('Number of Labelled Examples')
plt.ylabel('Rank Weighted Score N=150')
plt.tight_layout()
# plt.savefig(fig_dir+'sim_rws_150')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …