### Understanding answers
Exploring the two testing sets in low-dimensional space to see how they are distributed. We expect Ia and II to be equally discernible in both testing sets, since this would explain why the performance is equal between them.

In [None]:
from evaluation.lowd_test import *

In [None]:
# Run the `main` entry point. It is not defined in this notebook; it is
# presumably supplied by the star import from evaluation.lowd_test — confirm.
main()

In [None]:
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams

import os
import random
import math

from models.multi_model.multi_model import MultiModel
from evaluation.plotting import *
from estimate.get_data import *
from evaluation.sampling_test import *

from evaluation.lowd_test import fit_and_plot

# ---- Figure constants ------------------------------------------------------
FIG_WIDTH, FIG_HEIGHT = 6, 4
DPI = 600

# ---- Run configuration -----------------------------------------------------
# Random numeric string; it is not referenced again in this cell.
exp = str(random.randint(1, 10**10))
# Directory handed to the model and to the sampling helpers below.
# Directory creation is left disabled: os.mkdir(output_dir)
output_dir = "../figures/evaluation/testing/"

# ---- Model -----------------------------------------------------------------
# Feature columns requested from the model: ten magnitude columns + redshift.
cols = [
    "g_mag", "r_mag", "i_mag", "z_mag", "y_mag",
    "W1_mag", "W2_mag", "H_mag", "K_mag", "J_mag",
    "redshift",
]
model_settings = {
    "cols": cols,
    "class_labels": ["Unspecified Ia", "Unspecified II"],
    "transform_features": False,
    "min_class_size": 40,
}
model = MultiModel(**model_settings)
model.dir = output_dir

# Features and labels side by side, as expected by the sampling helper.
thex_dataset = pd.concat([model.X, model.y], axis="columns")

# ---- Draw the two test sets ------------------------------------------------
# Both classes are sampled with identical settings; only class_name differs.
# Each call returns a pair: the first element goes into the "LSST" test set,
# the second into the "Original" test set (see the concatenations below).
sampling_kwargs = {
    "max_rmag": None,
    "num_samples": 200,
    "thex_dataset": thex_dataset,
    "output_dir": output_dir,
}
Ia_sampled, Ia_rand_sample = get_THEx_sampled_data(class_name="Ia",
                                                   **sampling_kwargs)
II_sampled, II_rand_sample = get_THEx_sampled_data(class_name="II",
                                                   **sampling_kwargs)

lsst_combined = pd.concat([Ia_sampled, II_sampled])
lsst_sampled_X, lsst_sampled_y = get_source_target(lsst_combined)

orig_combined = pd.concat([Ia_rand_sample, II_rand_sample])
orig_sampled_X, orig_sampled_y = get_source_target(orig_combined)

# ---- Remove the test rows from the training data ---------------------------
print("Original size of training set " + str(model.X.shape[0]))
train_X, train_y = get_training_data(lsst_sampled_X,
                                     orig_sampled_X,
                                     model.X,
                                     model.y)
print("New size of training set " + str(train_X.shape[0]))

# NOTE(review): `ordered_mags` is not defined in this notebook; it is
# presumably provided by one of the star imports — confirm.
# Restrict the training and both test feature frames to those columns.
model.X = train_X[ordered_mags]
model.y = train_y

lsst_sampled_X = lsst_sampled_X[ordered_mags]
orig_sampled_X = orig_sampled_X[ordered_mags]



In [None]:
# Number of rows per 'transient_type' value in the "Original" test labels.
orig_sampled_y.groupby('transient_type').size()

In [None]:
# Number of rows per 'transient_type' value in the "LSST" test labels.
lsst_sampled_y.groupby('transient_type').size()

In [None]:

# Low-dimensional embedding of the "Original" test set.
# The keyword names match sklearn's TSNE parameters; presumably fit_and_plot
# forwards them unchanged — confirm against evaluation.lowd_test.
tsne_settings = {
    "perplexity": 15,                 # usual range 5-50; ~number of neighbours per point
    "early_exaggeration": 2.0,        # exaggerates early clusters
    "learning_rate": 10,              # usual range [10.0, 1000.0]
    "n_iter": 50000,
    "n_iter_without_progress": 1000,
}
fit_and_plot(X=orig_sampled_X,
             y=orig_sampled_y,
             data_type="Original",
             output_dir=output_dir,
             **tsne_settings)

In [None]:

# Low-dimensional embedding of the "LSST" test set, using the same settings
# as the "Original" run so the two plots can be compared.
# The keyword names match sklearn's TSNE parameters; presumably fit_and_plot
# forwards them unchanged — confirm against evaluation.lowd_test.
tsne_settings = {
    "perplexity": 15,                 # usual range 5-50; ~number of neighbours per point
    "early_exaggeration": 2.0,        # exaggerates early clusters
    "learning_rate": 10,              # usual range [10.0, 1000.0]
    "n_iter": 50000,
    "n_iter_without_progress": 1000,
}
fit_and_plot(X=lsst_sampled_X,
             y=lsst_sampled_y,
             data_type="LSST",
             output_dir=output_dir,
             **tsne_settings)

In [None]:

# "LSST" test set again, with a smaller perplexity (10) and a stronger
# early exaggeration (5.0) than the previous LSST run.
# The keyword names match sklearn's TSNE parameters; presumably fit_and_plot
# forwards them unchanged — confirm against evaluation.lowd_test.
tsne_settings = {
    "perplexity": 10,                 # usual range 5-50; ~number of neighbours per point
    "early_exaggeration": 5.0,        # exaggerates early clusters
    "learning_rate": 10,              # usual range [10.0, 1000.0]
    "n_iter": 50000,
    "n_iter_without_progress": 1000,
}
fit_and_plot(X=lsst_sampled_X,
             y=lsst_sampled_y,
             data_type="LSST",
             output_dir=output_dir,
             **tsne_settings)

Same as above but with NO early exaggeration at all.

Roughly the same.

In both cases the Original set does poorly, with the data dispersed along a curve. The LSST set, however, always does OK: it's not great, but there is at least one cluster containing only red points, which is good and is consistent with the redshift test (where Ia did better than II at high redshifts).

In [None]:

# Both test sets with early_exaggeration=1.0, i.e. no early exaggeration.
# All other settings are shared between the two runs.
# The keyword names match sklearn's TSNE parameters; presumably fit_and_plot
# forwards them unchanged — confirm against evaluation.lowd_test.
no_exaggeration_settings = {
    "perplexity": 15,                 # usual range 5-50; ~number of neighbours per point
    "early_exaggeration": 1.0,        # 1.0 leaves early clusters un-exaggerated
    "learning_rate": 10,              # usual range [10.0, 1000.0]
    "n_iter": 50000,
    "n_iter_without_progress": 1000,
}

# "Original" test set.
fit_and_plot(X=orig_sampled_X,
             y=orig_sampled_y,
             data_type="Original",
             output_dir=output_dir,
             **no_exaggeration_settings)

# "LSST" test set (kept as the last expression of the cell).
fit_and_plot(X=lsst_sampled_X,
             y=lsst_sampled_y,
             data_type="LSST",
             output_dir=output_dir,
             **no_exaggeration_settings)