
## Running KDE vs. NADE

Use the provided training/testing data, which has already been processed to match the data used by NADE.

There is one set for training and one set for testing.

In [None]:
%matplotlib inline   
from models.multi_model.multi_model import MultiModel


# Magnitude columns handed to the model as its feature set.
mags = [
    "g_mag", "r_mag", "i_mag", "z_mag", "y_mag",
    "W1_mag", "W2_mag",
    "J_mag", "K_mag", "H_mag",
]

# The model is built with placeholder settings here; model.X and model.y are
# overwritten with the loaded training data further down in this notebook.
# (A single-feature alternative that was tried: mags = ["g_mag"])
model = MultiModel(
    cols=mags,
    folds=3,
    min_class_size=40,
    transform_features=True,
)
 

## Run on NADE-used data

In [None]:
# from thex_data.data_transform import scale_data
# X_train, X_test = scale_data(X_train, X_test)

# model.train_model(X_train, y_train)

# probabilities = model.get_all_class_probabilities(X_test)
# probs = pd.DataFrame(probabilities, columns = model.class_labels)
# probs.to_csv(test_path + "OUTPUT/" + "kde_probabilities.csv", index=False)

In [None]:
from collections import OrderedDict
import numpy as np
import pandas as pd
from thex_data.data_transform import scale_data

def get_performance(lls, y, classes):
    """
    Print and return per-class purity and overall accuracy.

    Purity for a class is TP / (TP + FP); accuracy is the sum of the TP
    counts over ``classes`` divided by the number of rows in ``y``.

    :param lls: DataFrame of per-class scores, one column per class name
        (passed straight through to ``get_metrics``)
    :param y: DataFrame of true labels (passed straight through to
        ``get_metrics``); its row count is the accuracy denominator
    :param classes: iterable of class names to report on; each must be a
        key of the metrics returned by ``get_metrics``
    :return: tuple ``(purities, acc)`` where ``purities`` is an OrderedDict
        mapping class name to purity and ``acc`` is a float
    """
    class_mets = get_metrics(lls, y)

    # Purity TP / (TP+FP)
    purities = OrderedDict()
    for cname in classes:
        m = class_mets[cname]
        predicted = m["TP"] + m["FP"]
        # A class that was never predicted has no defined purity; report 0.0
        # instead of raising ZeroDivisionError.
        p = m["TP"] / float(predicted) if predicted else 0.0
        purities[cname] = p
        print(cname + ' purity : ' + str(round(p * 100, 1)) + "%")

    total_TPs = sum(class_mets[cname]["TP"] for cname in classes)
    n_samples = y.shape[0]
    # Same guard for an empty label frame.
    acc = total_TPs / float(n_samples) if n_samples else 0.0
    print('Accuracy : ' + str(round(acc * 100, 1)) + "%")

    return purities, acc

def get_metrics(lls, y):
    """
    Count TP / FP / TN / FN per class from a table of per-class scores.

    For every row of ``lls`` the predicted class is the column holding the
    largest value (the first such column on ties). The true label is the
    string form of the value in the first column of ``y`` at the same row
    *position*.

    :param lls: DataFrame with one column per class and one row per sample
    :param y: DataFrame whose first column holds the true labels; it must
        have at least as many rows as ``lls``
    :return: dict mapping each class name to a dict with integer counts
        under the keys "TP", "TN", "FP" and "FN"
    """
    class_names = list(lls)
    label_col = list(y)[0]

    # Map from class name to TP, FP, FN, and TN counts.
    class_mets = {cn: {"TP": 0, "TN": 0, "FP": 0, "FN": 0} for cn in class_names}
    if not class_names:
        # No score columns: nothing to predict, nothing to count.
        return class_mets

    labels = y[label_col]
    # enumerate() supplies the row position, so the lookup into y no longer
    # depends on lls carrying a default 0..n-1 index.
    for pos, (_, row) in enumerate(lls.iterrows()):
        label = str(labels.iloc[pos])
        values = row.tolist()
        # The arg-max is the same for every class, so compute it once per row.
        pred_class = class_names[values.index(max(values))]

        for class_name in class_names:
            is_label = label == class_name
            is_pred = pred_class == class_name
            if is_label and is_pred:
                # this class is the label, and we predicted it as max
                class_mets[class_name]["TP"] += 1
            elif is_pred:
                # this class is not the label, but it was the max
                class_mets[class_name]["FP"] += 1
            elif is_label:
                # this class is the label, but it was not the max
                class_mets[class_name]["FN"] += 1
            else:
                # this class is neither the label nor the max
                class_mets[class_name]["TN"] += 1

    return class_mets


def run_for_dataset(X_train, y_train, X_test, y_test):
    """
    Train the module-level ``model`` on the training split and score it on
    the test split.

    :param X_train: training features, forwarded to ``model.train_model``
    :param y_train: training labels, forwarded to ``model.train_model``
    :param X_test: test features, forwarded to
        ``model.get_all_class_probabilities``
    :param y_test: test labels, forwarded to ``get_performance``
    :return: the ``(purities, acc)`` pair produced by ``get_performance``
    """
    # NOTE: scaling via scale_data(X_train, X_test) is currently switched off.
    model.train_model(X_train, y_train)

    class_probs = pd.DataFrame(
        model.get_all_class_probabilities(X_test),
        columns=model.class_labels,
    )
    # NOTE: writing class_probs to OUTPUT/kde_probabilities.csv is currently
    # switched off.

    return get_performance(class_probs, y_test, model.class_labels)

def get_mean_stdev(l):
    """
    Summarise a list of fractions as percentage strings.

    :param l: sequence of numbers (fractions, e.g. 0.25 for 25%)
    :return: tuple ``(mean, stdev)`` of strings, both scaled by 100 and
        rounded to one decimal; only the mean carries a trailing "%"
    """
    values = np.array(l)

    def as_percent(fraction):
        # Scale to percent and keep one decimal place.
        return str(round(fraction * 100, 1))

    mean_text = as_percent(np.average(values)) + "%"
    stdev_text = as_percent(np.std(values))
    return mean_text, stdev_text

Estimate performance

In [None]:
############################################################
# Experiment settings.
#
# Other label sets that were tried:
#   model.class_labels = ['dog', 'cat', 'mouse']
#   model.class_labels = ["4","5","6","7"]
model.class_labels = ['Unspecified Ia', 'Unspecified II']
class_label = "transient_type"  # alternative: "quality"
runs = 10

############################################################

test_path = "/Users/marina/Documents/PhD/research/astro_research/data/testing/"
dpath = test_path + "PROCESSED_DATA/"

# Load the pre-split training and testing tables.
X_train = pd.read_csv(dpath + "train_X.csv")
y_train = pd.read_csv(dpath + "train_y.csv")

X_test = pd.read_csv(dpath + "test_X.csv")
y_test = pd.read_csv(dpath + "test_y.csv")

# When the label lives under another column name, copy it (as strings) into
# 'transient_type' and drop the original column, for both label tables.
if class_label != 'transient_type':
    for label_frame in (y_train, y_test):
        label_frame['transient_type'] = label_frame[class_label].map(str)
        label_frame.drop(columns=class_label, inplace=True)

# Attach the training data to the model.
model.X = X_train
model.y = y_train


In [None]:
# Repeat the train/evaluate cycle `runs` times, collecting each run's
# per-class purities and overall accuracy.
all_ps, all_accs = [], []
for _ in range(runs):
    # Each run gets its own deep copies of the four tables (presumably so a
    # run cannot see in-place changes made by an earlier one).
    run_inputs = [frame.copy(deep=True)
                  for frame in (X_train, y_train, X_test, y_test)]
    purities, acc = run_for_dataset(*run_inputs)
    all_ps.append(purities)
    all_accs.append(acc)

# Report mean and spread of accuracy over all runs.
avg, stdev = get_mean_stdev(all_accs)
print("Average accuracy " + avg + u"\u00B1" + stdev)

# Report mean and spread of purity over all runs, one line per class.
for class_name in model.class_labels:
    avg, stdev = get_mean_stdev([p_map[class_name] for p_map in all_ps])
    print("Purity " + class_name + " :" + avg + u"\u00B1" + stdev)