In [1]:
import pandas as pd
from pandas import Series
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from net import LSSTNet
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

## Preparation

In [2]:
# load parameters
params_df = pd.read_csv("ML_param.csv")
# Just doing what the repo did
del params_df['coord_dec'] 
del params_df['coord_ra']
del params_df['id']
del params_df['parent']

In [3]:
# load classfication data
data_df = pd.read_csv("../classifications/lsst_run_one.csv")
data_df.head()
classify_df = data_df.loc[data_df['workflow_name'] == "Difference Imaging Classifier"]

image_class = np.zeros(classify_df.shape[0], dtype='int, object')
idx = 0
for _, row in classify_df.iterrows():
    s_data = json.loads(row.subject_data) #Subject Data
    s_data = s_data.get(list(s_data.keys())[0])
    # cut "/home/......./cutout" and ".png"
    image_num = s_data.get(list(s_data.keys())[1])[47:-4]
    # annotations
    a_data = json.loads(row.annotations)[0]
    classification = a_data['value']
    image_class[idx] = (int(image_num), classification)
    idx += 1

In [251]:
# add a new classification column, init with None
params_df = params_df.assign(Classification=Series(np.full(params_df.shape[0], None)))
for image in image_class:
    params_df.loc[image[0], "Classification"] = image[1]
# drop the rows with no classification
df = params_df[params_df["Classification"].notnull()]

In [252]:
# drop some columns
for col in df.columns:
    if df[col].isnull().all():
        del df[col]   
    elif df[col].dtype != 'object' and np.mean(df[col]) == np.inf:
        del df[col]
    elif "flag" in col: #Flags don't contribute to ML based on initial testing
        del df[col]
    
df = df.fillna(0)

## Training, dev, test

In [308]:
# shuffle data
df.sample(frac=1)

# normalize
for col in df.columns:
    if df[col].dtype == "float64" or df[col].dtype == "int64":
        df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

# get rid of (super) low variance features
selector = VarianceThreshold(0.01)
training = selector.fit_transform(df.drop(columns=['Classification']))
labels = df['Classification']

length = training.shape[0]
train_split = int(length * 0.7)
dev_split = int(length * 0.9)

train_X = training[0 : train_split]
dev_X = training[train_split : dev_split]
test_X = training[dev_split:]

# type to vectors
label_dict = dict()
trans_dict = dict()
for i, l in enumerate(labels.unique()):
    label_dict[l] = i
    trans_dict[i] = l

    
print(label_dict)
    
Y = []
for i in labels:
    Y.append(label_dict[i])
    
    
train_y = Y[0 : train_split]
dev_y = Y[train_split : dev_split]
test_y = Y[dev_split:]

{'Dipole': 0, 'Possible Transient': 1, 'Subtraction Error': 2, 'Possible Variable Star': 3, 'Pixel Artifact': 4, 'Noise': 5}


## Neural Network 

In [340]:
class LSSTNet(nn.Module):

    def __init__(self):
        super(LSSTNet, self).__init__()
        self.model = torch.nn.Sequential(
                         nn.Linear(73, 60),
                         nn.Linear(60, 45),
                         nn.BatchNorm1d(45, momentum=0.5),
                         nn.Linear(45, 30),
                         nn.Linear(30, 20),
                         nn.BatchNorm1d(20, momentum=0.5),
                         nn.Linear(20, 15),
                         nn.Linear(15, 10),
                         nn.Linear(10, 6))
        
        
    def forward(self, x):
        """
        Use relu
        """
        return F.log_softmax(self.model(x), dim=1)

In [341]:
net = LSSTNet().double()

# optim and loss
optim = torch.optim.Adam(net.parameters(), lr=1e-4)
critetion = nn.NLLLoss()

In [342]:
# need variable wrapper
X = Variable(torch.from_numpy(train_X).contiguous())
y = Variable(torch.from_numpy(np.asarray(train_y)).contiguous()).long()

In [343]:
T = 1900
# run 500 times
for t in range(T):
    # forward
    y_pred = net(X)
    loss = critetion(y_pred, y)
    # Zero the gradients before running the backward pass.
    net.zero_grad()
    loss.backward()
    optim.step()
    if t % 100 == 0:
        print("t: ", t, " loss: ", loss.data[0])

t:  0  loss:  1.8377871365580016
t:  100  loss:  1.5178732219140585
t:  200  loss:  1.2915377409608477
t:  300  loss:  1.1227400822000113
t:  400  loss:  0.9821514923922263
t:  500  loss:  0.8678167566242215
t:  600  loss:  0.7821163072439973
t:  700  loss:  0.717887112670382
t:  800  loss:  0.668026117151191
t:  900  loss:  0.6261360412035483
t:  1000  loss:  0.5889225004639783
t:  1100  loss:  0.5551043674802563
t:  1200  loss:  0.52604187553763
t:  1300  loss:  0.5026572136581108
t:  1400  loss:  0.48313467578057945
t:  1500  loss:  0.46618286414199395
t:  1600  loss:  0.457715709774658
t:  1700  loss:  0.4391918073746846
t:  1800  loss:  0.4297531045441598


## dev testing

In [334]:
development_X = Variable(torch.from_numpy(dev_X).contiguous())
development_y = Variable(torch.from_numpy(np.asarray(dev_y)).contiguous()).long()

In [335]:
dev_pred = net(development_X)

In [336]:
print(critetion(dev_pred, development_y).data[0])

0.5529023410887203


In [337]:
# check how many we got right:
correct = 0
for ii, pred in enumerate(dev_pred.data):
    i = np.argmax(pred)
    if i == dev_y[ii]:
        correct += 1
        
        
print(correct)
print(float(correct) / len(dev_y))

158
0.797979797979798


In [338]:
testing_X = Variable(torch.from_numpy(test_X).contiguous())
testing_y = Variable(torch.from_numpy(np.asarray(test_y)).contiguous()).long()

In [339]:
test_pred = net(testing_X)
print(critetion(test_pred, testing_y).data[0])

# check how many we got right:
correct = 0
for ii, pred in enumerate(test_pred.data):
    i = np.argmax(pred)
    if i == test_y[ii]:
        correct += 1
        
        
print(correct)
print(float(correct) / len(test_y))

0.6482681444723833
76
0.76
