In [1]:
import pandas as pd
from pandas import Series
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

## Preparation

In [2]:
# load parameters
params_df = pd.read_csv("ML_param.csv")
# Just doing what the repo did
del params_df['coord_dec'] 
del params_df['coord_ra']
del params_df['id']
del params_df['parent']

In [3]:
# load classfication data
data_df = pd.read_csv("../classifications/lsst_run_one.csv")
data_df.head()
classify_df = data_df.loc[data_df['workflow_name'] == "Difference Imaging Classifier"]

image_class = np.zeros(classify_df.shape[0], dtype='int, object')
idx = 0
for _, row in classify_df.iterrows():
    s_data = json.loads(row.subject_data) #Subject Data
    s_data = s_data.get(list(s_data.keys())[0])
    # cut "/home/......./cutout" and ".png"
    image_num = s_data.get(list(s_data.keys())[1])[47:-4]
    # annotations
    a_data = json.loads(row.annotations)[0]
    classification = a_data['value']
    image_class[idx] = (int(image_num), classification)
    idx += 1

In [4]:
# add a new classification column, init with None
params_df = params_df.assign(Classification=Series(np.full(params_df.shape[0], None)))
for image in image_class:
    params_df.loc[image[0], "Classification"] = image[1]
# drop the rows with no classification
df = params_df[params_df["Classification"].notnull()]

In [5]:
# drop some columns
for col in df.columns:
    if df[col].isnull().all():
        del df[col]   
    elif df[col].dtype != 'object' and np.mean(df[col]) == np.inf:
        del df[col]
    elif "flag" in col: #Flags don't contribute to ML based on initial testing
        del df[col]
    
df = df.fillna(0)
df.head()

Unnamed: 0,image,base_CircularApertureFlux_70_0_flux,base_CircularApertureFlux_6_0_flux,base_SdssShape_psf_xx,base_SdssShape_psf_xy,base_SdssCentroid_x,ip_diffim_PsfDipoleFlux_pos_fluxSigma,ip_diffim_PsfDipoleFlux_pos_flux,base_SdssShape_psf_yy,base_SdssShape_flux_xy_Cov,...,base_CircularApertureFlux_4_5_fluxSigma,ip_diffim_DipoleFit_separation,ip_diffim_DipoleFit_pos_fluxSigma,base_SdssShape_yy,base_CircularApertureFlux_9_0_flux,ip_diffim_PsfDipoleFlux_neg_centroid_x,base_CircularApertureFlux_12_0_flux,ip_diffim_DipoleFit_orientation,base_CircularApertureFlux_25_0_flux,Classification
0,0,9631.897876,714.726562,2.828649,-0.453969,225.833654,2515.781488,108909.800974,2.625221,-19.500092,...,176.950531,0.075128,675.617063,0.531181,661.367737,226.625613,418.723064,130.761641,1011.415786,Dipole
1,1,4203.040442,1035.672363,2.828649,-0.453969,1403.524349,0.0,0.0,2.625221,-24.382818,...,102.062622,0.0,0.0,2.142461,1011.458618,0.0,1113.044229,0.0,1657.226911,Possible Transient
2,2,1545.866122,-148.509018,2.828649,-0.453969,1516.439084,3339.446858,169312.388582,2.625221,0.0,...,242.372284,0.066358,1329.376864,410.077555,159.960388,1516.969982,50.727865,-23.66628,194.104162,Subtraction Error
3,3,0.0,32.771393,2.828649,-0.453969,65.176307,0.0,0.0,2.625221,-15.389731,...,145.34729,0.0,0.0,0.618887,209.429108,0.0,-94.296964,0.0,538.538759,Subtraction Error
4,4,10168.91493,718.301331,2.828649,-0.453969,834.630699,2950.619325,111744.638382,2.625221,-0.557023,...,203.650833,0.087032,859.469912,0.224596,1180.052612,834.469876,988.245036,85.94859,1932.348548,Subtraction Error


## Training, dev, test

In [6]:
# shuffle data
df.sample(frac=1)

# get rid of (super) low variance features
selector = VarianceThreshold(0.01)
training = selector.fit_transform(df.drop(columns=['Classification']))
labels = df['Classification']


# normalize
for col in df.columns:
    if df[col].dtype == "float64" or df[col].dtype == "int64":
        df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())


length = training.shape[0]
train_split = int(length * 0.65)
dev_split = int(length * 0.80)

train_X = training[0 : train_split]
dev_X = training[train_split : dev_split]
test_X = training[dev_split:]

# type to vectors
label_dict = dict()
trans_dict = dict()
for i, l in enumerate(labels.unique()):
    label_dict[l] = i
    trans_dict[i] = l

    
print(label_dict)
    
Y = []
for i in labels:
    Y.append(label_dict[i])
    
    
train_y = Y[0 : train_split]
dev_y = Y[train_split : dev_split]
test_y = Y[dev_split:]

{'Dipole': 0, 'Possible Transient': 1, 'Subtraction Error': 2, 'Possible Variable Star': 3, 'Pixel Artifact': 4, 'Noise': 5}


## Neural Network 

In [21]:
class LSSTNet(nn.Module):

    def __init__(self):
        super(LSSTNet, self).__init__()
        self.fc1 = nn.Linear(85, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 6)
        nn.init.xavier_uniform(self.fc1.weight)
        nn.init.xavier_uniform(self.fc2.weight)
        nn.init.xavier_uniform(self.fc3.weight)
        self.model = torch.nn.Sequential(
                         self.fc1,
                         nn.BatchNorm1d(100, momentum=0.5),
                         nn.ReLU(),
                         self.fc2,
                         nn.BatchNorm1d(50, momentum=0.5),
                         nn.ReLU(),
                         self.fc3)
        
        
    def forward(self, x):
        """
        Use relu
        """
        return F.log_softmax(self.model(x), dim=1)

In [22]:
net = LSSTNet().double()
if torch.cuda.is_available():
    net.cuda()

# optim and loss
optim = torch.optim.Adam(net.parameters(), lr=1e-4)
critetion = nn.NLLLoss()

In [23]:
# need variable wrapper
X = Variable(torch.from_numpy(train_X).contiguous())
y = Variable(torch.from_numpy(np.asarray(train_y)).contiguous()).long()

In [24]:
T = 1100
# run 500 times
for t in range(T):
    # forward
    y_pred = net(X)
    loss = critetion(y_pred, y)
    # Zero the gradients before running the backward pass.
    net.zero_grad()
    loss.backward()
    optim.step()
    if t % 100 == 0:
        print("t: ", t, " loss: ", loss.data[0])

t:  0  loss:  1.8362058065796047
t:  100  loss:  1.4808222475315262
t:  200  loss:  1.220803505094759
t:  300  loss:  1.0757745335379525
t:  400  loss:  0.9774573316043764
t:  500  loss:  0.882890037273286
t:  600  loss:  0.8000528809310044
t:  700  loss:  0.7254027282957688
t:  800  loss:  0.6661557312846765
t:  900  loss:  0.6185176727868701
t:  1000  loss:  0.5821009390816765


## dev testing

In [25]:
development_X = Variable(torch.from_numpy(dev_X).contiguous())
development_y = Variable(torch.from_numpy(np.asarray(dev_y)).contiguous()).long()

In [26]:
dev_pred = net(development_X)

In [27]:
print(critetion(dev_pred, development_y).data[0])

0.6504002154199562


In [28]:
# check how many we got right:
correct = 0
for ii, pred in enumerate(dev_pred.data):
    i = np.argmax(pred)
    if i == dev_y[ii]:
        correct += 1
        
        
print(correct)
print(float(correct) / len(dev_y))

115
0.7718120805369127


In [29]:
testing_X = Variable(torch.from_numpy(test_X).contiguous())
testing_y = Variable(torch.from_numpy(np.asarray(test_y)).contiguous()).long()

In [30]:
test_pred = net(testing_X)
print(critetion(test_pred, testing_y).data[0])

# check how many we got right:
correct = 0
for ii, pred in enumerate(test_pred.data):
    i = np.argmax(pred)
    if i == test_y[ii]:
        correct += 1
        
        
print(correct)
print("accuracy: ", 
      float(correct) / len(test_y))

0.5829032630228582
162
accuracy:  0.8140703517587939
