In [1]:
import glob
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf

# dataset

In [2]:
# Load the target table; rows are addressed by planet_id from here on.
labelDf = pd.read_csv("train_labels.csv")
labelDf = labelDf.set_index('planet_id')

# Global statistics over every label value (all rows x all columns).
# `mean` and `std` are read again by the loss/metric functions further down
# to map z-scored targets back to the original scale, so the names are kept.
label_values = labelDf.to_numpy(dtype=np.float64)
mean = np.nanmean(label_values)
# Spread of the label values themselves. np.std(labelDf.std()) would instead
# measure how much the per-column standard deviations differ from each other.
std = np.nanstd(label_values)
# Stored under non-builtin names so Python's max()/min() remain usable.
label_max = np.nanmax(label_values)
label_min = np.nanmin(label_values)

# z-score every target column at once with the shared global statistics.
labelDf = (labelDf - mean) / std

# Last expression of the cell, so the statistics are actually displayed.
mean, std, label_max, label_min

In [3]:
tf.random.set_seed(42)
random.seed(42)

# Every file matched here sits two levels below 'train/': train/<star>/<file>.
files = glob.glob(os.path.join('train/', '*/*'))
# The star id is the name of the file's parent directory. Deriving it through
# os.path is independent of the platform's path separator; splitting on '\\'
# only works when glob returns Windows-style paths.
stars = np.unique([os.path.basename(os.path.dirname(file)) for file in files])

def split_star_list(file_list, test_ratio=0.2):
    """Randomly split ``file_list`` into a train part and a test part.

    The order is randomised with the global ``random`` generator, so seed it
    beforehand for a reproducible split. The shuffle is applied to a copy:
    the sequence passed by the caller keeps its original order.

    Args:
        file_list: Mutable sequence offering ``copy()`` (a ``list`` or a
            1-D ``numpy.ndarray``).
        test_ratio: Fraction of the items, within ``[0, 1]``, that goes into
            the test part.

    Returns:
        ``(train_files, test_files)``: the leading and trailing slice of the
        shuffled copy.

    Raises:
        ValueError: If ``test_ratio`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio}")
    # Shuffle a copy so the caller's sequence is not reordered as a side effect.
    shuffled = file_list.copy()
    random.shuffle(shuffled)
    split_index = int(len(shuffled) * (1 - test_ratio))
    return shuffled[:split_index], shuffled[split_index:]

# Split the star ids with the default test_ratio of split_star_list (0.2).
train_stars, test_stars = split_star_list(stars)

def calcMeanAndStdOfTrain(train_stars, data_dir='train'):
    """Compute mean and standard deviation over axis 0 of all training arrays.

    For every star, the array stored under key ``'a'`` of
    ``<data_dir>/<star>/combined.npz`` is read and reduced to
    ``a[0, :, 0:283, :]``. The statistics are taken over axis 0 of that
    slice, pooled across all stars (each row counts once, so files with a
    different number of rows are weighted correctly).

    Args:
        train_stars: Iterable of star identifiers (sub-directory names).
        data_dir: Directory that holds one sub-directory per star.

    Returns:
        ``(meanTrain, stdTrain)``: two float64 arrays shaped like one row
        (axis 0 removed) of the slice.

    Raises:
        ValueError: If no rows were read (empty ``train_stars``).
    """
    total = None
    total_sq = None
    count = 0
    for star in train_stars:
        file_path = os.path.join(data_dir, str(star), 'combined.npz')
        with np.load(file_path) as data:
            # float64 accumulators keep the sum of squares accurate.
            x = data['a'][0, :, 0:283, :].astype(np.float64, copy=False)
        if total is None:
            total = x.sum(axis=0)
            total_sq = np.square(x).sum(axis=0)
        else:
            total += x.sum(axis=0)
            total_sq += np.square(x).sum(axis=0)
        count += x.shape[0]

    if count == 0:
        raise ValueError("train_stars must contain at least one non-empty file")

    meanTrain = total / count
    # E[x^2] - E[x]^2 can come out marginally negative through rounding;
    # clamp at zero so the square root never yields NaN.
    variance = np.maximum(total_sq / count - meanTrain**2, 0.0)
    stdTrain = np.sqrt(variance)
    return meanTrain, stdTrain

# Feature statistics come from the training stars only; preprocess_data reads
# these two module-level names.
meanTrain, stdTrain = calcMeanAndStdOfTrain(train_stars)

def preprocess_data(features, labels):
    """Standardise ``features`` with the training statistics; pass ``labels`` through.

    Reads the module-level ``meanTrain`` and ``stdTrain``. ``1e-6`` is added
    to the divisor so a zero standard deviation cannot divide by zero.

    Returns:
        ``(standardised_features, labels)``; ``labels`` is returned unchanged.
    """
    centered = features - meanTrain
    scale = stdTrain + 1e-6
    return centered / scale, labels

def load_npz(star):
    """Load and standardise the features and labels of a single star.

    Used as the ``func`` of ``tf.py_function`` in ``create_dataset``; ``star``
    is a string tensor whose content parses as an integer id.

    Args:
        star: Scalar string tensor with the star / planet id.

    Returns:
        ``(features, labels)`` as produced by ``preprocess_data``: the slice
        ``a[0, :, 0:283, :]`` of ``train/<id>/combined.npz`` and the matching
        row of ``labelDf``.

    Raises:
        Exception: Any error raised while reading the file or looking up the
            labels is printed together with the id and then re-raised.
    """
    integer_value = tf.strings.to_number(star, out_type=tf.int64)
    python_int = integer_value.numpy()

    file_path = 'train/'+str(python_int)+'/combined.npz'
    try:
        with np.load(file_path) as data:
            features = data['a'][0,:,0:283,:]
        labels = labelDf.loc[python_int].to_numpy()
        return preprocess_data(features, labels)
    except Exception as e:
        print("Error loading file:", e, python_int)
        # Falling through would return None, which the tf.py_function wrapper
        # cannot turn into the two declared outputs; the resulting error would
        # hide the real cause. Re-raise the original exception instead.
        raise
    

def create_dataset(star_list, batch_size, shuffle=True):
    """Build a batched, prefetched ``tf.data`` pipeline over ``star_list``.

    Each element of ``star_list`` is passed to ``load_npz`` through
    ``tf.py_function``. The results are then pinned to static shapes
    ``[5625, 283, 4]`` (features) and ``[283]`` (labels).

    Args:
        star_list: Sequence of star ids, sliced element-wise into the dataset.
        batch_size: Number of stars per batch.
        shuffle: When true, shuffle with a buffer covering the whole list.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches.
    """
    def load_and_process(star):
        # Unpack into a tuple: tf.data treats a returned list differently.
        features, labels = tf.py_function(
            func=load_npz,
            inp=[star],
            Tout=[tf.float64, tf.float32],
        )
        return features, labels

    def fix_shapes(features, labels):
        # py_function outputs carry no static shape; declare it explicitly.
        features = tf.ensure_shape(features, tf.TensorShape([5625, 283, 4]))
        labels = tf.ensure_shape(labels, tf.TensorShape([283]))
        return features, labels

    dataset = tf.data.Dataset.from_tensor_slices(star_list)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(star_list))

    return (
        dataset
        .map(load_and_process, num_parallel_calls=tf.data.AUTOTUNE)
        .map(fix_shapes)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )


In [4]:
# Reset TensorFlow's global seed before the pipelines are built.
tf.random.set_seed(42)
# Number of stars per batch, shared by both pipelines.
batch_size = 12

# Only the training pipeline is shuffled; the test pipeline keeps list order.
train_dataset = create_dataset(train_stars, batch_size, shuffle=True)
test_dataset = create_dataset(test_stars, batch_size, shuffle=False)

# model

In [None]:
# Reset TensorFlow's global seed at the start of the model-definition cell.
tf.random.set_seed(42)

class ReduceDim(tf.keras.layers.Layer):
    """Layer that removes the last axis of its input with ``tf.squeeze(axis=-1)``."""

    def call(self, x):
        return tf.squeeze(x, axis=-1)
    
class Reshape1(tf.keras.layers.Layer):
    """Layer that swaps axes 1 and 2 of a rank-4 input (``perm=[0, 2, 1, 3]``).

    The three constructor arguments are only stored on the instance;
    ``call`` does not read them.
    """

    def __init__(self, timepoints, representations, wavelengths, **kwargs):
        super().__init__(**kwargs)
        self.timepoints = timepoints
        self.representations = representations
        self.wavelengths = wavelengths

    def call(self, x):
        swapped = tf.transpose(x, perm=[0, 2, 1, 3])
        return swapped
    
class Reshape11(tf.keras.layers.Layer):
    """Layer that swaps axes 1 and 2 of a rank-3 input (``perm=[0, 2, 1]``)."""

    def call(self, x):
        swapped = tf.transpose(x, perm=[0, 2, 1])
        return swapped

class Reshape2(tf.keras.layers.Layer):
    """Layer that joins two tensors along their existing last axis."""

    def call(self, x_pred, x_confidence):
        merged = tf.concat([x_pred, x_confidence], axis=-1)
        return merged
    
class Reshape22(tf.keras.layers.Layer):
    """Layer that pairs two equally shaped tensors along a new trailing axis.

    Index 0 of the new axis holds ``x_pred``, index 1 holds ``x_confidence``.
    """

    def call(self, x_pred, x_confidence):
        # Give each input a trailing axis of size 1, then join on that axis.
        parts = [tf.expand_dims(part, axis=-1) for part in (x_pred, x_confidence)]
        return tf.concat(parts, axis=-1)

class TransformerEncoder(tf.keras.layers.Layer):
    """Self-attention block followed by a two-layer dense block.

    ``call`` computes, in order: multi-head self-attention with a residual
    add, ``ffn1``, ``layer_norm1``, ``ffn2``, and ``layer_norm2`` over the
    sum with the post-attention residual.

    Args:
        embed_dim: Units of the second dense layer; also used to derive the
            per-head ``key_dim`` as ``embed_dim // num_heads``.
        num_heads: Number of attention heads.
        feed_forward_dim: Units of the first dense layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``), as
            in the other custom layers of this notebook.
    """

    def __init__(self, embed_dim, num_heads, feed_forward_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.feed_forward_dim = feed_forward_dim
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim//num_heads)
        self.ffn2 = tf.keras.layers.Dense(embed_dim)
        self.ffn1 = tf.keras.layers.Dense(feed_forward_dim)
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x):
        residual = x
        x = self.att(x, x)  # query and value are both x: self-attention
        x = x + residual
        residual = x
        # NOTE(review): neither dense layer has an activation, and the first
        # normalisation sits between them instead of after the attention
        # residual. Confirm this is intended.
        x = self.ffn1(x)
        x = self.layer_norm1(x)
        x = self.ffn2(x)
        # The residual add requires ffn2's output width (embed_dim) to match
        # the width of the layer input.
        x = self.layer_norm2(x + residual)
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "feed_forward_dim": self.feed_forward_dim,
        })
        return config
    

# Input dimensions used by the model builders below as
# Input(shape=(timepoints, wavelengths, representations)).
timepoints = 5625
representations = 4
wavelengths = 283
# Not referenced by any builder visible in this notebook.
targetWavelengths = 283
def buildModel_bad(outputDim = 283):
    """Build the abandoned wavelength-major transformer model.

    The input is transposed so that wavelengths come before timepoints, the
    representation axis is collapsed with a 1x1 convolution, and the result
    is passed through five ``TransformerEncoder`` layers before two
    single-unit ReLU heads are concatenated.

    Args:
        outputDim: Not used by this builder.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    x = inp

    # plan:
    # 1. use a conv filter to go from (timepoints, wavelengths, representations) -> (timepoints, wavelengths)
    # 2. transpose (timepoints, wavelengths) -> (wavelengths, timepoints)

    # edit: doesn't make sense, since the transformer should not learn similarities between wavelengths but rather similarities between timepoints

    # layout after the transpose: batch, wavelengths, time, representation

    x = Reshape1(timepoints, representations, wavelengths)(x) #make to [batch_size, wavelengths, time,repr]
    # A 1x1 Conv2D with a single filter reduces the last axis to size 1 ...
    x = tf.keras.layers.Conv2D(filters=1, kernel_size=(1, 1), padding='valid')(x)
    # ... which ReduceDim then squeezes away.
    x = ReduceDim()(x)
    dim = timepoints #int(timepoints/4)# x.shape[2]
    x = tf.keras.layers.Dense(dim)(x)

    # NOTE(review): `dim` is halved after every encoder, yet each encoder adds
    # its input back as a residual. From the second iteration on the widths
    # look incompatible, presumably why this builder is marked `_bad`. Confirm.
    for i in range(5):
        x = TransformerEncoder(embed_dim=dim, num_heads=4, feed_forward_dim=int(dim/(2)))(x)
        dim = int(dim/2)

    # Two single-unit heads, concatenated on the last axis by Reshape2.
    x_pred = tf.keras.layers.Dense(1, activation='relu')(x)
    x_confidence = tf.keras.layers.Dense(1, activation='relu')(x)
    x = Reshape2()(x_pred, x_confidence)

    model = tf.keras.Model(inp, x)
    return model


def buildTransfModel(outputDim = 283):
    """Build the transformer model operating on index 0 of the last input axis.

    Pipeline: slice ``inp[:, :, :, 0]``, five ``TransformerEncoder`` layers
    (``embed_dim=wavelengths``, 4 heads, ``feed_forward_dim=wavelengths*4``),
    a swap of axes 1 and 2, ``Dense(3000)`` with layer normalisation, and two
    linear single-unit heads concatenated on the last axis.

    Args:
        outputDim: Not used by this builder.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    # Only index 0 of the last input axis is fed to the network.
    x = inp[:, :, :, 0]

    for _ in range(5):
        encoder = TransformerEncoder(embed_dim=wavelengths, num_heads=4, feed_forward_dim=wavelengths*4)
        x = encoder(x)

    # Swap axes 1 and 2 so the dense layers below act along the former axis 1.
    x = Reshape11()(x)
    x = tf.keras.layers.Dense(3000)(x)
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)

    # One value and one confidence output per position of axis 1.
    x_pred = tf.keras.layers.Dense(1, activation='linear')(x)
    x_confidence = tf.keras.layers.Dense(1, activation='linear')(x)
    outputs = Reshape2()(x_pred, x_confidence)

    return tf.keras.Model(inp, outputs)

def buildFCNModel(outputDim = 283):
    """Build the fully connected model operating on index 0 of the last input axis.

    Pipeline: slice ``inp[:, :, :, 0]``, flatten, three ReLU dense layers of
    ``283 * factor`` units, one layer normalisation, and two linear heads of
    283 units that ``Reshape22`` pairs along a new trailing axis.

    Args:
        outputDim: Not used by this builder.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    # Only index 0 of the last input axis is fed to the network.
    x = tf.keras.layers.Flatten()(inp[:, :, :, 0])

    factor = 1
    hidden_units = 283*factor
    for _ in range(3):
        x = tf.keras.layers.Dense(hidden_units, activation='relu')(x)
    # Normalisation is applied once, after the last hidden layer.
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)

    x_pred = tf.keras.layers.Dense(283, activation='linear')(x)
    x_confidence = tf.keras.layers.Dense(283, activation='linear')(x)
    outputs = Reshape22()(x_pred, x_confidence)

    return tf.keras.Model(inp, outputs)


# Active architecture: the fully connected model. Swap the comment below to
# try the transformer variant instead.
model = buildFCNModel() 
#model = buildTransfModel()
model.summary()

In [None]:
# Pull one batch from each pipeline and run the model once on the training
# batch; `batch` and `out` are reused by later cells.
batch = next(iter(train_dataset))
out = model(batch[0])
test_batch = next(iter(test_dataset))
# Display dtypes and shapes of features, labels and model output.
batch[0].dtype ,batch[1].dtype, out.dtype,batch[0].shape ,batch[1].shape, out.shape

#normData = np.zeros_like(batch[0])
#normTest = np.zeros_like(test_batch[0])
#for i in range(batch[0].shape[2]):
#    for dim in range(batch[0].shape[3]):
#        meanNorm = np.mean(batch[0][:,:,i,dim])
#        stdNorm = np.std(batch[0][:,:,i,dim])
#        normData[:,:,i,dim] = ((batch[0][:,:,i,dim] - meanNorm) / (stdNorm + 1e-7))
#        normTest[:,:,i,dim] = ((test_batch[0][:,:,i,dim] - meanNorm) / (stdNorm + 1e-7))
#
##normData.dtype
#out = model(normData)
##normData = normData.astype(np.float64)
#normData, batch[0]

In [None]:
def _gaussian_log_likelihood_terms(y_trueZScore, y_pred):
    """Per-element Gaussian log-likelihood terms on the original label scale.

    ``y_pred[..., 0]`` is the predicted z-scored value and ``y_pred[..., 1]``
    the log of the predicted standard deviation in z-score units. Because
    ``y_zScore = (y - mean) / std``, a standard deviation in z-score units
    maps back as ``stdDev = stdDev_zScore * std``, i.e.
    ``log(stdDev) = log(stdDev_zScore) + log(std)``.

    Returns:
        ``(L_pred, L_ref, L_ideal)`` where
        ``L_pred`` is the log-density of the true value under the prediction,
        ``L_ref`` the log-density under the reference ``N(mean, std**2)``, and
        ``L_ideal`` the (scalar) log-density of a perfect prediction with
        variance ``1e-10``.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    dtype = y_pred.dtype
    # `mean` / `std` are numpy float64 scalars; cast everything to the model's
    # dtype so float32 and float64 tensors are never mixed.
    y_trueZScore = tf.cast(y_trueZScore, dtype)
    label_mean = tf.cast(mean, dtype)
    label_std = tf.cast(std, dtype)
    log_2pi = tf.constant(float(np.log(2.0 * np.pi)), dtype=dtype)
    log_ideal_var = tf.constant(float(np.log(1e-10)), dtype=dtype)

    y_true = y_trueZScore * label_std + label_mean

    y_predZScore = y_pred[:, :, 0]
    log_sigma = y_pred[:, :, 1]

    y_pred0 = y_predZScore * label_std + label_mean
    logStdDev = log_sigma + tf.math.log(label_std)
    # The Gaussian log-density needs log(sigma^2) and a division by sigma^2,
    # matching L_ref and L_ideal below, which are both written with variances.
    variance = tf.exp(2.0 * logStdDev)

    L_pred = -0.5 * (log_2pi + 2.0 * logStdDev + tf.square(y_true - y_pred0) / variance)
    # ((y_true - mean) / std)^2 == y_trueZScore^2
    L_ref = -0.5 * (log_2pi + 2.0 * tf.math.log(label_std) + tf.square(y_trueZScore))
    L_ideal = -0.5 * (log_2pi + log_ideal_var)
    return L_pred, L_ref, L_ideal


def log_likelihood_zScoreTarget(y_trueZScore, y_pred):
    """Normalised Gaussian log-likelihood score for z-scored targets.

    Computes ``(sum(L_pred) - sum(L_ref)) / (sum(L_ideal) - sum(L_ref))``:
    0 means "as good as the reference ``N(mean, std**2)``", 1 means "as good
    as the ideal prediction".

    Args:
        y_trueZScore: z-scored targets, shape ``[batch, targets]``.
        y_pred: Model output, shape ``[batch, targets, 2]`` (z-scored value,
            log standard deviation in z-score units).

    Returns:
        Scalar tensor with the score.
    """
    L_pred, L_ref, L_ideal = _gaussian_log_likelihood_terms(y_trueZScore, y_pred)

    sum_ref = tf.reduce_sum(L_ref)
    # L_ideal is one scalar per element, so its sum is the scalar times the
    # number of scored elements (not a fixed 283*5625).
    n_elements = tf.cast(tf.size(L_pred), L_pred.dtype)
    L = (tf.reduce_sum(L_pred) - sum_ref) / (L_ideal * n_elements - sum_ref)

    return L

def log_loss_zScoreTarget(y_trueZScore, y_pred):
    """Loss variant of the score: mean over elements of ``1 - L``.

    Unlike ``log_likelihood_zScoreTarget`` the normalisation
    ``(L_pred - L_ref) / (L_ideal - L_ref)`` is applied per element before
    averaging.

    Args:
        y_trueZScore: z-scored targets, shape ``[batch, targets]``.
        y_pred: Model output, shape ``[batch, targets, 2]``.

    Returns:
        Scalar tensor; 0 corresponds to the ideal prediction.
    """
    L_pred, L_ref, L_ideal = _gaussian_log_likelihood_terms(y_trueZScore, y_pred)

    L = (L_pred - L_ref) / (L_ideal - L_ref)
    loss = 1.0 - L

    return tf.reduce_mean(loss)
# Evaluate the metric on the batch and model output from the cell above.
#log_likelihood_zScoreTarget(batch[1], out),log_loss_zScoreTarget(batch[1], out)
log_likelihood_zScoreTarget(batch[1], out)

In [19]:
from keras.callbacks import LearningRateScheduler
def decay_schedule(epoch, lr):
    """Step decay: multiply ``lr`` by 0.1 on every fifth epoch, except epoch 0.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate.

    Returns:
        The learning rate to use for ``epoch``.
    """
    # use `% 1` instead of `% 5` to decay after each epoch
    is_decay_epoch = epoch != 0 and epoch % 5 == 0
    return lr * 0.1 if is_decay_epoch else lr

# Callback wrapping decay_schedule. It only takes effect when passed through
# the `callbacks` argument of model.fit.
lr_scheduler = LearningRateScheduler(decay_schedule)

In [None]:
# Reset TensorFlow's global seed at the start of the training cell.
tf.random.set_seed(42)

def loss_fn0(y_true_zScore, y_pred):
    """Mean absolute error plus a small confidence-calibration term.

    ``y_pred[:, :, 0]`` is the predicted z-scored value and
    ``y_pred[:, :, 1]`` a log-confidence. The second term, weighted by
    ``0.0001``, is the absolute difference between ``exp(log-confidence)``
    and the absolute error of the value prediction.

    Returns:
        Scalar tensor: the mean over all elements of both terms combined.
    """
    predicted_z = y_pred[:, :, 0]      # y_zScore = (y - mean)/std
    log_confidence = y_pred[:, :, 1]   # logStdDev = log(stdDev / std)

    abs_error = tf.math.abs(y_true_zScore - predicted_z)
    confidence_gap = tf.math.abs(abs_error - tf.exp(log_confidence))
    return tf.reduce_mean(abs_error + 0.0001 * confidence_gap)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss=loss_fn0,metrics=[log_likelihood_zScoreTarget], optimizer=optimizer)

# train_dataset / test_dataset are already batched inside create_dataset, so
# `batch_size` is not passed to fit(): Keras documents that it must not be
# specified when the input is a dataset.
history = model.fit(train_dataset,
                    validation_data=test_dataset,
                    epochs=400,
                    #callbacks=[lr_scheduler]
                    )


In [None]:
# fcn loss on one batch ~1e-8
#gaussian_log_likelihood: -0.1196 - loss: 309017632.0000 - val_gaussian_log_likelihood: 0.8712 - val_loss: 1794403072.0000
#gaussian_log_likelihood: -24139.4531 - loss: 117853856.0000 - val_gaussian_log_likelihood: -18151.4355 - val_loss: 987686784.0000

# FCN: with conv (~70 loss -> doesnt converge as well)
# FCN: without conv, best after 400 epochs ~40 -> still values are very similar from one position

# experiments, only 1 target
# ~0.17 for fcn we can fit the data well
# ~0.16-0.17 for fcn + conv NN in the beginning
# ~19 for normalization only before last layer / same for normalization all throughout
# normalization after first dense -> helped, got better towards the end (after input doesn't converge)
# normalization of input features over time -> nans

# ---- 2 targets working --- (normalization of input features!)
# normalization of input features over time + normalization of last layer -> waaay better ~0.12
# normalization of input features over time + normalization of last layer + conv encoder ~ 0.11
# normalization layer instead of input f norm doesn't seem to work
# making network smaller (factor 1) ->~0.04
# instead of 3 fc layer only 1 fc layer + 2 output layer  -> 0.033

# ----- 12 samples ---
# 1 fc layer +2 output layers -> ~39loss / 59 loss test (overfitting after ~200 epochs)
# 3 fc layers * 4 size -> ~44 loss / 66 test loss (overfitting after ~85 epochs)
# transformer predicts same values again, even with norm values, loss ~70

# ----- 1 target + transformers
# base ~4.5 loss on one target 
# with only 1 fcn layer between output and trans a lot worse ~25 loss

# ---- 2 targets + transf
# larger encoder of transformer same issue, only fits mean


In [None]:
# Predict on the training batch fetched earlier and display, for the first 10
# targets: predicted z-score, true z-score, exp(second model output).
#pred = model.predict(normData)
pred = model.predict(batch[0])
pred[:,0:10,0], batch[1][:,0:10] ,np.exp(pred[:,0:10,1])

In [None]:
# Same comparison restricted to the first two targets.
pred[:,0:2,0], batch[1][:,0:2] ,np.exp(pred[:,0:2,1])

In [None]:
# Loss over the whole batch, then for each sample of the batch on its own
# (index i selects one sample; the printed label calls it "batch").
print('overall',loss_fn0(batch[1],pred))
for i in range(batch_size):
    print(f'batch {i}',loss_fn0(batch[1][i,:],pred[i:i+1,:,:]))