In [1]:
import os
import glob
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import scipy

In [None]:
# Read the target table, key it by planet_id, and display it as the cell output.
labelDf0 = pd.read_csv("train_labels.csv").set_index('planet_id')
labelDf0

# data

In [2]:
tf.random.set_seed(42)
# Every match is expected to look like train/<planet_id>/<file>.
files = glob.glob(os.path.join('train/', '*/*'))
# Read the planet id from the parent directory name instead of splitting on a
# backslash: the latter only works with Windows path separators and raises
# IndexError when glob returns forward-slash paths.
stars = np.unique([os.path.basename(os.path.dirname(file)) for file in files])

import random
random.seed(42)

def split_star_list(file_list, test_ratio=0.6):
    """Randomly split a sequence into a train part and a test part.

    The input is copied before shuffling, so the caller's sequence keeps its
    order. The shuffle uses the global `random` state, so seed it beforehand
    for a reproducible split.

    Args:
        file_list: list or 1-D numpy array of items (anything with `.copy()`).
        test_ratio: fraction of the items assigned to the test part.

    Returns:
        (train_files, test_files), both of the same type as `file_list`.
    """
    shuffled = file_list.copy()
    random.shuffle(shuffled)
    split_index = int(len(shuffled) * (1 - test_ratio))
    return shuffled[:split_index], shuffled[split_index:]

train_stars, test_stars = split_star_list(stars)

# Target table for every planet, indexed by planet_id.
labelDf = pd.read_csv("train_labels.csv").set_index('planet_id')

# Scalar summaries over all labels; each one reduces the per-column reductions.
# NOTE(review): stdLabels is the std of the per-column std devs, not a pooled
# std of the label values - confirm this is what the metrics below expect.
meanLabels = np.mean(labelDf.mean())
stdLabels = np.std(labelDf.std())
maxLabels = np.max(labelDf.max())
minLabels = np.min(labelDf.min())

# The same summaries restricted to the training planets.
train_ids = [int(star) for star in train_stars]
trainLabels = labelDf.loc[train_ids]
meanTrainLabels = np.mean(trainLabels.mean())
stdTrainLabels = np.std(trainLabels.std())
maxTrainLabels = np.max(trainLabels.max())
minTrainLabels = np.min(trainLabels.min())

# Scale every target column by the largest training label.
labelDf = labelDf / maxTrainLabels

# normalize over time and all samples, so we have a mean and a std dev per wavelength for all samples
def calcMeanAndStdOfTrain(train_stars):
    """Compute mean and std over time and over all given stars.

    For each star, 'train/<star>/combined.npz' is loaded and
    `data['a'][0, :, 0:283, :]` is reduced over axis 0. Sums and step counts
    are pooled across stars, so stars with different numbers of time steps
    are weighted correctly.

    Args:
        train_stars: iterable of star ids (anything `str()` turns into the
            directory name).

    Returns:
        (meanTrain, stdTrain): arrays with the shape of one time step.

    Raises:
        ValueError: if no time steps were read (e.g. `train_stars` is empty).
    """
    total = None
    total_sq = None
    n_steps = 0
    for star in train_stars:
        file_path = 'train/'+str(star)+'/combined.npz'
        with np.load(file_path) as data:
            x = data['a'][0,:,0:283,:]
            # Accumulate in float64 to limit cancellation in E[x^2] - E[x]^2.
            star_sum = np.sum(x, axis=0, dtype=np.float64)
            star_sq = np.sum(np.square(x, dtype=np.float64), axis=0)
        if total is None:
            total, total_sq = star_sum, star_sq
        else:
            total += star_sum
            total_sq += star_sq
        n_steps += x.shape[0]
    if n_steps == 0:
        raise ValueError("calcMeanAndStdOfTrain needs at least one star with data")
    meanTrain = total / n_steps
    # Rounding can push the variance slightly below zero; clip so sqrt never yields NaN.
    variance = np.maximum(total_sq / n_steps - meanTrain**2, 0.0)
    stdTrain = np.sqrt(variance)
    return meanTrain, stdTrain
# Normalisation statistics computed from the training stars only (reduced over axis 0 of each star's data).
meanTrain, stdTrain = calcMeanAndStdOfTrain(train_stars)

def normalize_over_train(features, labels):
    """Standardise features with the module-level meanTrain / stdTrain.

    Labels are returned unchanged; the 1e-6 guards against division by zero.
    """
    centred = features - meanTrain
    scaled = centred / (stdTrain + 1e-6)
    return scaled, labels

# normalize over time per sample, so we have a mean and a std dev per wavelength for each individual sample
def calcMeanAndStdOfTrainPerStar(x):
    """Return mean and population std of `x` reduced over axis 0.

    The std is computed as sqrt(E[x^2] - E[x]^2). The variance is clipped at
    zero because rounding can make it marginally negative for (near-)constant
    input, which would otherwise produce NaN.

    Args:
        x: array whose first axis is the one to reduce over.

    Returns:
        (mean, std), each with the shape of `x[0]`.
    """
    mean = np.mean(x, axis=0)
    second_moment = np.sum(x**2, axis=0) / x.shape[0]
    variance = np.maximum(second_moment - mean**2, 0.0)
    return mean, np.sqrt(variance)
def normalize_per_sample(features, labels):
    """Divide one sample's features by its own std over axis 0.

    Labels are returned unchanged.
    NOTE(review): the per-sample mean is computed but never subtracted -
    confirm that scaling without centring is intended.
    """
    _, per_sample_std = calcMeanAndStdOfTrainPerStar(features)
    return features / (per_sample_std + 1e-6), labels




def load_npz(star):
    """Load and preprocess one star (intended to run inside tf.py_function).

    Reads 'train/<id>/combined.npz', takes `data['a'][0, :, 0:283, :]`,
    averages every 25 consecutive time steps, and standardises the result
    with the training statistics. The labels come from the module-level
    `labelDf` row for the same id.

    Args:
        star: eager string tensor holding the star id.

    Returns:
        (features, labels) as numpy arrays.

    Raises:
        Any exception raised while loading or processing is logged and then
        re-raised. Returning None here would only fail later inside
        tf.py_function with an unrelated error.
    """
    integer_value = tf.strings.to_number(star, out_type=tf.int64)
    python_int = integer_value.numpy()

    file_path = 'train/'+str(python_int)+'/combined.npz'
    try:
        with np.load(file_path) as data:
            features = data['a'][0,:,0:283,:]
            labels = labelDf.loc[python_int].to_numpy()
            # Group the time axis into blocks of 25 steps and average each block.
            features = np.reshape(features,(-1,25,283,4))
            features = np.mean(features,axis=1)
            features, labels = normalize_over_train(features,labels)
            return features, labels
    except Exception as e:
        print("Error loading file:", e, python_int)
        raise
    

def create_dataset(star_list, batch_size, shuffle=True):
    """Build a batched, prefetching tf.data pipeline over the given stars.

    Each star id is loaded through `load_npz` via tf.py_function; the static
    shapes (225, 283, 4) for features and (283,) for labels are then pinned.

    Args:
        star_list: sequence of star ids.
        batch_size: number of samples per batch.
        shuffle: shuffle the ids with a buffer covering the whole list.

    Returns:
        A tf.data.Dataset yielding (features, labels) batches.
    """
    def _load_pair(star):
        feats, targets = tf.py_function(
            func=load_npz,
            inp=[star],
            Tout=[tf.float64, tf.float32]
        )
        return feats, targets

    def _pin_shapes(feats, targets):
        # py_function drops static shape information; restore it for Keras.
        return (tf.ensure_shape(feats, tf.TensorShape([225, 283, 4])),
                tf.ensure_shape(targets, tf.TensorShape([283])))

    ds = tf.data.Dataset.from_tensor_slices(star_list)
    if shuffle:
        ds = ds.shuffle(buffer_size=len(star_list))
    ds = ds.map(_load_pair, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(_pin_shapes)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [46]:
# Persist the normalisation constants for later reuse.
np.savez(
    'helpers_origiData_meanPred.npz',
    meanTrain=meanTrain,
    stdTrain=stdTrain,
    meanLabels=meanLabels,
    stdLabels=stdLabels,
    maxTrainLabels=maxTrainLabels,
)

In [3]:
tf.random.set_seed(42)
batch_size = 64

# Training data is shuffled; the held-out stars keep a fixed order.
train_dataset = create_dataset(train_stars, batch_size=batch_size, shuffle=True)
test_dataset = create_dataset(test_stars, batch_size=batch_size, shuffle=False)

# CNN

In [4]:
class Reshape1(tf.keras.layers.Layer):
    """Swap axes 1 and 2 of a rank-4 tensor; the first and last axes stay in place."""

    def call(self, x):
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
class Reshape11(tf.keras.layers.Layer):
    """Swap the last two axes of a rank-3 tensor."""

    def call(self, x):
        return tf.transpose(x, perm=[0, 2, 1])

class Reshape2(tf.keras.layers.Layer):
    """Join prediction and confidence tensors along their existing last axis."""

    def call(self, x_pred, x_confidence):
        return tf.concat([x_pred, x_confidence], axis=-1)
    
class Reshape22(tf.keras.layers.Layer):
    """Give both inputs a new trailing axis and join them along it.

    Two tensors of shape (..., n) become one tensor of shape (..., n, 2) with
    the prediction at index 0 and the confidence at index 1.
    """

    def call(self, x_pred, x_confidence):
        parts = [tf.expand_dims(t, axis=-1) for t in (x_pred, x_confidence)]
        return tf.concat(parts, axis=-1)
    
class reduce(tf.keras.layers.Layer):
    """Sum over the last axis, keeping it as a length-1 axis."""

    def call(self, x):
        return tf.reduce_sum(x, axis=-1, keepdims=True)
class reduce1(tf.keras.layers.Layer):
    """Sum over the last axis and drop it."""

    def call(self, x):
        total = tf.reduce_sum(x, axis=-1)
        return total
    
class tile(tf.keras.layers.Layer):
    """Append `mean` to `x` along the last axis (a concat, despite the name)."""

    def call(self, x, mean):
        return tf.concat([x, mean], axis=-1)
    
class tile2(tf.keras.layers.Layer):
    """Append `mean`, given a new trailing axis, to `x` along the second-to-last axis."""

    def call(self, x, mean):
        mean_row = tf.expand_dims(mean, axis=-1)
        return tf.concat([x, mean_row], axis=-2)
    
class meanOfWavelengths(tf.keras.layers.Layer):
    """Compute the mean over the last axis, optionally appended to the input.

    Args:
        concat: if True (default) return the input with its last-axis mean
            appended as one extra entry on that axis; if False return only
            the mean, kept as a length-1 last axis.
    """

    def __init__(self, concat=True, **kwargs):
        # Initialise the base layer before assigning attributes.
        super().__init__(**kwargs)
        self.concat = concat

    def call(self, x):
        m = tf.reduce_mean(x, axis=-1, keepdims=True)
        if not self.concat:
            return m
        return tf.concat([x, m], axis=-1)

    def get_config(self):
        # Record the constructor argument so a saved model restores the same mode.
        config = super().get_config()
        config.update({"concat": self.concat})
        return config


# Input dimensions used by every model builder below; they match the
# (225, 283, 4) feature shape pinned in create_dataset.
timepoints = 225
representations = 4
wavelengths = 283
targetWavelengths = 283

def cnnM(outputDim = 283):
    """Build a Conv1D model that predicts 283 values, 283 confidences, and one mean.

    The output has shape (batch, 284, 2): index 0 of the last axis holds the
    predictions, index 1 the confidences, and row 283 holds the scalar mean
    head in both columns.

    Args:
        outputDim: accepted but not read anywhere in the body.

    Returns:
        An uncompiled tf.keras.Model.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    # Keep only representation index 1 -> (batch, timepoints, wavelengths).
    x = inp[:,:,:,1]
    # Append the across-wavelength mean as one extra channel (283 -> 284).
    x = meanOfWavelengths()(x)
    
    #x = Reshape11()(x)
    dim = timepoints  # not used below
    for i in range(3):
        # Conv1D slides along axis 1 (time); each of the 284 filters reads all input channels.
        # 'valid' padding plus pooling by 2 shrinks time: 225 -> 110 -> 53 -> 24.
        x = tf.keras.layers.Conv1D(filters=284, kernel_size=(5), padding='valid')(x) 
        x = tf.keras.layers.AveragePooling1D(2)(x)

    # Put channels on axis 1 so the Dense layer acts on the time axis.
    x = Reshape11()(x)
    x = tf.keras.layers.Dense(1000)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(1000)(x)
    # Scalar mean head; the final activation is linear.
    mean = tf.keras.layers.Dense(100,activation='relu')(x)
    mean = tf.keras.layers.Dense(50,activation='relu')(mean)
    mean = tf.keras.layers.Dense(1,activation='linear')(mean)
    # Per-wavelength prediction head, with the mean appended as entry 283.
    x_pred = tf.keras.layers.Dense(283, activation='linear')(x)
    x_pred = tile()(x_pred,mean)
    #x_pred = x_pred+mean
    # Per-wavelength confidence head, padded with the same mean to match shapes.
    x_confidence = tf.keras.layers.Dense(283, activation='linear')(x)
    x_confidence = tile()(x_confidence,mean)
    x = Reshape22()(x_pred, x_confidence)

    model = tf.keras.Model(inp, x)
    return model


def cnnMeanOnly(outputDim = 283):
    """Build a Conv1D model fed only with the across-wavelength mean signal.

    The output layout is the same (batch, 284, 2) as cnnM: predictions and
    confidences in the last axis, with the scalar mean head in row 283.

    Args:
        outputDim: accepted but not read anywhere in the body.

    Returns:
        An uncompiled tf.keras.Model.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    # Keep only representation index 1 -> (batch, timepoints, wavelengths).
    x = inp[:,:,:,1]
    # concat=False: keep just the mean over wavelengths -> (batch, timepoints, 1).
    x = meanOfWavelengths(False)(x) #
    
    #x = Reshape11()(x)
    #dim = timepoints
    for i in range(3):
        # Filter count grows 40 -> 80 -> 120; MaxPooling1D uses its default pool size.
        x = tf.keras.layers.Conv1D(filters=40*(i+1), kernel_size=(5), padding='valid')(x)
        x = tf.keras.layers.MaxPooling1D()(x)

    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(1000)(x)
    #mean = tf.keras.layers.Dense(100,activation='relu')(x)
    #mean = tf.keras.layers.Dense(50,activation='relu')(mean)
    # Scalar mean head, read directly from the shared Dense features.
    mean = tf.keras.layers.Dense(1,activation='linear')(x)
    x_pred = tf.keras.layers.Dense(283, activation='linear')(x)
    x_pred = tile()(x_pred,mean)
    #x_pred = x_pred+mean
    x_confidence = tf.keras.layers.Dense(283, activation='linear')(x)
    x_confidence = tile()(x_confidence,mean)
    x = Reshape22()(x_pred, x_confidence)

    model = tf.keras.Model(inp, x)
    return model


def cnnDepthwise():
    """Build a two-branch model: depthwise convs per wavelength plus a mean branch.

    Branch `x` processes each wavelength channel with depthwise convolutions;
    branch `x0` processes the across-wavelength mean signal and ends in a
    scalar mean head. The output has shape (batch, 284, 2), with the mean in
    row 283 of both columns.

    Returns:
        An uncompiled tf.keras.Model.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    # Keep only representation index 1 -> (batch, timepoints, wavelengths).
    x = inp[:,:,:,1]
    # concat=False: x0 is the mean over wavelengths only -> (batch, 225, 1).
    x0 = meanOfWavelengths(False)(x)
    
    #x = Reshape11()(x)
    dim = timepoints  # not used below
    for i in range(3):
        # DepthwiseConv1D applies separate filters to each channel (= wavelength);
        # depth_multiplier=2 doubles the channel count on every pass.
        x = tf.keras.layers.DepthwiseConv1D(kernel_size=5,strides=1,padding='same', depth_multiplier=2,activation='relu')(x)
        #x = tf.keras.layers.Conv1D(filters=284, kernel_size=(5), padding='valid')(x)
        x = tf.keras.layers.AveragePooling1D(2)(x)
        #x = tf.keras.layers.Dense(284)(x)

    # Mean branch: four conv/pool stages with 64, 128, 192, 256 filters.
    for i in range(4):
        x0 = tf.keras.layers.Conv1D(filters=64*(i+1), kernel_size=(5), padding='valid')(x0)
        x0 = tf.keras.layers.AveragePooling1D(2)(x0)
    #x0 = tf.keras.layers.Dense(280)(x0)
    x0 = Reshape11()(x0)
    x0 = tf.keras.layers.Dense(1000)(x0)
    mean = tf.keras.layers.Flatten()(x0)
    mean = tf.keras.layers.Dense(1000)(mean)
    mean = tf.keras.layers.Dense(1000)(mean)
    mean = tf.keras.layers.Dense(50)(mean)
    mean = tf.keras.layers.Dense(1,activation='linear')(mean)

    # Project the channel axis back to 283 before and after one more depthwise pass.
    x = tf.keras.layers.Dense(283)(x)
    x = tf.keras.layers.DepthwiseConv1D(kernel_size=5,strides=1,padding='same', depth_multiplier=2,activation='relu')(x)
    x = tf.keras.layers.Dense(283)(x)
    # Move wavelengths to axis 1 so the remaining Dense layers act per wavelength.
    x = Reshape11()(x)
    x = tf.keras.layers.Dense(1000)(x)
    x = tf.keras.layers.Dense(100)(x)
    
    # One prediction and one confidence per wavelength; tile2 appends the mean as row 283.
    x_pred = tf.keras.layers.Dense(1, activation='linear')(x)
    x_pred = tile2()(x_pred,mean)
    x_confidence = tf.keras.layers.Dense(1, activation='linear')(x)
    x_confidence = tile2()(x_confidence,mean)
    x = Reshape2()(x_pred, x_confidence)

    model = tf.keras.Model(inp, x)
    return model

def cnn2():
    """Build the two-branch model currently selected for training.

    Branch `x0` (all wavelengths plus their mean, 284 channels) goes through
    six Conv1D layers and produces the scalar mean head. Branch `x` applies
    depthwise convolutions per wavelength and produces one prediction and
    one confidence per wavelength. The output has shape (batch, 284, 2),
    with the mean in row 283 of both columns.

    Returns:
        An uncompiled tf.keras.Model.
    """
    inp = tf.keras.Input(shape=(timepoints, wavelengths, representations))
    # Keep only representation index 1 -> (batch, timepoints, wavelengths).
    x = inp[:,:,:,1]
    # concat=True: x0 is timepoints (225) x wavelengths + mean (284).
    x0 = meanOfWavelengths(True)(x)
    
    # Six stacked 'valid' convolutions with kernel 20, no pooling: time 225 -> 111.
    for i in range(3*2):
        x0 = tf.keras.layers.Conv1D(filters=8, kernel_size=(20), padding='valid')(x0)
#
    #x = tf.keras.layers.Dense(1000)(x)
    x0 = tf.keras.layers.Flatten()(x0)
    #x = tf.keras.layers.Dense(1000)(x)
    x0 = tf.keras.layers.Dense(100,activation='relu')(x0)
    #x = tf.keras.layers.Dense(50,activation='relu')(x)
    # Scalar mean head with a linear activation.
    mean = tf.keras.layers.Dense(1,activation='linear')(x0)

    for i in range(3):
        # DepthwiseConv1D applies separate filters to each channel (= wavelength);
        # depth_multiplier=1 keeps the channel count at 283.
        x = tf.keras.layers.DepthwiseConv1D(kernel_size=10,strides=1,padding='same', depth_multiplier=1,activation='relu')(x)
        x = tf.keras.layers.AveragePooling1D(2)(x)
        #x = tf.keras.layers.Dense(284)(x)

    x = tf.keras.layers.DepthwiseConv1D(kernel_size=5,strides=1,padding='same', depth_multiplier=2,activation='relu')(x)
    x = tf.keras.layers.Dense(283)(x)
    # Move wavelengths to axis 1 so the remaining Dense layers act per wavelength.
    x = Reshape11()(x)
    x = tf.keras.layers.Dense(1000)(x)
    x = tf.keras.layers.Dense(100)(x)
    
    # One prediction and one confidence per wavelength; tile2 appends the mean as row 283.
    x_pred = tf.keras.layers.Dense(1, activation='linear')(x)
    x_pred = tile2()(x_pred,mean)
    x_confidence = tf.keras.layers.Dense(1, activation='linear')(x)
    x_confidence = tile2()(x_confidence,mean)
    x = Reshape2()(x_pred, x_confidence)

    model = tf.keras.Model(inp, x)
    return model

# Select the architecture to train; the alternatives are kept for quick switching.
#model = cnnDepthwise() 
#model = cnnM() 
model = cnn2() 
model.summary()




In [5]:
# Smoke test: pull one batch from each dataset, run a forward pass, and
# display the dtypes and shapes of inputs, labels, and model output.
batch = next(iter(train_dataset))
out = model(batch[0])
test_batch = next(iter(test_dataset))
batch[0].dtype ,batch[1].dtype, out.dtype,batch[0].shape ,batch[1].shape, out.shape

(tf.float64,
 tf.float32,
 tf.float32,
 TensorShape([64, 225, 283, 4]),
 TensorShape([64, 283]),
 TensorShape([64, 284, 2]))

In [6]:
def log_likelihood_maxScaling(y_trueMax, y_predAll):
    """Normalised Gaussian log-likelihood score for max-scaled targets.

    Args:
        y_trueMax: targets divided by maxTrainLabels, shape (batch, 283).
        y_predAll: model output of shape (batch, 284, 2). Rows 0..282 hold
            (deviation, log sigma) per wavelength in scaled units; row 283,
            column 0 holds the predicted mean.

    Returns:
        Scalar (L_pred - L_ref) / (L_ideal - L_ref), each term summed over
        all elements.

    NOTE(review): the reference term uses stdLabels**2 as its standard
    deviation and the ideal term uses variance (1e-5)**4 - confirm both
    match the intended scoring metric.
    """
    # Split the per-wavelength rows from the mean row.
    y_pred=y_predAll[:,0:283,:]
    y_predMean = y_predAll[:,283,0:1]

    # Undo the max scaling of the targets.
    y_true = y_trueMax * maxTrainLabels #std + mean   # y_zScore = (y - mean) / std -> y = y_zScore *std + mean

    # Prediction = predicted deviation + predicted mean (still in scaled units).
    y_predMax = y_pred[:, :,0]+y_predMean
    log_sigma = y_pred[:, :,1]  # predicted log of the std dev in scaled units

    y_pred0 = y_predMax *maxTrainLabels #* std + mean
    sigma = tf.exp(log_sigma)*maxTrainLabels  # std dev back in original units
    logStdDev = tf.math.log(sigma*sigma)  # despite the name, this is log(sigma^2), i.e. the log variance

    # Per-element Gaussian log-densities for the prediction, the reference, and the ideal case.
    L_pred = -0.5*(tf.math.log(2*np.pi) + logStdDev + tf.square((y_true - y_pred0) / sigma))
    L_ref = -0.5*(tf.math.log(2*np.pi) +  tf.math.log(stdLabels**4) + tf.square((y_true - meanLabels)/(stdLabels*stdLabels)))   # reference: mean meanLabels, std dev stdLabels**2
    L_ideal = -0.5*(tf.math.log(2*np.pi) + tf.math.log((1e-5)**4)) * tf.ones_like(y_predMax)  # zero residual term
    #print(L_pred)
    #print(L_ref)
    #print(L_ideal)
    #print(tf.reduce_sum(L_pred),tf.reduce_sum(L_ideal),tf.reduce_sum(L_ref))
    L = (tf.reduce_sum(L_pred) -tf.reduce_sum(L_ref)) / (tf.reduce_sum(L_ideal) - tf.reduce_sum(L_ref))
    
    return L

def log_likelihood_maxScaling_scipy(y_trueMax, y_predAll):
    """scipy.stats cross-check of log_likelihood_maxScaling.

    Args:
        y_trueMax: targets divided by maxTrainLabels, shape (batch, 283).
        y_predAll: model output of shape (batch, 284, 2); rows 0..282 hold
            (deviation, log sigma), row 283 column 0 holds the predicted mean.

    Returns:
        (GLL_pred - GLL_mean) / (GLL_true - GLL_mean) as a numpy scalar.

    NOTE(review): the reference scale is stdLabels**2 and the ideal scale is
    1e-10 - confirm these match the intended scoring metric.
    """
    # Split the per-wavelength rows from the mean row.
    y_pred=y_predAll[:,0:283,:]
    y_predMean =y_predAll[:,283,0:1]

    # Undo the max scaling of the targets.
    y_true = y_trueMax * maxTrainLabels #std + mean   # y_zScore = (y - mean) / std -> y = y_zScore *std + mean

    # Prediction = predicted deviation + predicted mean (still in scaled units).
    y_predMax = y_pred[:, :,0]+y_predMean
    log_sigma = y_pred[:, :,1]  # predicted log of the std dev in scaled units

    y_pred0 = y_predMax *maxTrainLabels #* std + mean
    sigma = tf.exp(log_sigma)*maxTrainLabels  # std dev back in original units

    # Summed Gaussian log-densities: model prediction, ideal (loc = truth), and constant-mean reference.
    GLL_pred = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_pred0, scale=sigma))
    GLL_true = np.sum(scipy.stats.norm.logpdf(y_true, loc=y_true, scale=(1e-10) * np.ones_like(y_true)))
    GLL_mean = np.sum(scipy.stats.norm.logpdf(y_true, loc=meanLabels * np.ones_like(y_true), scale=(stdLabels*stdLabels) * np.ones_like(y_true)))

    submit_score = (GLL_pred - GLL_mean)/(GLL_true - GLL_mean)
    #print(GLL_pred, GLL_true, GLL_mean)
    
    return submit_score

# Evaluate the TensorFlow and the scipy implementation on the same batch to check that they agree.
log_likelihood_maxScaling(batch[1], out),log_likelihood_maxScaling_scipy(batch[1],out)

(<tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 0.9999999999998052)

In [9]:
def combined_loss_mse(y_trueScaled, y_predAll):
    """Training loss: mean error + deviation error + confidence calibration.

    Args:
        y_trueScaled: max-scaled targets, shape (batch, 283).
        y_predAll: model output of shape (batch, 284, 2). Rows 0..282 hold
            (deviation from the mean, confidence logit); row 283, column 0
            holds the predicted mean.

    Returns:
        Scalar 10 * mean-MSE + deviation-MSE + 0.1 * confidence-MSE, where the
        confidence term is the squared gap between the combined squared error
        and the exponentiated confidence logit.
    """
    y_pred = y_predAll[:,0:283,:]
    y_predMean = y_predAll[:,283,0:1]

    # Squared error of the predicted per-sample mean.
    y_trueMean = tf.expand_dims(tf.reduce_mean(y_trueScaled,axis=-1),axis=-1)
    lossMean = tf.square(y_predMean-y_trueMean)
    # Squared error of the predicted deviation from that mean.
    y_predScaled = y_pred[:, :,0]
    y_trueDiff2Mean = y_trueScaled - y_trueMean
    lossDiff2Mean = tf.square(y_trueDiff2Mean - y_predScaled)

    # Confidence = exp(logit), switching to a linear branch above a logit of 20.
    # The logit is clamped before exp: otherwise exp overflows to inf for large
    # logits and the unselected tf.where branch yields NaN gradients (0 * inf).
    rawConfidence = y_pred[:, :,1]
    largerThanT = tf.greater(rawConfidence, 20.0)
    logConfidence = tf.where(
        largerThanT,
        rawConfidence + tf.exp(20.0),
        tf.exp(tf.minimum(rawConfidence, 20.0)),
    )

    lossPred = lossMean + lossDiff2Mean
    loss_2 = tf.square(lossPred-(logConfidence))

    rmLossMean = tf.reduce_mean(lossMean)
    rmLossDiff2Mean = tf.reduce_mean(lossDiff2Mean)
    rmLossLog = tf.reduce_mean(loss_2)
    combinedLoss = 10*rmLossMean + rmLossDiff2Mean + 0.1*rmLossLog
    return combinedLoss

def mse(y_trueScaled, y_predAll):
    """Per-sample mean squared error in original (unscaled) label units.

    The prediction is deviation + mean (rows 0..282 plus row 283 of column 0),
    and both sides are multiplied by maxTrainLabels before comparing.
    Returns one value per sample (mean over the last axis).
    """
    deviation = y_predAll[:,0:283,:][:, :,0]
    offset = y_predAll[:,283,0:1]
    combined = (deviation + offset)

    target = y_trueScaled*maxTrainLabels
    predicted = combined*maxTrainLabels
    squared_error = tf.square(target-predicted)
    return tf.reduce_mean(squared_error,axis=-1)

def mean_mae(y_trueScaled, y_predAll):
    """Mean absolute error between the predicted mean (row 283, column 0) and the true per-sample mean."""
    predicted_mean = y_predAll[:,283,0:1]
    target_mean = tf.reduce_mean(y_trueScaled, axis=-1, keepdims=True)
    return tf.reduce_mean(tf.abs(predicted_mean - target_mean))

def deviation_mae(y_trueScaled, y_predAll):
    """Mean absolute error of the predicted deviations from the per-sample mean.

    Compares rows 0..282 of column 0 against the targets with their own
    per-sample mean removed.
    """
    predicted_deviation = y_predAll[:,0:283,:][:, :,0]
    target_mean = tf.reduce_mean(y_trueScaled, axis=-1, keepdims=True)
    target_deviation = y_trueScaled - target_mean
    return tf.reduce_mean(tf.abs(target_deviation - predicted_deviation))

def logLoss_mae(y_trueScaled, y_predAll):
    """Metric: mean absolute gap between the absolute error and the confidence.

    Args:
        y_trueScaled: max-scaled targets, shape (batch, 283).
        y_predAll: model output of shape (batch, 284, 2); rows 0..282 hold
            (deviation, confidence logit), row 283 column 0 the predicted mean.

    Returns:
        Scalar mean of |(|mean error| + |deviation error|) - confidence|.
    """
    y_pred = y_predAll[:,0:283,:]
    y_predMean = y_predAll[:,283,0:1]

    y_trueMean = tf.expand_dims(tf.reduce_mean(y_trueScaled,axis=-1),axis=-1)
    lossMean = tf.abs(y_predMean-y_trueMean)
    y_predScaled = y_pred[:, :,0]
    y_trueDiff2Mean = y_trueScaled - y_trueMean
    lossDiff2Mean = tf.abs(y_trueDiff2Mean - y_predScaled)

    # Confidence = exp(logit), switching to a linear branch above a logit of 20.
    # The logit is clamped before exp so the unselected tf.where branch can
    # never overflow to inf and produce NaN gradients.
    rawConfidence = y_pred[:, :,1]
    largerThanT = tf.greater(rawConfidence, 20.0)
    logConfidence = tf.where(
        largerThanT,
        rawConfidence + tf.exp(20.0),
        tf.exp(tf.minimum(rawConfidence, 20.0)),
    )

    lossPred = lossMean + lossDiff2Mean
    loss_2 = tf.abs(lossPred-(logConfidence))
    rmLossLog = tf.reduce_mean(loss_2)
    return rmLossLog

def log_loss_maxScaling(y_trueMax, y_pred):
    """Loss form of the normalised log-likelihood score: 1 - score.

    Args:
        y_trueMax: targets divided by maxTrainLabels.
        y_pred: tensor whose last axis holds (prediction, log sigma) in
            scaled units.

    Returns:
        Scalar -((L_pred - L_ref) / (L_ideal - L_ref) - 1).

    NOTE(review): unlike log_likelihood_maxScaling, this does not split off
    row 283 (the mean row), so it appears to expect a (batch, 283, 2) output
    - confirm before using it with the current 284-row models.
    """
    # Undo the max scaling of the targets.
    y_true = y_trueMax * maxTrainLabels #std + mean   # y_zScore = (y - mean) / std -> y = y_zScore *std + mean

    y_predMax = y_pred[:, :,0]
    log_sigma = y_pred[:, :,1]  # predicted log of the std dev in scaled units

    y_pred0 = y_predMax *maxTrainLabels #* std + mean
    sigma = tf.exp(log_sigma)*maxTrainLabels  # std dev back in original units
    logStdDev = tf.math.log(sigma*sigma)  # despite the name, this is log(sigma^2), i.e. the log variance

    # Per-element Gaussian log-densities for the prediction, the reference, and the ideal case.
    L_pred = -0.5*(tf.math.log(2*np.pi) + logStdDev + tf.square((y_true - y_pred0) / sigma))
    L_ref = -0.5*(tf.math.log(2*np.pi) +  tf.math.log(stdLabels**4) + tf.square((y_true - meanLabels)/(stdLabels*stdLabels)))   # reference: mean meanLabels, std dev stdLabels**2
    L_ideal = -0.5*(tf.math.log(2*np.pi) + tf.math.log((1e-5)**4)) * tf.ones_like(y_predMax)
    L = -((tf.reduce_sum(L_pred) -tf.reduce_sum(L_ref)) / (tf.reduce_sum(L_ideal) - tf.reduce_sum(L_ref)) -1)
    return L

# Evaluate the three metrics and the training loss on the smoke-test batch.
#loss_mse(batch[1],out)
#combined_loss_mse(batch[1],out)
mean_mae(batch[1],out),deviation_mae(batch[1],out),logLoss_mae(batch[1],out),combined_loss_mse(batch[1],out)

(<tf.Tensor: shape=(), dtype=float32, numpy=1.3194044>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.005194291>,
 <tf.Tensor: shape=(), dtype=float32, numpy=1.1058421>,
 <tf.Tensor: shape=(), dtype=float32, numpy=30.693428>)

In [10]:
tf.random.set_seed(42)

# NOTE: LR_SCHEDULE is not defined in the visible notebook. This callback is
# therefore not passed to fit(); define the schedule before enabling it.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lambda step: LR_SCHEDULE[step], verbose=0)

# Write checkpoints to a directory relative to the notebook instead of a
# user-specific absolute path, and make sure it exists.
CHECKPOINT_DIR = "training_full_model"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(CHECKPOINT_DIR, "model-{epoch:02d}.weights.h5"),
    save_weights_only=True,  # Set to False if you want to save the entire model
    save_freq=100 * 5,  # an integer save_freq counts batches, not epochs
    verbose=1
)

#optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(loss=combined_loss_mse
              #,metrics=[log_likelihood_maxScaling]
              ,metrics=[mean_mae,deviation_mae,logLoss_mae]
              , optimizer=optimizer)

# The datasets are already batched by create_dataset, so batch_size is not
# passed to fit() here.
history = model.fit(train_dataset,
                    validation_data=test_dataset,
                    epochs=800,
                    callbacks=[checkpoint_callback]
                    #callbacks=[lr_callback]
                    )


Epoch 1/800
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3s/step - deviation_mae: 0.0054 - log_loss_mae: 0.6341 - loss: 15.3297 - mean_mae: 0.9232 - val_deviation_mae: 0.0041 - val_log_loss_mae: 0.8745 - val_loss: 0.4354 - val_mean_mae: 0.1267
Epoch 2/800
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step - deviation_mae: 0.0056 - log_loss_mae: 0.6654 - loss: 1.7713 - mean_mae: 0.3328 - val_deviation_mae: 0.0041 - val_log_loss_mae: 0.7132 - val_loss: 1.3386 - val_mean_mae: 0.2808
Epoch 3/800
[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m6s[0m 2s/step - deviation_mae: 0.0036 - log_loss_mae: 0.7016 - loss: 1.3068 - mean_mae: 0.2929

In [None]:
# Overfitting check: train on the single in-memory training batch and validate
# on the single test batch. This overwrites `history` from the full run.
history = model.fit(#train_dataset, 
                    batch[0],batch[1], #verbose=2,
                    #validation_data=test_dataset,
                    validation_data=(test_batch[0],test_batch[1]),
                    epochs=800, batch_size=batch_size,
                    callbacks=[checkpoint_callback]
                    #callbacks=[lr_callback]
                    )

In [None]:
# experiments with original data (faster processing)
# equally weighted loss of mean & stddev -> mse ~ 7/7 -> but can't fit targets at all!!!!!! this is the issue

# predicting mean only doesn't work at all, only getting ~0.17 mae on the mean, pretty much 2 modes
# solution: mean has to have activation function linear instead of relu!! network has to be able to see that negative values are bad, otherwise no feedback signal!!!

# predicting mean only for 1 batch:
#   base run -> 0.08 mae
#   compensate targets to 0 mean -> no benefit (mae loss:0.3)
#   with big model also works, mae < 0.08 for one batch & ones in other outputs

# predicting everything for 1 batch:
#   loss goes down nicely, hitting <0.7 for mean mae

# fitting full thing
# massive blowup at epoch 801, before loss 3/3
# loss 1.5/3.0 -> we fit train well but test has some issues / wavelengths are not continuous since each point is individually predicted


In [60]:
# Persist only the weights of the current model; the architecture must be
# rebuilt with the same builder function before loading them again.
#model.save('originalData_fullM_866_epochs_blowup59_63.keras')
# Save weights
model.save_weights('800_fullModel_linearMean0078_0017_fullLoss_1_3.weights.h5')

# Load weights
#loaded_weights = model.load_weights('170_epochs_accLoss_reluActivation_23_23.weights.h5')

In [None]:
# A saved .keras model that contains the custom layers, loss, or metrics from
# this notebook can only be deserialised if they are supplied by name.
loaded_model = tf.keras.models.load_model(
    '120_epochs_accLoss31_30.keras',
    custom_objects={
        'Reshape1': Reshape1,
        'Reshape11': Reshape11,
        'Reshape2': Reshape2,
        'Reshape22': Reshape22,
        'reduce': reduce,
        'reduce1': reduce1,
        'tile': tile,
        'tile2': tile2,
        'meanOfWavelengths': meanOfWavelengths,
        'combined_loss_mse': combined_loss_mse,
        'mean_mae': mean_mae,
        'deviation_mae': deviation_mae,
        'logLoss_mae': logLoss_mae,
    },
)

In [None]:
# first try couldn't fit the values, just predicted mean if I kept the shape (output layer of shape 1 - tensor 283x100 -> 283x1)
# having a flatten layer between converges

# flatten layer and 12 samples -> predict the same for all 12 samples, maybe not enough filters

# PROBLEM why we can't fit multiple targets: layer normalization!! use batch norm instead

#---- with batch norm
# cnn model + mean estimation, loss ~80, but predicting different mean
# fcn model, loss ~81
# fcn model / min scaling -> loss 0.8 / 27 (lots of negative predictions)
# fcn model / max scaling / relu activation -> 3.1/8 (lots of 0 predictions) / with scale of 100, loss =14.9/43691
# cnn model / max scaling / mean pred -> 6.0/inf
# cnn ... no layer norm in beginning -> 15

# loss function for every output (batch,283) / 100 epochs
# cnn 1.5 loss
# cnn with smaller LR 0.22(also after 200 epochs)
# cnn with separate mean prediction loss 20.5 (lr0.0001) vs 3.5(lr0.0005) / can't even fit 2 samples (0.5 for lr 0.0005)

# cnn without mean prediction (2 samples, lr0.0005) 22.4   / lr0.001 0.4 loss, but targets still fit badly / only fitting target noVar 0.08 still bad

# difference between train / test = batch norm has significant effect here
#fcn + mean, 2 samples LR0.0005 -> 
#fcn + mean, 2 samples only loss on target -> 
#fcn + mean, 1 sample, only loss on target -> 0.03 targets are far off
#fcn, 1 sample, only loss on target -> 0.4 targets are far off
# -> train data was not normalized!!

# with regularization / without regularization doesn't matter that much as long as sample is normalized
# normalization per sample -> predict the same for all targets ~0.0978
# norm per sample + bias estimation -> predict same for all targets (besides 1) ~0.0978

# with learning rate schedule -> 0.06 lots more possible to not get stuck in local minima

#cnn / norm over train / bias estimation / lr0.01 / only target -> ~45 sum loss
#cnn / norm over train / bias estimation / lr0.01 / target + loss2 -> ~47 after 95 epochs (15 after ~150 epochs)

#cnn / norm over train / bias est / lr0.01 / target + loss / activation function relu instead of linear (conf + bias / still nan because stddev = 0, log(0) = nan)
# 39/40 but training seems to be a lot more stable
# after 170 epochs 23.7/23.5
# after 220 epochs 11/19 (but already went down to 14/16)
# after 250 epochs 12/16 (but already 16/15)
# after 300 epochs 12/17


# Loss curves recorded by the most recent model.fit call.
train_loss = history.history['loss']
test_loss = history.history['val_loss']

epochs = range(1, len(train_loss) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(epochs, train_loss, 'b', label='Training loss')
#ax.plot(epochs, test_loss, 'r', label='Test loss')
ax.set_title('Training and Test Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# eval

In [None]:
#pred = model.predict(normData)
outputs = model.predict(test_batch[0])
# Copy the slice: a plain slice is a view, so adding the mean in place would
# also modify `outputs` and skew every later cell that reuses it.
pred = outputs[:,0:283,:].copy()
pred[:,:,0] = pred[:,:,0]+outputs[:,283,0:1]
# The labels were scaled by maxTrainLabels, so the same constant undoes the scaling.
pred[0:10:,0:2,0]*maxTrainLabels, test_batch[1][0:10:,0:2]*maxTrainLabels ,np.exp(pred[0:10:,0:2,1])*maxTrainLabels

In [None]:
# `outputs` was predicted from test_batch, so compare against the test labels
# (not the training batch): predicted mean, true mean, and their MAE.
true_mean = np.mean(test_batch[1],axis=-1)
outputs[:,283,0], true_mean, np.sum(np.abs(outputs[:,283,0]- true_mean))/test_batch[1].shape[0]

In [None]:
# Training loss evaluated on the single held-out batch.
combined_loss_mse(test_batch[1],outputs)

In [None]:
# Normalised log-likelihood score on the single held-out batch.
outputs = model.predict(test_batch[0])
print('overall',(log_likelihood_maxScaling(test_batch[1],outputs)))
#for i in range(batch_size):
#    print(f'batch {i}',(log_likelihood_maxScaling(batch[1][i,:],outputs[i:i+1,:,:])))

In [None]:
# Per test batch: print the MSE and the mean / min / max of the predicted
# std dev in original units.
# NOTE(review): the predictions are overwritten with the labels (and the mean
# row with 0) before mse() is called, so `m` looks like it is ~0 by
# construction - confirm this is the intended sanity check.
for x,y in test_dataset:
    outputs = model.predict(x)
    outputs[:,0:283,0]=y
    outputs[:,283,0] = 0
    m = mse(y,outputs)
    s = tf.exp(outputs[:,0:283,1])*maxTrainLabels
    print(np.mean(m), np.mean(s), np.min(s),np.max(s))

In [None]:
# Shapes of the last batch left over from the loop above.
outputs.shape, y.shape

In [None]:
# Plot the first feature column (index [.., 0, 0]) over time for four training samples.
fig = go.Figure()
for sample_idx in range(4):#[2,6,10,20,100]:
    trace = go.Scatter(
        y=batch[0][sample_idx,:,0,0],
        mode='markers',
        name=f'f_{sample_idx}',
        marker=dict(size=3),
    )
    fig.add_trace(trace)

fig.show()

In [None]:
outputs = model.predict(batch[0])
# Copy the slice: a plain slice is a view, so adding the mean in place would
# also modify `outputs`.
pred = outputs[:,0:283,:].copy()
pred[:,:,0] = pred[:,:,0]+outputs[:,283,0:1]

# Ground truth vs. prediction for the first ten training samples in one figure.
fig = go.Figure()
for i in range(10): #range(12):# 
    fig.add_trace(go.Scatter(y=batch[1][i,:],mode='markers',name=f'gt_{i}',marker=dict(size=3)))
    fig.add_trace(go.Scatter(y=pred[i,:,0],mode='markers',name=f'pred_{i}',marker=dict(size=3)))
fig.show()

In [None]:
# One figure per sample: ground truth against the prediction from the previous cell.
for sample_idx in range(10): #range(12):#
    series = (
        (batch[1][sample_idx,:], f'gt_{sample_idx}'),
        (pred[sample_idx,:,0], f'pred_{sample_idx}'),
    )
    fig = go.Figure()
    for values, label in series:
        fig.add_trace(go.Scatter(y=values,mode='markers',name=label,marker=dict(size=3)))
    fig.show()