In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster with 6 single-threaded worker processes and attach a client to it.
cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Directories that each hold one Parquet dataset of training data.
folders = ['train0_25', 'train25_50', 'train50_75', 'train75_100']

# Open every directory as a Dask DataFrame, then stack them into one frame.
dfs = list(map(dd.read_parquet, folders))
data = dd.concat(dfs)

In [None]:
# Seed NumPy's global generator so the partition split below is repeatable.
np.random.seed(42)

# Put the partition indices of `data` into random order (np.random.shuffle works in place).
orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)

# The first 70% of the shuffled indices go to training, everything after that to testing.
trainSep = int(0.7 * data.npartitions)
valEnd = data.npartitions  # alternative upper bound: int(0.05 * data.npartitions) + trainSep

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling executes code from the file; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed per-column statistics used by the transforms and checks further down.
meanDict = _load_pickle('meanDict_allT.pkl')
stdDict = _load_pickle('stdDict_allT.pkl')
minDict = _load_pickle('minVal_allT2.pkl')
zscoreDict = _load_pickle('zScore_allT2.pkl')

In [None]:
# Find the target with the smallest 'min' entry and the target with the largest 'max' entry
# in minDict, together with those two values.
#
# min()/max() with a key replace the former hand-written scan: its start values (1000 and 0)
# were returned unchanged, with an empty target name, whenever no target had a 'min' below
# 1000 or a 'max' above 0. Ties still resolve to the first target in allT2.
# To leave targets out of the scan (e.g. cam_out_SOLS, cam_out_SOLL, cam_out_NETSW),
# filter allT2 before passing it in.
minT = min(allT2, key=lambda t: minDict[t]['min'])
maxT = max(allT2, key=lambda t: minDict[t]['max'])
min0 = minDict[minT]['min']
max0 = minDict[maxT]['max']

# single features
# sols and soll features are crazy small, to 1e-300 (log -> -744, with lots of values), next one is ptend_q0002_21 to 2e-62 (log -> 528)
# max can be up to 1100 for netsw (log -> 7)

In [None]:
# Display the two extremes, the targets they belong to, the log of their ratio and the log of the maximum.
min0, minT, max0, maxT, np.log(min0/max0), np.log(max0)

In [None]:
def _compute_partitions(ddf, partIdx, nParts, chunk=5):
    """Compute the first `nParts` partitions listed in `partIdx`, `chunk` at a time, into one pandas frame.

    Only whole chunks are read: int(nParts / chunk) groups of `chunk` partition indices.
    """
    pieces = [
        ddf.partitions[partIdx[start:start + chunk]].compute()
        for start in range(0, int(nParts / chunk) * chunk, chunk)
    ]
    return pd.concat(pieces)


# 20 shuffled training partitions form the base training frame ...
nPartitions = 20
baseTrain = _compute_partitions(data, sampledPartIdxTrain, nPartitions)

# ... and 10 shuffled test partitions form the base validation frame.
nPartitions = 10
baseVal = _compute_partitions(data, sampledPartIdxTest, nPartitions)

In [None]:
# Read the 'large_training_df_0001' Parquet dataset straight into a pandas DataFrame.
largeV = pd.read_parquet('large_training_df_0001')

In [None]:
# Fraction of largeV's rows that is appended to the training frame; the remaining rows are
# appended to the validation frame. With 1, every row goes to train and none to val.
largeTrainFrac = 1
sep = int(largeV.shape[0] * largeTrainFrac)
# The slice now runs to the end of the frame. The former bound of shape[0]-1 dropped the
# last row from the validation part whenever the fraction was below 1.
end = largeV.shape[0]
train = pd.concat([baseTrain, largeV.iloc[:sep]])
val = pd.concat([baseVal, largeV.iloc[sep:end]])

In [None]:
# Unbind the intermediate frames so their memory can be reclaimed; train and val were built from them.
del baseTrain, baseVal, largeV

In [None]:
# ensure the max value of each feature is in the set:
# print every target whose largest absolute value in train stays below 95% of its recorded maximum
for f in allT2:
    # Series.abs().max() is vectorised; the builtin max() walked the column element by
    # element in Python, and the old code did that twice per reported target.
    trainMax = train[f].abs().max()
    if trainMax < minDict[f]['max']*0.95:
        print(f, trainMax, minDict[f]['max'])

# TODO: try with a better dataset -> find all max values of all targets

# transform targets

In [None]:
"""
absolute
"""
def custom_log_3(x, minValue, maxValue):
    """Log-transform the magnitude of `x`; the sign of `x` is discarded.

    Magnitudes below max(1e-60, |minValue|) are raised to that floor, the result is
    scaled by 0.01 / maxValue, and the natural logarithm is returned.

    Parameters:
        x: array-like with a .copy() method that supports abs() and boolean-mask
           assignment (pandas Series and numpy arrays both work). It is not modified.
        minValue: lower bound for magnitudes; at least 1e-60 is always used.
        maxValue: scale divisor for the magnitudes.

    Returns:
        An object of the same kind as `x` holding log(|x| / maxValue * 0.01).
    """
    floor = max(1e-60, abs(minValue))

    magnitude = abs(x.copy())
    # Zero carries no sign, so everything below the floor collapses onto the floor itself.
    magnitude[magnitude < floor] = floor

    scaled = magnitude / maxValue * 0.01
    return np.log(abs(scaled))

"""
absolute
"""
def inv_custom_log_3(y, minValue, maxValue):
    """Map values from the custom_log_3 log space back to magnitudes.

    Computes exp(y), clips it to [1e-300, 0.01] and rescales by maxValue / 0.01.
    For a positive maxValue the result therefore never exceeds maxValue. The sign
    discarded by the forward transform cannot be recovered.

    Parameters:
        y: log-space values (scalar, numpy array or pandas Series). Not modified.
        minValue: unused; accepted so the signature mirrors custom_log_3.
        maxValue: the scale that was passed to the forward transform.

    Returns:
        The back-transformed magnitudes, of the same kind as `y`.
    """
    # The old body also computed abs(np.log(abs(minValue))) into a local that was never
    # read; with minValue == 0 that raised a divide-by-zero RuntimeWarning for nothing.
    # np.exp already returns a new object, so no explicit copy of `y` is needed.
    scaled = np.clip(np.exp(y), 1e-300, 0.01)
    return scaled * maxValue / 0.01


"""
continuous
"""
def custom_log_4(x, minValue, maxValue):
    """Sign-preserving log transform that is continuous through zero.

    Magnitudes below max(1e-30, |minValue|) are raised to that floor. The scaled
    log-magnitude is then shifted so that the floor maps to 0, and the sign of `x`
    is re-applied. Inputs at or below the floor (including 0) therefore map to 0.

    Parameters:
        x: array-like with a .copy() method that supports abs() and boolean-mask
           assignment (pandas Series and numpy arrays both work). It is not modified.
        minValue: lower bound for magnitudes; at least 1e-30 is always used.
        maxValue: scale divisor for the magnitudes.

    Returns:
        An object of the same kind as `x` with the signed, shifted log values.
    """
    floor = max(1e-30, abs(minValue))

    magnitude = abs(x.copy())
    magnitude[magnitude < floor] = floor

    # Scaling by 0.0001 / maxValue before the log; the shift is minus the log of the scaled floor.
    logScaled = np.log(abs(magnitude / maxValue * 0.0001))
    shift = -np.log(floor / maxValue * 0.0001)

    return abs(logScaled + shift) * np.sign(x)

"""
continuous region
"""
def inv_custom_log_4(y, minValue, maxValue):
    """Undo custom_log_4: turn signed, shifted log values back into signed values.

    The shift is rebuilt from the same floor, max(1e-30, |minValue|), and removed from
    |y|. The result is exponentiated, clipped to [0, 1e300], given the sign of `y`
    and rescaled by maxValue / 0.0001. A `y` of 0 maps to 0.

    Parameters:
        y: transformed values with a .copy() method (pandas Series or numpy array).
           It is not modified.
        minValue: the lower bound that was passed to the forward transform.
        maxValue: the scale that was passed to the forward transform.

    Returns:
        The back-transformed values, of the same kind as `y`.
    """
    floor = max(1e-30, abs(minValue))
    shift = -np.log(floor / maxValue * 0.0001)

    scaled = np.exp(abs(y.copy()) - shift)
    scaled = np.clip(scaled, 0, 1e300)
    return scaled * np.sign(y) * maxValue / 0.0001

def custom_log_5(x, minValue, maxValue):
    """Sign-preserving log transform relative to a minimum magnitude.

    Magnitudes below max(1e-30, |minValue|) are raised to that floor. The reciprocal
    magnitude is normalised by the reciprocal floor, which gives values in (0, 1], and
    the log is taken. The absolute log then gets the sign of `x`. Inputs at or below
    the floor (including 0) map to 0.

    Parameters:
        x: array-like with a .copy() method that supports abs() and boolean-mask
           assignment (pandas Series and numpy arrays both work). It is not modified.
        minValue: lower bound for magnitudes; at least 1e-30 is always used.
        maxValue: unused; accepted so the signature matches the other transforms.

    Returns:
        An object of the same kind as `x` with the signed log values.
    """
    minValue = max(1e-30, abs(minValue)) # everything smaller is treated as the floor
    y = abs(x.copy())
    y[y<minValue] = minValue

    # The debug print(y) calls that used to sit here dumped the whole column twice per
    # call, which floods the output when the transform runs over every target.
    y = 1/y                 # reciprocal: small magnitudes become large
    y = y / (1/minValue)    # normalise so the floor maps to 1 and larger magnitudes to < 1
    y = np.log(y)           # all values are <= 0 now
    return abs(y) * np.sign(x)

"""
continuous region
"""
def inv_custom_log_5(y, minValue, maxValue):
    """Undo custom_log_5: turn signed log values back into signed values.

    exp(-|y|) is divided by the floor max(1e-30, |minValue|), the reciprocal of that is
    taken and the sign of `y` is applied. A `y` of 0 maps to 0.

    Parameters:
        y: transformed values with a .copy() method (pandas Series or numpy array).
           It is not modified.
        minValue: the lower bound that was passed to the forward transform.
        maxValue: unused; accepted so the signature matches the other transforms.

    Returns:
        The back-transformed values, of the same kind as `y`.
    """
    floor = max(1e-30, abs(minValue))

    inverseMagnitude = np.exp(-abs(y.copy())) * (1 / floor)
    return (1 / inverseMagnitude) * np.sign(y)

In [None]:
# Round-trip check of custom_log_3 / inv_custom_log_3 on one target, using the overall extremes min0 / max0.
f = 'ptend_q0002_55'
logF, invLogF = f + '_log', f + '_log_inv'
train[logF] = custom_log_3(train[f], min0, max0)
train[invLogF] = inv_custom_log_3(train[logF], min0, max0)

# Original, transformed and back-transformed values side by side.
train[[f, logF, invLogF]]

In [None]:
# Round-trip check of custom_log_4 / inv_custom_log_4 on one target, using the overall extremes min0 / max0.
f = 'ptend_q0002_55'
logF, invLogF = f + '_log', f + '_log_inv'
train[logF] = custom_log_4(train[f], min0, max0)
train[invLogF] = inv_custom_log_4(train[logF], min0, max0)

# Original, transformed and back-transformed values side by side.
train[[f, logF, invLogF]]

In [None]:
# Round-trip check of custom_log_5 / inv_custom_log_5 on one target, using the overall extremes min0 / max0.
f = 'ptend_q0002_55'
logF, invLogF = f + '_log', f + '_log_inv'
train[logF] = custom_log_5(train[f], min0, max0)
train[invLogF] = inv_custom_log_5(train[logF], min0, max0)

# Original, transformed and back-transformed values side by side.
train[[f, logF, invLogF]]

In [None]:
# Add a '<target>_log' column to train and val for every target, using custom_log_3
# with that target's own bounds from minDict.
logFeatures = [f + '_log' for f in allT2]
for f, logF in zip(allT2, logFeatures):
    lo, hi = minDict[f]['min'], minDict[f]['max']
    train[logF] = custom_log_3(train[f], lo, hi)
    val[logF] = custom_log_3(val[f], lo, hi)

In [None]:
# Add a '<target>_log' column to train and val for every target, using custom_log_4
# with that target's own bounds from minDict.
logFeatures = [f + '_log' for f in allT2]
for f, logF in zip(allT2, logFeatures):
    lo, hi = minDict[f]['min'], minDict[f]['max']
    train[logF] = custom_log_4(train[f], lo, hi)
    val[logF] = custom_log_4(val[f], lo, hi)

In [None]:
# Add a '<target>_log' column to train and val for every target, using custom_log_5
# with that target's own bounds from minDict.
logFeatures = [f + '_log' for f in allT2]
for f, logF in zip(allT2, logFeatures):
    lo, hi = minDict[f]['min'], minDict[f]['max']
    train[logF] = custom_log_5(train[f], lo, hi)
    val[logF] = custom_log_5(val[f], lo, hi)

## correlations

In [None]:
# Pairwise correlation between all input features (allF) and all transformed targets (logFeatures).
corrMat = train[allF+logFeatures].corr()

In [None]:
import seaborn as sns
# Draw the correlation matrix as a heatmap on an explicit 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corrMat, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# lgbms

In [None]:
# Settings shared by every per-target LightGBM regressor; the commented-out keys are tuning
# knobs that are currently not set.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    #'num_leaves': 15,
    'num_threads': 7,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': -1
}

# Create the output folders up front so save_model cannot fail on a missing directory
# after a long training run.
modelDir = 'individualLGBMs_log'
checkpointDir = modelDir + '/checkpoints'
os.makedirs(checkpointDir, exist_ok=True)

i = 0     # run counter: 0 trains from scratch, any other value continues from the saved model file
log = {}  # per-target R² scores of this run
for f in ['cam_out_SOLL', 'ptend_u_21','ptend_q0002_55','ptend_q0002_26','cam_out_FLWDS']:#allT2:
    print('processing ',f)
    fileName = modelDir + '/model_'+f+'.txt'
    logF = f+'_log'

    # Fit one regressor on the transformed target, optionally warm-started from the saved model.
    gbm = lgb.Booster(model_file=fileName) if i != 0 else None
    trainSet = lgb.Dataset(train[allF], label=train[logF], free_raw_data=False)
    valSet = lgb.Dataset(val[allF], val[logF], free_raw_data=False)
    gbm = lgb.train(params,
                trainSet,
                num_boost_round=500,
                valid_sets=[valSet],
                init_model=gbm)

    # Predictions in transformed space, then mapped back to the original target space.
    predTrain0 = gbm.predict(train[allF])
    predVal0 = gbm.predict(val[allF])
    predTrain = inv_custom_log_5(predTrain0, minDict[f]['min'],minDict[f]['max'])
    predVal = inv_custom_log_5(predVal0, minDict[f]['min'],minDict[f]['max'])

    # inv_custom_log_5 returns sign-carrying values, so they are scored against the signed
    # target. Scoring against abs(target) mis-scored every target with negative values.
    r2train =r2_score(train[f], predTrain)
    r2test =r2_score(val[f], predVal)
    r2trainT = r2_score(train[logF], predTrain0)
    r2testT = r2_score(val[logF], predVal0)
    print('r2 scores', r2train,r2test, 'transormed',r2trainT,r2testT)
    log[f]={'train':r2train,'test':r2test,'trainTransf':r2trainT,'testTransf':r2testT}

    # Training fit in the original target space ...
    plt.scatter(x=range(train.shape[0]),y=train[f], s=1,label=f)
    plt.scatter(x=range(train.shape[0]),y=predTrain, s=1,label='pred_train')
    plt.legend()
    plt.show()
    # ... and in the transformed space.
    plt.scatter(x=range(train.shape[0]),y=predTrain0, s=1,label='pred_train')
    plt.scatter(x=range(train.shape[0]),y=train[logF], s=1,label=f)
    plt.legend()
    plt.show()

    # Overwrite the rolling model file and keep a checkpoint tagged with run counter and test R².
    gbm.save_model(fileName)
    gbm.save_model(checkpointDir+'/model_'+f+'_'+str(i)+'_'+str(round(r2test,3))+'.txt')

In [None]:
""" in feature space """
def _scatter_pair(first, firstLabel, second, secondLabel):
    """Draw two equally long series as point clouds over their row position and show the figure."""
    plt.scatter(x=range(len(first)), y=first, s=1, label=firstLabel)
    plt.scatter(x=range(len(second)), y=second, s=1, label=secondLabel)
    plt.legend()
    plt.show()


# Original target space, for the target handled last by the LightGBM loop. predTrain and
# predVal come from inv_custom_log_5 and carry a sign, so they are plotted against the
# signed target; the earlier abs(target) made negative targets look like misfits.
_scatter_pair(train[f], f, predTrain, 'pred_train')
_scatter_pair(val[f], f, predVal, 'pred_test')

# Transformed space: prediction first, target on top.
_scatter_pair(predTrain0, 'pred_train', train[logF], f)
_scatter_pair(predVal0, 'pred_test', val[logF], f)

# fully connected network

In [None]:
import tensorflow as tf

from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Concatenate,BatchNormalization, Reshape
from tensorflow.keras.models import Model

In [None]:
# Seed TensorFlow so weight initialisation is repeatable.
tf.random.set_seed(42)

numF = len(allF)    # number of input features
numT = len(allT2)   # number of targets, one output unit each

# The shape must be a tuple: (numF) is just a parenthesised int, (numF,) is a 1-tuple.
# Named `inputs` so the Python builtin input() is not shadowed.
inputs = Input(shape=(numF,))

x = BatchNormalization()(inputs)

print(x.shape)
# Widening stage: layer i has (i+1)*numF units.
for i in range(1):
    x = Dense((i+1)*numF, activation='relu')(x)
    print(x.shape)
# Narrowing stage: layer i divides the current width by (i+1). Integer division is used
# because Dense needs an integer unit count, and 1/(i+1)*width produced a float.
for i in range(1):
    x = Dense(int(x.shape[1]) // (i+1), activation='relu')(x)
    print(x.shape)
output = Dense(numT, activation='linear',name='output')(x)
print(output.shape)

# RSquaredMetric is not defined in this notebook; presumably it comes from the `metrics` star import.
model = Model(inputs=inputs, outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='mse', metrics=[RSquaredMetric()])

# Fit all transformed targets jointly, validating on the transformed validation targets.
hist = model.fit(train[allF], train[logFeatures], epochs=25, batch_size=512, validation_data=(val[allF],val[logFeatures]))

In [None]:
# Network predictions (transformed target space) for the training and validation features.
predTrain0 = model.predict(train[allF])
predVal0 = model.predict(val[allF])


In [None]:
# Wrap the model outputs in DataFrames whose columns are named after the transformed targets.
predTrain0 = pd.DataFrame(predTrain0, columns=logFeatures)
predVal0 = pd.DataFrame(predVal0, columns=logFeatures)

In [None]:
# Map every predicted column back to the original target space and store it as '<target>_logi'.
# The earlier inv_custom_log_3(..., min0) call omitted the required maxValue argument and
# raised a TypeError. When the notebook runs top to bottom, the '<target>_log' columns the
# network was trained on are written last by the custom_log_5 cell, so its inverse is
# applied here with the same per-target bounds from minDict.
for f in allT2:
    logF = f+'_log'
    invF = logF+'i'
    lo, hi = minDict[f]['min'], minDict[f]['max']
    predTrain0[invF] = inv_custom_log_5(predTrain0[logF], lo, hi)
    predVal0[invF] = inv_custom_log_5(predVal0[logF], lo, hi)

In [None]:
# Per-target report for the network: R² in the original and in the transformed space,
# followed by a scatter of prediction vs. target in the transformed space.
rowIdx = range(train.shape[0])
for f in allT2:
    logF = f+'_log'
    invF = logF+'i'

    r2train = r2_score(train[f], predTrain0[invF])
    r2test = r2_score(val[f], predVal0[invF])
    r2trainLog = r2_score(train[logF], predTrain0[logF])
    r2testLog = r2_score(val[logF], predVal0[logF])
    print(f,'r2 scores', r2train,r2test, 'transormed',r2trainLog,r2testLog)

    # in feature space (disabled)
    #plt.scatter(x=rowIdx,y=train[f], s=1,label=f)
    #plt.scatter(x=rowIdx,y=predTrain0[invF], s=1,label='pred_train')
    #plt.legend()
    #plt.show()

    # in transformed space
    plt.scatter(x=rowIdx, y=predTrain0[logF], s=1, label='pred_train')
    plt.scatter(x=rowIdx, y=train[logF], s=1, label=f)
    plt.legend()
    plt.show()

In [None]:
# Rows whose back-transformed prediction exceeds 1e16, shown with the transformed prediction.
# invF and logF hold whatever an earlier cell assigned last, i.e. a single target.
predTrain0.loc[predTrain0[invF] > 10000000000000000][[invF,logF]]