In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster with 6 single-threaded worker processes and
# connect a client to it.
cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Folders (relative to the working directory) that hold the training data as Parquet.
# NOTE(review): the names suggest four quarter-splits of the full training set -- confirm.
folders = [
    'train0_25',
    'train25_50',
    'train50_75',
    'train75_100'
]

# Read Parquet files from each folder into Dask DataFrames
dfs = [dd.read_parquet(folder) for folder in folders]

# Concatenate all DataFrames into a single (lazy) Dask DataFrame
data = dd.concat(dfs)

In [None]:
# Partition-level train/test split: shuffle the partition indices once with a
# fixed seed, then cut the shuffled list at 70 %.
np.random.seed(42)

orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)  # shuffles the list in place

trainSep = int(0.7 * data.npartitions)
# The test slice currently runs to the last partition; the commented expression
# is the alternative that would carve out a 5 % slice instead.
valEnd = data.npartitions  # int(0.05* data.npartitions) + trainSep

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

In [None]:
# Load precomputed per-target statistics from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles that
# were produced by this project.
# Further down, meanDict and stdDict are indexed by target name (meanDict[f],
# stdDict[f]) and minDict[f] is read with the keys 'min', 'maxNeg' and 'maxPos'.
with open('meanDict_allT.pkl', 'rb') as f:
    meanDict = pickle.load(f)

with open('stdDict_allT.pkl', 'rb') as f:
    stdDict = pickle.load(f)

with open('minVal_allT2.pkl', 'rb') as f:
    minDict = pickle.load(f)

# zscoreDict is only displayed in the cells visible here, never used in a computation.
with open('zScore_allT2.pkl', 'rb') as f:
    zscoreDict = pickle.load(f)

In [None]:
# Base training sample: the first 15 shuffled training partitions, computed into memory.
baseTrain = data.partitions[sampledPartIdxTrain[0:15]].compute()
# Extra frames that later cells concatenate onto baseTrain.
# NOTE(review): judging by the file names these hold rows with large target values -- confirm.
# largeV_26 is only referenced in commented-out lines in the cells visible here.
largeV_26 = pd.read_csv('large_ptend_q0002_26_allData.csv')
largeV = pd.read_parquet('large_training_df_0001')

In [None]:
# Validation sample: shuffled test partitions 15..29, computed into memory.
# NOTE(review): the slice starts at 15, so the first 15 test partitions are not used here.
val = data.partitions[sampledPartIdxTest[15:30]].compute()

# log function 2

In [None]:
"""
custom log function to map into a continuous region, gives more resolution to the small values
"""
def custom_log_2(x, minValue, offset=6, nullValFactor=1):
    """Signed log transform that gives more resolution to small magnitudes.

    Zeros are replaced by the "null value" ``-minValue*nullValFactor``. The
    result is then ``log|x| - log|null value|`` for non-positive inputs and the
    negated value for positive inputs, so an input of exactly 0 maps to 0.

    Parameters
    ----------
    x : numpy array or pandas Series of floats. It is NOT modified; the
        function works on a private copy.
    minValue : magnitude that stands in for 0 in feature space.
    offset : shift applied in log space before mirroring. It cancels
        algebraically in the returned value and is kept for interface
        compatibility.
    nullValFactor : scale factor applied to ``minValue``.

    Returns
    -------
    Transformed values, same container type and shape as ``x``.

    Notes
    -----
    A true 0 is folded onto the branch that ``-minValue*nullValFactor`` falls on,
    so the direction of the dynamics around 0 is ambiguous after the transform.
    """
    nullValueFeat = -minValue*nullValFactor             # the 0-value in feature space
    x = x.copy()                                        # never touch the caller's data
    x[x==0] = nullValueFeat
    y = np.log(abs(x))
    y = y - offset                                      # move the curve down before mirroring
    nullValueLog = np.log(abs(nullValueFeat)) - offset  # the 0-value in log space
    positive = x > 0
    y[positive] = nullValueLog - (y[positive] - nullValueLog)  # mirror positive inputs around the 0-value
    y = y - nullValueLog                                # re-centre so that the 0-value maps to 0
    return y


def inv_custom_log_2(y,minDict, offset=6, nullValFactor=1):
    """Invert ``custom_log_2`` and clip the result to the physical range.

    ``custom_log_2`` returns values centred on 0: negative outputs come from
    positive inputs (mirrored branch), positive outputs from negative inputs,
    and exactly 0 stands for an input of 0. The branch tests therefore compare
    ``y`` against 0. Comparing against ``nullValueLog`` (as before) puts almost
    every value on the wrong branch and breaks the round trip.

    Parameters
    ----------
    y : numpy array or pandas Series in transformed space. Not modified.
    minDict : mapping with the keys ``'min'`` (magnitude that stands in for 0),
        ``'maxNeg'`` (lower clip bound) and ``'maxPos'`` (upper clip bound).
    offset : shift constant of the forward transform; it cancels algebraically here.
    nullValFactor : scale factor applied to ``minDict['min']``.

    Returns
    -------
    Values in feature space, same container type as ``y``, clipped to
    ``[minDict['maxNeg'], minDict['maxPos']]``.
    """
    nullValueFeat = -minDict['min']*nullValFactor
    nullValueLog  = np.log(abs(nullValueFeat)) - offset

    x = y.copy()
    x = x + nullValueLog                                   # undo the final re-centring of the forward transform
    mirrored = y < 0                                       # these came from positive inputs
    x[mirrored] = nullValueLog - (x[mirrored] - nullValueLog)  # undo the mirroring
    x = x + offset                                         # undo the offset
    x = np.exp(x)                                          # back to magnitudes (all positive afterwards)
    x[y == 0] = 0                                          # the 0-value maps back to exactly 0
    negative = y > 0                                       # these came from negative inputs
    x[negative] = -x[negative]

    # clip to physical values
    x = np.clip(x, minDict['maxNeg'], minDict['maxPos'])
    return x

In [None]:
# NOTE(review): `minValue` is first assigned in the "test transformed lgbm" cell
# further down, so on a fresh kernel this cell raises NameError.
minValue

In [None]:
# Apply the log transform with a hard-coded minimum magnitude of 4e-23.
# NOTE(review): `train`, `f` and `transfF` are only assigned in later cells, so
# this cell does not run on a fresh kernel.
train[transfF] = custom_log_2(train[f].copy(), minValue=4e-23)
val[transfF] = custom_log_2(val[f].copy(), minValue=4e-23)

In [None]:
# Non-zero magnitudes of target `f` in `train`, sorted ascending, shown with the target name.
# NOTE(review): `train` and `f` are only assigned in later cells.
abs(train.loc[train[f] != 0][f]).sort_values(), f

In [None]:
# Pull the full 'ptend_t_1' column into memory.
# f2 is a notebook global that calcNormWeightLogDist reads for its column name.
f2 = 'ptend_t_1'
datafeat2 = data['ptend_t_1'].compute()

In [None]:
# Smallest non-zero magnitude of target `f` over the full dataset.
# NOTE(review): `f` is only assigned in later cells.
abs(data.loc[data[f] != 0][f]).min().compute() 
# The full data contains smaller values than the training sample does!
# Need to find a dataset that covers both small and large values -> the extremes

In [None]:
# Full column of target `f` in memory (distinct from `datafeat2`, which holds column f2).
dataFeat = data[f].compute()

In [None]:
# Non-zero entries whose magnitude is below 1e-24.
dataFeat.loc[(dataFeat !=0) & (abs(dataFeat)<1e-24)]

In [None]:
# NOTE(review): `res` is assigned by the histogram cell that follows; on a fresh
# kernel this cell raises NameError.
res

In [None]:
# Histogram of log|value| over the non-zero entries of datafeat2 (20 bins).
fig, ax = plt.subplots()
res = ax.hist(np.log(abs(datafeat2.loc[datafeat2!=0])), bins=20)

# Put the ticks on the bin edges (res[1]) and label them with the
# back-transformed magnitudes exp(edge).
valArr = res[1]
ax.set_xticks(valArr)
ax.set_xticklabels(np.exp(valArr))

# Maybe the log transform is not the best choice: it stretches the gap between 0
# and the smallest non-zero values -> rescaling might be better.

# for q0002_26 extreme values are sparse -> log transform works perfectly
# for q0002_55 tiny values are sparse -> uninteresting

In [None]:
# Display the z-score dictionary loaded from 'zScore_allT2.pkl'.
zscoreDict

# find correct weighting

In [None]:
# Smallest and largest magnitude of datafeat2, separately for negative and positive entries.
minDimNeg = abs(datafeat2.loc[datafeat2 < 0]).min()
minDimPos = abs(datafeat2.loc[datafeat2 > 0]).min() 
maxDimNeg = abs(datafeat2.loc[datafeat2 < 0]).max()
maxDimPos = abs(datafeat2.loc[datafeat2 > 0]).max() 

minDimNeg, minDimPos, maxDimNeg, maxDimPos

In [None]:
def findBin(x,bins):
    """Return the left edge of the bin of ``bins`` that contains ``x``.

    Bins are treated as closed on both sides and searched left to right, so a
    value lying exactly on an interior edge is assigned to the lower bin.

    To absorb rounding errors in the edges, a value that falls just outside the
    overall range is still assigned to the first or last bin, as long as it is
    within 10 % of the magnitude of the respective outer edge. The tolerance
    widens the range for negative edges too. Multiplying a negative edge by
    0.9 or 1.1 (as before) narrows the range, so such values were never caught.

    Parameters
    ----------
    x : scalar value to locate.
    bins : ascending sequence of bin edges.

    Returns
    -------
    The left edge of the matching bin. If no bin matches, a message is printed
    and 0 is returned. Note that 0 can also be a genuine left edge, so callers
    cannot tell the two cases apart from the return value alone.
    """
    nEdges = len(bins)
    for i in range(nEdges-1):
        if bins[i] <= x <= bins[i+1]:
            return bins[i]

    # if rounding errors occur: widen the outer edges by 10 % of their magnitude
    if nEdges >= 2:
        lowerLimit = bins[0] - 0.1*abs(bins[0])
        upperLimit = bins[-1] + 0.1*abs(bins[-1])
        if lowerLimit <= x <= bins[1]:
            return bins[0]
        if bins[-2] <= x <= upperLimit:
            return bins[-2]

    print('didnt find bin, something is wrong!',x)
    return 0

""" calc normalized weight of histogram distribution """
def calcNormWeightLogDist(datafeat2,binsPos=10, binsNeg=10):
    """Build inverse-frequency weights from a log-spaced, sign-aware histogram.

    Bin edges are log-spaced in magnitude, computed separately for the negative
    and the positive values, and joined with a 0 edge in between. Every sample
    is weighted by 1 / (number of samples in its bin), and the weights are
    normalised to sum to 1.

    Parameters
    ----------
    datafeat2 : pandas Series of target values.
    binsPos, binsNeg : number of log-spaced bins on the positive and negative side.

    Returns
    -------
    A new DataFrame with the value column plus 'bins' (left edge as returned by
    findBin), 'weightBins' (samples per bin) and 'normWeightBins'.
    The value column is named after the Series itself (``datafeat2.name``)
    instead of the notebook global ``f2``, so the function no longer depends on
    hidden kernel state.
    """
    # log-spaced edges in magnitude, one set per sign (the counts are not needed)
    _, logEdgesNeg = np.histogram(np.log(abs(datafeat2.loc[datafeat2 < 0])), bins = binsNeg)
    _, logEdgesPos = np.histogram(np.log(abs(datafeat2.loc[datafeat2 > 0])), bins = binsPos)

    # layout: [negative edges ascending | 0 | positive edges ascending]
    nBinsN = len(logEdgesNeg)
    nBinsP = len(logEdgesPos)
    overallBins = np.zeros((nBinsP+1+nBinsN))
    overallBins[0:nBinsN] = np.sort(-np.exp(logEdgesNeg))
    overallBins[nBinsN+1:nBinsN+1+nBinsP] = np.exp(logEdgesPos)

    df = datafeat2.to_frame()
    valueCol = df.columns[0]
    df['bins'] = df[valueCol].apply(lambda x: findBin(x,overallBins))

    binCounts = df['bins'].value_counts()
    df['weightBins'] = df['bins'].map(binCounts)

    df['normWeightBins'] = 1/df['weightBins']
    df['normWeightBins'] = df['normWeightBins'] / df.normWeightBins.sum() # weights sum to 1
    return df

def calcNormWeightLogDistDf(df,f, binsPos=10, binsNeg=10):
    """Add inverse-frequency weights for column ``f`` to ``df`` (in place).

    The bin edges are log-spaced in magnitude, computed separately for negative
    and positive values of ``df[f]`` and joined with a 0 edge in between.

    Parameters
    ----------
    df : pandas DataFrame. It is modified in place and also returned.
    f : name of the column to weight.
    binsPos, binsNeg : number of log-spaced bins on the positive and negative side.

    Returns
    -------
    ``df`` with the added columns 'bins' (left edge as returned by findBin),
    'weightBins' (samples per bin) and 'normWeightBins' (1/weightBins,
    normalised to sum to 1).
    """
    values = df[f]
    negLogMagnitude = np.log(abs(values.loc[values < 0]))
    posLogMagnitude = np.log(abs(values.loc[values > 0]))
    _, negLogEdges = np.histogram(negLogMagnitude, bins = binsNeg)
    _, posLogEdges = np.histogram(posLogMagnitude, bins = binsPos)

    # layout: [negative edges ascending | 0 | positive edges ascending]
    overallBins = np.concatenate([
        np.sort(-np.exp(negLogEdges)),
        [0],
        np.exp(posLogEdges),
    ])

    _, edges = np.histogram(values, bins=overallBins)

    df['bins'] = df[f].apply(lambda v: findBin(v,edges))

    samplesPerBin = df['bins'].value_counts()
    df['weightBins'] = df['bins'].apply(lambda b: samplesPerBin[b])

    inverseCount = 1/df['weightBins']
    df['normWeightBins'] = inverseCount
    df['normWeightBins'] = inverseCount / inverseCount.sum() # weights sum to 1
    return df


def calcWeightZScore(df, f, meanDict, stdDict):
    """Add z-score based weights for column ``f`` to ``df`` (in place).

    Parameters
    ----------
    df : pandas DataFrame. It is modified in place and also returned.
    f : name of the column to weight.
    meanDict, stdDict : mappings that hold the mean and standard deviation under key ``f``.

    Returns
    -------
    ``df`` with the added columns 'zScore', 'weight' (absolute z-score) and
    'normWeightZScore' (weight divided by the sum of all weights).
    """
    centred = df[f] - meanDict[f]
    df['zScore'] = centred/stdDict[f]
    df['weight'] = df['zScore'].abs()
    totalWeight = df['weight'].sum()
    df['normWeightZScore'] = df['weight'] / totalWeight # weights sum to 1, which makes them comparable
    return df

In [None]:
# Compute both weightings (histogram bins and z-score) for 'ptend_q0001_17' over the full data.
# Note: f2 and datafeat2 are re-bound here; calcNormWeightLogDist reads the
# global f2 to name the value column, so f2 must match the Series passed in.
f2 = 'ptend_q0001_17'
datafeat2 = data[f2].compute()
df_q0001_17 = calcNormWeightLogDist(datafeat2)
df_q0001_17 = calcWeightZScore(df_q0001_17, f2, meanDict, stdDict)

In [None]:
# Keep only the two normalised weights and combine them by simple addition.
# NOTE(review): this cell is not idempotent -- a second run fails in drop()
# because 'bins', 'weightBins' and 'zScore' are already gone.
df_q0001_17 = df_q0001_17.drop(['bins','weightBins','zScore','weight'], axis=1)
df_q0001_17['weight'] = df_q0001_17['normWeightBins'] + df_q0001_17['normWeightZScore']
df_q0001_17

In [None]:
# Training frame = base sample + all rows of largeV (the commented lines are the
# variant that keeps only rows beyond 10 standard deviations).
#filtered = largeV.loc[(largeV[f] >= (meanDict[f] + 10*stdDict[f])) | (largeV[f] <= (meanDict[f] - 10*stdDict[f]))]
#train = pd.concat([baseTrain, filtered], axis=0)
train = pd.concat([baseTrain, largeV], axis=0)

# Add the bin-based and z-score-based weight columns to `train` (50 bins per sign).
f = 'ptend_q0001_17'
train = calcNormWeightLogDistDf(train, f, binsNeg=50, binsPos=50)
train = calcWeightZScore(train, f, meanDict, stdDict)

In [None]:
# Balanced resample: draw 1000 rows (with replacement) from every histogram bin
# and stack the draws into one frame. The comprehension variable avoids
# shadowing the builtin `bin` at notebook scope.
sampled = pd.concat([
    train.loc[train.bins == binEdge].sample(n=1000, replace=True)
    for binEdge in train.bins.unique()
])

In [None]:
# Display the resampled frame.
sampled

In [None]:
# Baseline LightGBM run on the raw target, trained on baseTrain only
# (the commented lines are the weighted and the resampled variants).
# NOTE(review): `allF` is not assigned in the visible cells -- presumably it
# comes from one of the star imports; confirm.
valSet = lgb.Dataset(val[allF], label=val[f], free_raw_data=False)
#trainSet = lgb.Dataset(train[allF], train[f], weight=train['normWeightBins'], free_raw_data=False)
#trainSet = lgb.Dataset(sampled[allF], sampled[f], free_raw_data=False)
trainSet = lgb.Dataset(baseTrain[allF], baseTrain[f], free_raw_data=False)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    #'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': 1
}


# gbm is reset to None first, so init_model=None and training starts from scratch.
gbm = None

gbm = lgb.train(params,
            trainSet,
            num_boost_round=500, 
            valid_sets=valSet,
            init_model=gbm)

# Note: the model was fitted on baseTrain, but the "train" score below is
# computed on `train` (baseTrain plus largeV).
predTrain = gbm.predict(train[allF])
predVal = gbm.predict(val[allF])
r2train =r2_score(train[f], predTrain)
r2test =r2_score(val[f], predVal)
print('r2 scores', r2train,r2test)# 'transormed',r2_score(train[transfF], predTrain0),r2_score(val[transfF], predVal0))

In [None]:
""" in feature space """
# Prediction vs. truth per row (x axis = row position), first for the training
# frame, then for the validation frame.
plt.scatter(x=range(train.shape[0]),y=predTrain, s=1,label='pred_train')
plt.scatter(x=range(train.shape[0]),y=train[f], s=1,label=f)
plt.legend()
plt.show()

plt.scatter(x=range(val.shape[0]),y=predVal, s=1,label='pred_test')
plt.scatter(x=range(val.shape[0]),y=val[f], s=1,label=f)

plt.legend()
plt.show()

# with both weights added: r2 scores 0.620568128775495 -12.886684419310917
# with only bin weight:    r2 scores 0.38436556429906865 -7.106889138729176

# only bin weight + filtered large Value; r2 scores 0.970014629131281 -30.927699550397456

# sampled dataframe based on bin, no weight:  r2 scores 0.38266688119407166 -6.823952224204593
# just with base train                        r2 scores 0.00028948769681691466 -0.2786449039559715 (overfitting to small)


TODO: define the weighting based on:
- how many samples are in the respective dimension: build a histogram over the positive and negative samples plus 0 -> weight all of them equally
- the z-score of the samples -> the larger the z-score, the more important the sample

# test weighted lgbm

In [None]:
# Weighted LightGBM run on the raw target: base sample plus the extreme rows of
# largeV, every row weighted by its absolute z-score.
f = 'ptend_q0001_17'

# Keep only the largeV rows that lie more than 20 standard deviations from the mean.
# (Alternative filter tried before: abs(largeV[f]) >= abs(meanDict[f]).)
filtered = largeV.loc[(largeV[f] >= (meanDict[f] + 20*stdDict[f])) | (largeV[f] <= (meanDict[f] - 20*stdDict[f]))]
print(filtered.shape, largeV.shape)
#train = pd.concat([baseTrain,largeV_f], axis = 0)
#train = pd.concat([baseTrain,largeV_26], axis = 0)
train = pd.concat([baseTrain,filtered], axis = 0)
#train = pd.concat([baseTrain,largeV], axis = 0)

valSet = lgb.Dataset(val[allF], label=val[f], free_raw_data=False)
# Per-row weight = |z-score| of the target (alternative tried before: squared z-score).
weight = abs(((train[f] - meanDict[f])/stdDict[f]))
#weight = weight / min(weight)
# Pass the weights computed above. `train` was rebuilt a few lines up, and no
# visible cell adds a 'weight' column to that rebuilt frame, so the previous
# train['weight'] lookup could not work on a fresh run.
trainSet = lgb.Dataset(train[allF], train[f], weight=weight, free_raw_data=False)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    #'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': 1
}


# gbm is reset to None first, so init_model=None and training starts from scratch.
gbm = None

gbm = lgb.train(params,
            trainSet,
            num_boost_round=500, 
            valid_sets=valSet,
            init_model=gbm)

predTrain = gbm.predict(train[allF])
predVal = gbm.predict(val[allF])
r2train =r2_score(train[f], predTrain)
r2test =r2_score(val[f], predVal)
print('r2 scores', r2train,r2test)# 'transormed',r2_score(train[transfF], predTrain0),r2_score(val[transfF], predVal0))

In [None]:
""" in feature space """
# Prediction vs. truth per row (x axis = row position), first for the training
# frame, then for the validation frame.
plt.scatter(x=range(train.shape[0]),y=predTrain, s=1,label='pred_train')
plt.scatter(x=range(train.shape[0]),y=train[f], s=1,label=f)
plt.legend()
plt.show()

plt.scatter(x=range(val.shape[0]),y=predVal, s=1,label='pred_test')
plt.scatter(x=range(val.shape[0]),y=val[f], s=1,label=f)

plt.legend()
plt.show()

# test transformed lgbm

In [None]:
# LightGBM run on the log-transformed target: fit in transformed space, then map
# the predictions back to feature space for the R² comparison.
f = 'ptend_q0001_17'
minValue = minDict[f]['min']
transfF = f+'_transf'

# `filtered` is only used for the shape printout here; training uses all of largeV.
filtered = largeV.loc[abs(largeV[f]) >= abs(meanDict[f])]
print(filtered.shape, largeV.shape)
#train = pd.concat([baseTrain,largeV_f], axis = 0)
#train = pd.concat([baseTrain,largeV_26], axis = 0)
#train = pd.concat([baseTrain,filtered], axis = 0)
train = pd.concat([baseTrain,largeV], axis = 0)

# Add the transformed target as a new column (this also modifies `val` in place).
train[transfF] = custom_log_2(train[f].copy(), minValue=minValue)
val[transfF] = custom_log_2(val[f].copy(), minValue=minValue)

valSet = lgb.Dataset(val[allF], label=val[transfF], free_raw_data=False)
# The squared z-score on the next line is overwritten immediately by the absolute z-score.
weight = (((train[f] - meanDict[f])/stdDict[f])**2)#specific weighting based on feature
weight = abs(((train[f] - meanDict[f])/stdDict[f]))
# Rescale so that the smallest weight becomes 1.
# NOTE(review): if any row sits exactly on the mean, the minimum is 0 and this
# division produces inf/NaN weights.
weight = weight / min(weight)
trainSet = lgb.Dataset(train[allF], train[transfF], weight=weight)#train['weight'], free_raw_data=False)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    #'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': 1
}


# gbm is reset to None first, so init_model=None and training starts from scratch.
gbm = None

gbm = lgb.train(params,
            trainSet,
            num_boost_round=200, 
            valid_sets=valSet,
            init_model=gbm)

# predTrain0 / predVal0 are in transformed space; predTrain / predVal are mapped
# back to feature space with the inverse transform.
predTrain0 = gbm.predict(train[allF])
predVal0 = gbm.predict(val[allF])
predTrain = inv_custom_log_2(predTrain0, minDict[f])
predVal = inv_custom_log_2(predVal0, minDict[f])
r2train =r2_score(train[f], predTrain)
r2test =r2_score(val[f], predVal)
print('r2 scores', r2train,r2test, 'transormed',r2_score(train[transfF], predTrain0),r2_score(val[transfF], predVal0))

In [None]:
# Re-score the current model without retraining; these lines repeat the
# prediction and scoring part of the previous cell verbatim.
predTrain0 = gbm.predict(train[allF])
predVal0 = gbm.predict(val[allF])
predTrain = inv_custom_log_2(predTrain0, minDict[f])
predVal = inv_custom_log_2(predVal0, minDict[f])
r2train =r2_score(train[f], predTrain)
r2test =r2_score(val[f], predVal)
print('r2 scores', r2train,r2test, 'transormed',r2_score(train[transfF], predTrain0),r2_score(val[transfF], predVal0))

In [None]:
# Store the predictions next to the data: transformed space and feature space.
train['prediction_log'] = predTrain0
train['prediction_feat'] = predTrain

In [None]:
""" in feature space """
# Truth vs. back-transformed prediction per row (x axis = row position).
plt.scatter(x=range(train.shape[0]),y=train[f], s=1,label=f)
plt.scatter(x=range(train.shape[0]),y=predTrain, s=1,label='pred_train')
plt.legend()
plt.show()

plt.scatter(x=range(val.shape[0]),y=val[f], s=1,label=f)
plt.scatter(x=range(val.shape[0]),y=predVal, s=1,label='pred_test')
plt.legend()
plt.show()

""" in transformed space """
# Same comparison in transformed space. Note that the truth series is labelled
# with f although it plots the transformed column transfF.
plt.scatter(x=range(train.shape[0]),y=predTrain0, s=1,label='pred_train')
plt.scatter(x=range(train.shape[0]),y=train[transfF], s=1,label=f)
plt.legend()
plt.show()

plt.scatter(x=range(val.shape[0]),y=predVal0, s=1,label='pred_test')
plt.scatter(x=range(val.shape[0]),y=val[transfF], s=1,label=f)
plt.legend()
plt.show()

#  correlation between targets
- only targets close to each other are strongly correlated
- some have correlations to other variables as well
- weak correlation of features to targets (no surprise)

In [None]:
# Column names 'ptend_q0003_12' ... 'ptend_q0003_59' (indices 0-11 are left out).
# NOTE(review): the list is called q0002_f but holds q0003 names -- confirm which is intended.
q0002_f = []
for i in range(12, 60):
    q0002_f.append('ptend_q0003_'+str(i))

In [None]:
# NOTE(review): `allF` is not assigned in the visible cells -- presumably it
# comes from one of the star imports (data_helpers / metrics); confirm.
allF

In [None]:
# Pairwise correlation matrix of the allF columns of the current training frame.
corrMat = train[allF].corr()

In [None]:
# Heatmap of the correlation matrix.
# NOTE(review): seaborn is imported here rather than in the import cell at the top.
import seaborn as sns
plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
sns.heatmap(corrMat)
plt.title(f"Correlation Matrix")
plt.show()

In [None]:
# Show every row of one correlation column.
# Note: the option is set globally and stays in effect for all later displays.
pd.set_option('display.max_rows',None)
corrMat['ptend_q0003_17']