In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

import warnings
# Suppress UserWarnings whose message matches "Sending large graph" so they do
# not flood cell output (presumably emitted by dask.distributed -- confirm).
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster made of 6 worker processes with one thread each.
cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
# Connect a distributed client to that cluster; `client` is kept as a notebook global.
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Folders holding the training data as Parquet files. The names suggest four
# consecutive 25% slices of the training set -- TODO confirm.
folders = [
    'train0_25',
    'train25_50',
    'train50_75',
    'train75_100'
]

# Read Parquet files from each folder into Dask DataFrames (one per folder)
dfs = [dd.read_parquet(folder) for folder in folders]

# Concatenate all DataFrames into a single Dask DataFrame; columns are pulled
# into memory later with `.compute()`.
data = dd.concat(dfs)

In [None]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE: pickle.load executes arbitrary code; only use on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed per-feature statistics (mean, std, min, z-score), one pickle each.
meanDict = _load_pickle('meanDict_allT.pkl')
stdDict = _load_pickle('stdDict_allT.pkl')
minDict = _load_pickle('minVal_allT2.pkl')
zscoreDict = _load_pickle('zScore_allT2.pkl')

In [None]:
def render(seq1, transf1, f1, n_samples=1000000):
    """
    Scatter-plot a random sample of a feature and of its transformed values.

    Two figures are shown, one after the other: the raw feature values and the
    transformed values, both plotted against the position within the sample.

    Parameters
    ----------
    seq1 : pandas.Series
        Raw feature values. The column is looked up as `f1`, so the Series
        must be named `f1`.
    transf1 : array-like or pandas.Series
        Transformed values, same length (and index, if a Series) as `seq1`.
    f1 : str
        Feature name; used as column key and as the legend label.
    n_samples : int or None, default 1000000
        Maximum number of rows to plot. Capped at the number of available
        rows so short inputs no longer raise ``ValueError`` in
        ``DataFrame.sample``. ``None`` plots every row without sampling.
    """
    df = pd.DataFrame(seq1)
    df['transf'] = transf1

    if n_samples is None:
        a = df
    else:
        # sample() without replacement cannot draw more rows than exist.
        a = df.sample(n=min(n_samples, len(df)))

    positions = range(a.shape[0])

    # in feature space
    plt.scatter(x=positions, y=a[f1], s=1, label=f1)
    plt.legend()
    plt.show()

    # in transformed space
    plt.scatter(x=positions, y=a['transf'], s=1, label='transformed')
    plt.legend()
    plt.show()

In [None]:
# Scratch evaluation of e**-0.5 (about 0.6065); the result is only displayed,
# not stored. NOTE(review): purpose unclear from this notebook -- consider removing.
np.exp(-0.5)

In [None]:
"""
custom log function to map into a continuous region, gives more resolution to the small values
"""
def custom_log_2(x, minValue, offset=6, nullValFactor=0.99):  #offset of works for [-403:403] of x values otherwise sign is lost
    nullValueFeat = -minValue*nullValFactor             # define the 0-value in the feature space
    x[x==0] = nullValueFeat                             # will make problems bc 0 could be positive but also negative! dynamics will point in different directions
    #plt.scatter(x=range(x.shape[0]),y=x, s=1,label='replace 0')
    #plt.legend()
    #plt.show()
    y = np.log(abs(x))
    #plt.scatter(x=range(x.shape[0]),y=y, s=1,label='log transf')
    #plt.legend()
    #plt.show()
    y = y - offset                                      #move curve down such that we have a bigger domain that always has negative values as an outcome [-403:403]
    #plt.scatter(x=range(x.shape[0]),y=y, s=1,label='offset')
    #plt.legend()
    #plt.show()
    nullValueLog = np.log(abs(nullValueFeat)) - offset  # transform 0-value into log space
    y[x>0] = nullValueLog - (y[x>0] - nullValueLog)
    #plt.scatter(x=range(x.shape[0]),y=y, s=1,label='mapping of pos values')
    #plt.legend()
    #plt.show()
    return y

"""
inverse custom log function to map into a continuous region, gives more resolution to the small values
"""
def inv_custom_log_2(y,minValue, offset=6, nullValFactor=0.99):
    nullValueFeat = -minValue*nullValFactor
    nullValueLog  = np.log(abs(nullValueFeat)) - offset 

    x = y.copy()
    x[y<nullValueLog] = nullValueLog - (x[y<nullValueLog] - nullValueLog) # remap to log function
    x = x + offset                                                        # add offset
    x = np.exp(x)                                                         # apply exp funciton (all pos values aftewards)
    x[x<nullValueFeat] = 0                                                # map to 0
    x[y>nullValueLog] = -x[y>nullValueLog]                                # find negative values
    return x

In [None]:
# Round-trip sanity check of custom_log_2 / inv_custom_log_2 on synthetic data.
# test_x is only used as the plotting axis; the values actually transformed are
# the constant array of ones in test_y, so only a single positive input value is
# exercised here (the commented-out expression is an alternative input).
test_x = np.linspace(-2,2,4000)
test_y = np.ones(test_x.shape)# np.log(abs(test_x))

# Forward transform followed by the inverse, using the same minValue for both.
transf = custom_log_2(test_y, minValue=1e-10)
inv_transf = inv_custom_log_2(transf, minValue=1e-10)

# Ground truth vs. reconstructed values: the two series should overlap.
plt.scatter(x=test_x,y=test_y, s=1,label='gt')
plt.scatter(x=test_x,y=inv_transf, s=1,label='reverse transf')
plt.legend()
plt.show()

# The values in transformed space.
plt.scatter(x=test_x,y=transf, s=1,label='transf')
plt.legend()
plt.show()

In [None]:
# Round-trip check of custom_log_2 / inv_custom_log_2 on real data.
# The feature is loaded here so this cell also runs on a fresh kernel; the
# column chosen is the first feature examined in this notebook.
f1 = 'ptend_q0002_26'
seq1 = data[f1].compute()

# sample() without replacement cannot draw more rows than exist, so cap n.
sampled = seq1.sample(n=min(100000, len(seq1)))

# Pass a copy so the ground-truth sample cannot be modified by the transform.
transf = custom_log_2(sampled.copy(), minValue=minDict[f1]['min'])
inv_transf = inv_custom_log_2(transf, minValue=minDict[f1]['min'])

# Ground truth vs. reconstructed values: the two series should overlap.
plt.scatter(x=range(sampled.shape[0]),y=sampled, s=1,label='gt')
plt.scatter(x=range(sampled.shape[0]),y=inv_transf, s=1,label='reverse transf')
plt.legend()
plt.show()

# The values in transformed space.
plt.scatter(x=range(sampled.shape[0]),y=transf, s=1,label='transf')
plt.legend()
plt.show()

In [None]:
# Load the column 'ptend_q0002_26' into memory, transform it, and plot raw vs.
# transformed values. f1 / seq1 are notebook globals and are rebound here.
f1 = 'ptend_q0002_26'
seq1 = data[f1].compute()

# NOTE(review): `custom_log` is not defined in this notebook; presumably it is
# provided by `from data_helpers import *` -- confirm (or use custom_log_2 if
# that was the intent).
transf1 = custom_log(seq1, minValue=minDict[f1]['min'])

render(seq1, transf1, f1)

In [None]:
# Load the column 'ptend_q0002_15' into memory, transform it, and plot raw vs.
# transformed values. f1 / seq1 are notebook globals and are rebound here.
f1 = 'ptend_q0002_15'
seq1 = data[f1].compute()

# NOTE(review): `custom_log` is not defined in this notebook; presumably it is
# provided by `from data_helpers import *` -- confirm (or use custom_log_2 if
# that was the intent).
transf1 = custom_log(seq1, minValue=minDict[f1]['min'])

render(seq1, transf1, f1)

In [None]:
# Load the column 'ptend_u_25' into memory, transform it, and plot raw vs.
# transformed values. f1 / seq1 are notebook globals and are rebound here.
f1 = 'ptend_u_25'
seq1 = data[f1].compute()

# NOTE(review): `custom_log` is not defined in this notebook; presumably it is
# provided by `from data_helpers import *` -- confirm (or use custom_log_2 if
# that was the intent).
transf1 = custom_log(seq1, minValue=minDict[f1]['min'])

render(seq1, transf1, f1)