In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Local Dask cluster with 6 single-threaded worker processes; the client attached
# to it is what later .compute() calls are scheduled through.
cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Parquet folders to load. Only one of the four folders is currently enabled;
# uncomment the others to include them.
folders = [
    # 'train0_25',
    # 'train25_50',
    'train50_75',
    # 'train75_100',
]

# One lazy Dask DataFrame per enabled folder.
dfs = []
for folder in folders:
    dfs.append(dd.read_parquet(folder))

# Single lazy DataFrame spanning every loaded folder.
data = dd.concat(dfs)

In [None]:
# Reproducible train / held-out split at partition granularity:
# the first 95 % of the shuffled partition indices are used for training,
# the remainder for validation.
np.random.seed(42)

orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)  # shuffles the list in place

trainSep = int(0.95 * data.npartitions)  # first index of the held-out slice
valEnd = data.npartitions                # held-out slice runs to the last partition

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

## find min values

In [None]:
# Base target names to expand; the full candidate list is
# ['ptend_q0001', 'ptend_q0002', 'ptend_q0003'].
transfTarg = ['ptend_q0002']

# One column name per base target and index 0..59, e.g. 'ptend_q0002_0'.
transfTarg60 = [base + '_' + str(level) for base in transfTarg for level in range(60)]

In [None]:
# Per-column magnitude statistics: the smallest non-zero magnitude and the largest
# magnitude on each side of zero. The 'min' entry is what the log transform below
# uses as a stand-in for exact zeros.
minDict = {} #minimum value that is not 0
for f in ['ptend_q0001_26','ptend_q0002_26']:#transfTarg60: #allT:
    a = data[f].compute()
    # Vectorised reductions, computed once. The Python built-ins min()/max() walk the
    # Series element by element and were evaluated several times per column; the
    # pandas reductions also skip NaN instead of being order-dependent.
    colMin = a.min()
    colMax = a.max()
    hasPos = colMax > 0
    hasNeg = colMin < 0
    # 1e10 is a sentinel meaning "there is no value on this side of zero".
    minNeg = a[a < 0].abs().min() if hasNeg else 1e10
    minPos = a[a > 0].min() if hasPos else 1e10
    maxPos = colMax
    maxNeg = abs(colMin)
    minDict[f] = {'minNeg':minNeg, 'minPos':minPos, 'min':min(minNeg,minPos), 'maxPos':maxPos, 'maxNeg':maxNeg, 'max':max(maxPos, maxNeg)}

In [None]:
# Display the collected per-column statistics.
minDict

In [None]:
# Persist the statistics so the expensive column scan does not have to be repeated.
# The handle is deliberately not called `f`: in this notebook `f` holds the current
# target column name and later cells index DataFrames with it.
with open('minVal_ptend_q0002_26.pkl', 'wb') as fh:
    pickle.dump(minDict, fh)

In [None]:
# Reload the cached statistics.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# The handle is deliberately not called `f`, so the notebook-global `f` (a column name
# that later cells index DataFrames with) is not replaced by a closed file object.
with open('minVal_ptend_q0002_26.pkl', 'rb') as fh:
    minDict = pickle.load(fh)

# try just with one dataset of 50 part

In [None]:
def getTensorDataFlattendPredictNextTimeStamp(data, partPerLoop, startPartIdx,sampledPartIdx):
    """Load a run of partitions and append ``state + ptend * 1200`` target columns.

    For each of ``partPerLoop`` partitions, starting at position ``startPartIdx`` of
    ``sampledPartIdx``, the partition is computed, passed through ``addFeatures`` and
    extended with one ``<ptend>_<i>_transf`` column per tendency variable and index
    ``i`` in 0..59, holding ``state_<var>_<i> + ptend_<var>_<i> * 1200``.

    Returns
    -------
    tuple
        ``(frame, allF, transfTargList)``: the concatenated pandas DataFrame, the
        feature column list ``features60 + newF + feat1`` (from the last partition
        processed) and the names of the added target columns.
    """
    tendencyNames = ['ptend_q0001','ptend_q0002','ptend_q0003']
    stateNames = ['state_q0001','state_q0002','state_q0003']

    frames = []
    for offset in range(partPerLoop):
        partIdx = int(sampledPartIdx[startPartIdx + offset])
        part, newF = addFeatures(data.get_partition(partIdx).compute())

        # New target columns, keyed by name, in (variable, index) order.
        colDict = {
            tend + '_' + str(level) + '_transf':
                part[state + '_' + str(level)] + part[tend + '_' + str(level)] * 1200
            for tend, state in zip(tendencyNames, stateNames)
            for level in range(60)
        }
        transfTargList = list(colDict)
        part = pd.concat([part, pd.DataFrame(colDict)], axis=1)

        allF = features60 + newF + feat1
        frames.append(part)

    return pd.concat(frames), allF, transfTargList

def custom_x_inv(x):
    """Return ``1 / (100 * x)`` with NaN results replaced by 0.0.

    Only NaN is overridden explicitly; infinities are left to the defaults of
    ``np.nan_to_num``.
    """
    scaled = 100 * x
    reciprocal = 1 / scaled
    return np.nan_to_num(reciprocal, nan=0.0)

def custom_log(x, minValue, offset=6):
    """Signed, shifted logarithm of ``x``.

    Exact zeros are first replaced by ``-minValue``; the result is
    ``sign(x) * |log|x| - offset| + |log(minValue)|``.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Values to transform. The input is not modified.
    minValue : float
        Magnitude substituted (with negative sign) for exact zeros.
    offset : float, default 6
        Subtracted from ``log|x|``. With the default, ``log|x| - offset`` is negative
        for ``|x| < e**6`` (about 403); outside that range the sign is lost.

    Returns
    -------
    Same container type as ``x``.
    """
    # Work on a copy: the zero replacement below must not leak into the caller's data.
    x = x.copy()
    modMin = -minValue #* 0.9
    # Caveat: a zero could be approached from either side, but is always encoded as negative.
    x[x==0] = modMin
    y = np.log(abs(x))
    y = y - offset           # shift down so the whole domain (-403, 403) yields negative values
    y = np.sign(x)*abs(y)    # re-attach the sign of the input

    y = y + abs(np.log((abs(modMin))))
    return y

def inv_custom_log(y,minValue, offset=6):
    """Invert the three-argument ``custom_log`` transform.

    Parameters
    ----------
    y : numpy.ndarray or pandas.Series
        Transformed values.
    minValue : float
        The same ``minValue`` that was used for the forward transform.
    offset : float, default 6
        The same ``offset`` that was used for the forward transform.

    Returns
    -------
    Values in the original domain. Entries that decode to ``-minValue`` (the code the
    forward transform uses for exact zeros) are returned as 0.
    """
    modMin = -minValue #* 0.9
    y = y - abs(np.log((abs(modMin))))   # undo the final shift of the forward transform
    x = np.exp(-abs(y) + offset)
    x = np.sign(y)*x
    # exp(log(.)) does not round-trip bit-exactly, so an exact `==` comparison with
    # modMin almost never matches; compare with a tight relative tolerance instead.
    x[np.isclose(x, modMin, rtol=1e-9, atol=0.0)] = 0
    return x

def getTensorDataFlattendPredictLog(data, partPerLoop, startPartIdx,sampledPartIdx):
    """Load a run of partitions and append ``custom_log``-transformed target columns.

    For each of ``partPerLoop`` partitions, starting at position ``startPartIdx`` of
    ``sampledPartIdx``, the partition is computed, passed through ``addFeatures`` and
    extended with ``<feature>_transf`` columns for ``ptend_q0001_26`` and
    ``ptend_q0002_26``. The zero stand-in for each column is read from the global
    ``minDict[feature]['min']``.

    Returns
    -------
    tuple
        ``(frame, allF, transfTargList)``: the concatenated pandas DataFrame, the
        feature column list ``features60 + newF + feat1`` (from the last partition
        processed) and the names of the added target columns.
    """
    baseTargets = ['ptend_q0001','ptend_q0002']  # 'ptend_q0003' is currently excluded
    levels = [26]                                # full range would be range(60)

    frames = []
    for offset in range(partPerLoop):
        partIdx = int(sampledPartIdx[startPartIdx + offset])
        part, newF = addFeatures(data.get_partition(partIdx).compute())

        transfTargList = []
        colDict = {}
        for base in baseTargets:
            for level in levels:
                feature = base + '_' + str(level)
                transfF = feature + '_transf'
                minValue = minDict[feature]['min']
                # Pass a copy so the raw column in `part` keeps its zeros.
                colDict[transfF] = custom_log(part[feature].copy(), minValue=minValue)
                transfTargList.append(transfF)
        part = pd.concat([part, pd.DataFrame(colDict)], axis=1)

        allF = features60 + newF + feat1
        frames.append(part)

    return pd.concat(frames), allF, transfTargList

def concatData(data, partPerLoop, startPartIdx,sampledPartIdx):
    """Load a run of partitions, add engineered features and concatenate them.

    Reads ``partPerLoop`` partitions of ``data`` whose indices are taken from
    ``sampledPartIdx`` starting at position ``startPartIdx``; each is computed and
    passed through ``addFeatures``.

    Returns
    -------
    tuple
        ``(frame, allF, allF)``: the concatenated pandas DataFrame and, twice, the
        feature column list ``features60 + newF + feat1`` from the last partition
        processed (no transformed targets are created here).
    """
    frames = []
    for offset in range(partPerLoop):
        partIdx = int(sampledPartIdx[startPartIdx + offset])
        part, extraFeatures = addFeatures(data.get_partition(partIdx).compute())

        allF = features60 + extraFeatures + feat1
        frames.append(part)

    combined = pd.concat(frames)
    return combined, allF, allF

In [None]:
# validation data
# Loads the first `partPerLoop` held-out partitions into one pandas DataFrame.
# NOTE(review): concatData indexes sampledPartIdxTest[0..partPerLoop-1]; this assumes the
# held-out list holds at least 35 partitions — confirm for the folders in use.
partPerLoop = 35

for i in range(1):
    startPartIdx = i*partPerLoop
    # concatData returns the feature list twice, so transT is the same list as combinedF.
    val, combinedF,transT = concatData(data, partPerLoop, startPartIdx, sampledPartIdxTest)

In [None]:
# training sequentially
# Loads the first `partPerLoop` training partitions into one pandas DataFrame
# (only a single chunk is processed: range(1)).
partPerLoop = 25

for i in range(1):
    startPartIdx = i*partPerLoop
    # concatData returns the feature list twice, so transT is the same list as combinedF.
    train, combinedF,transT = concatData(data, partPerLoop, startPartIdx, sampledPartIdxTrain)  

## find mapping for tiny values

ATTENTION: this will not improve the R² score. Most of the R² error is caused by outliers, i.e. values far from the mean have to be predicted well to obtain a good R².

### logarithm mapping

In [None]:
a = train.ptend_q0002_26
#a[a==0]= 0.9*minDict['ptend_q0002_26']['min']
# Histogram of log|a| over the non-zero entries. np.log gives -inf for zeros and NaN for
# negative values, and plt.hist cannot bin non-finite data, so zeros and missing values
# are dropped and the magnitude is used.
nonZeroMagnitude = a[a != 0].dropna().abs()
plt.hist(np.log(nonZeroMagnitude), bins=100)

In [None]:
# log(target) against row position; `a` is the target column set in the histogram cell.
# np.log yields NaN for negative entries and -inf for zeros.
plt.scatter(x=range(0,train.shape[0]), y=np.log(a), s=1)

In [None]:
# Raw target against row position.
plt.scatter(x=range(0,train.shape[0]), y=train.ptend_q0002_26, s=1)

In [None]:
# Raw target overlaid with the inverse-transformed `_transf` column (round-trip check).
# NOTE(review): needs a 'ptend_q0002_26_transf' column holding custom_log output; concatData
# does not create one — presumably `train` came from getTensorDataFlattendPredictLog. Confirm.
plt.scatter(x=range(0,train.shape[0]), y=train.ptend_q0002_26, s=1)
plt.scatter(x=range(0,train.shape[0]), y=inv_custom_log(train.ptend_q0002_26_transf,minDict['ptend_q0002_26']['min']), s=1)

In [None]:
# Transformed target against row position.
plt.scatter(x=range(0,train.shape[0]), y=train.ptend_q0002_26_transf, s=1)

In [None]:
# Floating-point characteristics (decimal precision, most negative value) of the target
# column's dtype. The column is named explicitly: when the notebook is run top to bottom,
# the global `f` is the file handle left behind by the pickle cells, not a column name.
precision_info = np.finfo(train['ptend_q0002_26'].dtype.type)

precision = precision_info.precision
precision,precision_info.min

In [None]:
# Compare exp(log(v)) with v for a very small value.
np.exp(np.log(1e-50)), 1e-50

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# attention you loose the sign with this one!!!!
def sech_squared(x,amplitude=10e4,freq=500):
    """Signed sech² bump: ``sign(x) * amplitude / cosh(freq * x)**2``.

    The denominator is squared so that the function matches its name and is the
    counterpart of ``inverse_sech_squared``, which inverts ``amplitude / cosh**2``
    (the previous body divided by ``cosh`` only).

    Parameters
    ----------
    x : float or numpy.ndarray
    amplitude : float, default 10e4
        Peak magnitude, approached as ``x`` tends to 0. ``sign(0)`` is 0, so ``x == 0``
        itself maps to 0.
    freq : float, default 500
        Scale applied to ``x`` inside ``cosh``; larger values give a narrower bump.
    """
    return np.sign(x) * amplitude / np.cosh(freq * x) ** 2
def inverse_sech_squared(y, amplitude=10e4, freq=50):
    """Invert ``y = sign(x) * amplitude / cosh(freq * x)**2`` for ``x``.

    NOTE(review): the default ``freq`` here is 50 while ``sech_squared`` defaults to
    500; pass ``freq`` explicitly when round-tripping.
    """
    scale = 1 / freq
    ratio = amplitude / np.abs(y)
    return scale * np.arccosh(np.sqrt(ratio)) * np.sign(y)

def custom_log(x, minValue, offset=6):
    """Signed logarithm ``sign(x) * |log|x| - offset|`` with zeros replaced beforehand.

    NOTE: this redefinition replaces the earlier ``custom_log`` in the notebook
    namespace. Unlike that version, zeros are encoded as ``+0.5 * minValue`` and no
    ``|log(minValue)|`` shift is added to the result.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Values to transform. The input is not modified.
    minValue : float
        Exact zeros are replaced by ``0.5 * minValue``.
    offset : float, default 6
        Subtracted from ``log|x|``. With the default, ``log|x| - offset`` is negative
        for ``|x| < e**6`` (about 403); outside that range the sign is lost.
    """
    # Work on a copy: the zero replacement below must not leak into the caller's data.
    x = x.copy()
    # Caveat: a zero could be approached from either side, but is always encoded as positive.
    x[x==0] = minValue * 0.5
    y = np.log(abs(x))
    y = y - offset           # shift down so the whole domain (-403, 403) yields negative values
    y = np.sign(x)*abs(y)    # re-attach the sign of the input
    return y

def inv_custom_log(y,offset=6):
    """Invert ``sign(x) * |log|x| - offset|``: returns ``sign(y) * exp(offset - |y|)``.

    NOTE(review): this redefinition replaces the earlier three-argument
    ``inv_custom_log(y, minValue, offset)``. Once this cell has run, a call that passes
    ``minValue`` as the second positional argument binds it to ``offset`` here.
    """
    magnitude = np.exp(offset - abs(y))
    return np.sign(y) * magnitude

# Round-trip demo of the signed-log transform on a small grid around zero.
x0 = 0
L = 10
k = 200
x = np.linspace(-2,2,21)
sigmoid = L / (1 + np.exp(-k*(x - x0)))  # steep logistic curve; computed but not plotted
# custom_log requires a zero stand-in (calling it without minValue raises TypeError);
# use the smallest non-zero magnitude on the grid. A copy is passed so that `x` keeps
# its zero for the plots below.
demoMin = np.abs(x[x != 0]).min()
y = custom_log(x.copy(), demoMin)
inv = inv_custom_log(y)
fig = plt.figure()
plt.plot(x,y)
plt.plot(x,inv)



### Box-Cox transformation

In [None]:
import scipy.stats as st

In [None]:
# Box-Cox with a fixed lambda of 0.5 on the target shifted to be non-negative.
# The shift is |min| of the column: the previous `x + abs(x)` mapped every non-positive
# value to 0 (and doubled the positive ones), which collapses them to a single value.
targetCol = train['ptend_q0002_26']
trans = st.boxcox(targetCol + abs(targetCol.min()), lmbda= 0.5)

fig = plt.figure()
plt.scatter(x=range(0,train.shape[0]), y=trans, s=1)

In [None]:
# Distinct values produced by the Box-Cox transform.
np.unique(trans)

### quantile transformer from sklearn
q0001:
- uniform dist:  r2 scores -0.0024028623201599597 -0.14726808308227235 transformed 0.5795544569557314 0.46740129096623895
- normal:        r2 scores -6.038224021203636 -5.831603829623485 transformed 0.7500584769750813 0.6309097700251043

q0002:
- uniform: r2 scores 9.495082898713925e-06 -9.13032750657905e-05 transformed 0.9597399802188882 0.9419017458308543

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Fit a uniform-output quantile transform on the training target and plot the result.
qt = QuantileTransformer(output_distribution='uniform', n_quantiles=10000, random_state=0)
quantileT = qt.fit_transform(train[['ptend_q0002_26']])

fig = plt.figure()
plt.scatter(x=range(train.shape[0]), y=quantileT, s=1)

In [None]:
# Map the transformed values back through the fitted transformer and plot them.
invT = qt.inverse_transform(quantileT)

fig = plt.figure()
plt.scatter(x=range(train.shape[0]), y=invT, s=1)

In [None]:
# Raw target column and the name under which its quantile-transformed copy is stored.
f0 = 'ptend_q0002_26'
f = 'ptend_q0002_26_transf'

# Other targets that were tried (each with f = f0 + '_transf'):
#   f0 = 'ptend_u_42'
#   f0 = 'ptend_q0003_15'

# Fit the transform on the training column only, then apply it to validation.
qt = QuantileTransformer(output_distribution='uniform', n_quantiles=10000, random_state=0)
train[f] = qt.fit_transform(train[[f0]])
val[f] = qt.transform(val[[f0]])

# Variant that keeps all rows:
#   valSet = lgb.Dataset(val[combinedF], label=val[f], free_raw_data=False)
#   train_set = lgb.Dataset(train[combinedF], train[f], free_raw_data=False)

# Rows whose raw target is exactly zero are left out of both datasets.
valSet = lgb.Dataset(val[val[f0] != 0][combinedF], label=val[val[f0] != 0][f], free_raw_data=False)
train_set = lgb.Dataset(train[train[f0] != 0][combinedF], label=train[train[f0] != 0][f], free_raw_data=False)

# Plain L2 regression; tree and sampling hyper-parameters (num_leaves, learning_rate,
# feature_fraction, bagging_fraction, bagging_freq) are left at the LightGBM defaults.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='l2',
    verbose=-1,
)

print('processing ',f)
fileName = 'individualLGBMs_feat/model_'+f+'.txt'
gbm = None  # start from scratch; no previously saved booster is loaded

# Early stopping is not enabled: all 500 rounds are trained.
gbm = lgb.train(
    params,
    train_set,
    num_boost_round=500,
    valid_sets=valSet,
    init_model=gbm,
)

In [None]:
# Score the model on the non-zero rows, both in the original target space (after
# inverting the quantile transform) and in the transformed space.
trainMask = train[f0] != 0
valMask = val[f0] != 0

predTrain0 = gbm.predict(train.loc[trainMask, combinedF])
predVal0 = gbm.predict(val.loc[valMask, combinedF])
# inverse_transform expects a 2-D (n_samples, 1) array.
predTrain = qt.inverse_transform(predTrain0.reshape(-1, 1))
predVal = qt.inverse_transform(predVal0.reshape(-1, 1))
# Convert the ground truth to a NumPy array before reshaping: np.reshape applied to a
# pandas Series is routed through the Series' array-wrapping hook and is not reliable
# across pandas/NumPy versions for a 2-D result.
r2train =r2_score(train.loc[trainMask, f0].to_numpy().reshape(-1, 1), predTrain)
r2test =r2_score(val.loc[valMask, f0].to_numpy().reshape(-1, 1), predVal)
print('r2 scores', r2train,r2test, 'transormed',r2_score(train.loc[trainMask, f], predTrain0),r2_score(val.loc[valMask, f], predVal0))

In [None]:
# Non-zero training targets and their total sum of squares around the mean
# (the denominator of R²).
a = train[train[f0] != 0][f0]
((a-a.mean())**2).sum()

In [None]:
# Mean of the non-zero training targets.
a.mean()

In [None]:
# Deviation of each non-zero training target from the mean.
(a- a.mean())

In [None]:
# Sum of squared residuals in the original target space (despite the name, this is a
# sum, not a mean). `a` is converted to a NumPy column vector explicitly because
# np.reshape on a pandas Series is not reliable across pandas/NumPy versions.
mse = ((a.to_numpy().reshape(-1, 1) - predTrain)**2).sum()

In [None]:
# Share (in percent) of each sample in the summed squared residual `mse`.
# `a` is converted to a NumPy column vector explicitly because np.reshape on a pandas
# Series is not reliable across pandas/NumPy versions.
b = (a.to_numpy().reshape(-1, 1) - predTrain)**2 / mse *100
plt.scatter(x=range(predTrain0.shape[0]),y=b, s=1,label='gt')
plt.legend()
plt.show()

In [None]:
# Non-zero training targets in the original space.
train[train[f0] != 0][f0]

### quantile transformer self implemented

choose closest neighbor
- q0002: r2 scores 0.00031506126944913504 -0.027846289869214003 transformed 0.9940111528867572 0.9916804881993494
- q0001: r2 scores -0.2750321239707938 -0.6288146334566054 transformed 0.5818159792591474 0.4669909993569028


interpolation:
- q0002 r2 scores 0.0004193936653468233 8.448629331803126e-06 transformed 0.9940111528867572 -0.4430557552761931
- q0001 r2 scores -5.252347836889369 -4.9835035690989375 transformed 0.5818159792591474 -3.423471356255156

In [None]:
import scipy

In [None]:
""" 
Attention: the origValues and the quantiles have to have the same indexing, e.g. origValues[1] = f[quantiles[1]], means this represents the mapping between quantile and orig value
"""
def reverseMapping(predictQuantiles, origValues, quantiles):
    """Map predicted quantiles to original values by nearest-neighbour lookup.

    ``origValues`` and ``quantiles`` must be index-aligned: ``origValues[k]`` is the
    value whose quantile is ``quantiles[k]``. For every prediction the entry of
    ``quantiles`` with the smallest absolute distance is located (first one on ties)
    and the matching original value is returned.

    Returns an array of shape ``(len(predictQuantiles), 1)``.
    """
    results = np.full((predictQuantiles.shape[0], 1), np.nan)
    for row, predicted in enumerate(predictQuantiles):
        nearest = np.abs(quantiles - predicted).argmin()
        results[row] = origValues[nearest]
    return results

def mapping(values, origValues, quantiles):
    """Map values to quantiles by nearest-neighbour lookup.

    ``origValues`` and ``quantiles`` must be index-aligned. For every entry of
    ``values`` the closest entry of ``origValues`` is located (first one on ties) and
    its quantile is returned.

    Returns an array of shape ``(len(values), 1)``.
    """
    results = np.full((values.shape[0], 1), np.nan)
    for row, value in enumerate(values):
        nearest = np.abs(origValues - value).argmin()
        results[row] = quantiles[nearest]
    return results


# get original values for predicted quantiles
def reverseMappingInterp(predictQuantiles, origValues, quantiles):
    """Map predicted quantiles to original values by linear interpolation.

    ``origValues`` and ``quantiles`` must be index-aligned (``origValues[k]`` is the
    value whose quantile is ``quantiles[k]``) but may be in any order and may contain
    repeated pairs.

    The lookup table is sorted by quantile and de-duplicated, then each prediction is
    interpolated linearly between its two bracketing table entries. Predictions outside
    the table range are clamped to the first / last original value.

    Parameters
    ----------
    predictQuantiles : array-like, shape (n,) or (n, 1)
    origValues, quantiles : array-like, shape (m,)

    Returns
    -------
    numpy.ndarray of shape (n, 1)

    Raises
    ------
    ValueError
        If the table has no finite entries (raised by ``np.interp``).
    """
    predicted = np.asarray(predictQuantiles, dtype=float).reshape(-1)
    tableQ = np.asarray(quantiles, dtype=float).reshape(-1)
    tableV = np.asarray(origValues, dtype=float).reshape(-1)

    # np.interp needs increasing, finite sample points.
    finite = np.isfinite(tableQ) & np.isfinite(tableV)
    gridQ, firstIdx = np.unique(tableQ[finite], return_index=True)
    gridV = tableV[finite][firstIdx]

    return np.interp(predicted, gridQ, gridV).reshape(-1, 1)

# get quantiles from some original Values
def mappingInterp(values, origValues, quantiles):
    """Map values to quantiles by linear interpolation.

    ``origValues`` and ``quantiles`` must be index-aligned (``quantiles[k]`` is the
    quantile of ``origValues[k]``) but may be in any order and may contain repeated
    pairs.

    The lookup table is sorted by original value and de-duplicated, then each input is
    interpolated linearly between its two bracketing table entries. Inputs outside the
    table range are clamped to the first / last quantile.

    Parameters
    ----------
    values : array-like, shape (n,) or (n, 1)
    origValues, quantiles : array-like, shape (m,)

    Returns
    -------
    numpy.ndarray of shape (n, 1)

    Raises
    ------
    ValueError
        If the table has no finite entries (raised by ``np.interp``).
    """
    queried = np.asarray(values, dtype=float).reshape(-1)
    tableV = np.asarray(origValues, dtype=float).reshape(-1)
    tableQ = np.asarray(quantiles, dtype=float).reshape(-1)

    # np.interp needs increasing, finite sample points.
    finite = np.isfinite(tableV) & np.isfinite(tableQ)
    gridV, firstIdx = np.unique(tableV[finite], return_index=True)
    gridQ = tableQ[finite][firstIdx]

    return np.interp(queried, gridV, gridQ).reshape(-1, 1)

In [None]:
# Hand-rolled quantile transform: dense rank of the target divided by the row count.
train['rank'] = train['ptend_q0002_26'].rank(ascending=True, method='dense')
train['quantiles'] = train['rank'] / train.shape[0]

# Sanity check of the nearest-neighbour lookup: look up the rank that belongs to every
# training quantile.
ranks = train['rank'].to_numpy()
quantiles = train['quantiles'].to_numpy()
predicts = train['quantiles'].to_numpy()
origValues = train['ptend_q0002_26'].to_numpy()
res = reverseMapping(predicts, ranks,quantiles)

In [None]:
# Nearest-neighbour quantiles for the validation targets, using the training lookup table.
res = mapping(val['ptend_q0002_26'].to_numpy(), origValues,quantiles)

In [None]:
# Raw target column and the name under which its rank-based transform is stored.
f0 = 'ptend_q0002_26'
f = 'ptend_q0002_26_transf'

# Training target: dense rank divided by the number of training rows.
train['rank'] = train[f0].rank(ascending=True, method='dense')
train['quantiles'] = train['rank'] / train.shape[0]

# Index-aligned lookup table (value -> quantile) built from the training rows.
quantiles = train['quantiles'].to_numpy()
origValues = train[f0].to_numpy()

train[f] = train['quantiles']
# Validation targets are obtained by interpolating in the training lookup table.
val[f] = mappingInterp(val[f0].to_numpy(), origValues,quantiles)

valSet = lgb.Dataset(val[combinedF], label=val[f], free_raw_data=False)
train_set = lgb.Dataset(train[combinedF], label=train[f], free_raw_data=False)

# Plain L2 regression; tree and sampling hyper-parameters (num_leaves, learning_rate,
# feature_fraction, bagging_fraction, bagging_freq) are left at the LightGBM defaults.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='l2',
    verbose=-1,
)

print('processing ',f)
fileName = 'individualLGBMs_feat/model_'+f+'.txt'
gbm = None  # start from scratch; no previously saved booster is loaded

gbm = lgb.train(
    params,
    train_set,
    num_boost_round=100,
    valid_sets=valSet,
    init_model=gbm,
)

In [None]:
# Predict quantiles, map them back to the original space with the training lookup table
# and report R² in both spaces.
predTrain0 = gbm.predict(train[combinedF])
predVal0 = gbm.predict(val[combinedF])
predTrain = reverseMappingInterp(predTrain0.reshape(-1, 1), origValues,quantiles)
predVal = reverseMappingInterp(predVal0.reshape(-1, 1), origValues,quantiles)
r2train = r2_score(train[f0], predTrain)
r2test = r2_score(val[f0], predVal)
r2trainTransf = r2_score(train[f], predTrain0)
r2testTransf = r2_score(val[f], predVal0)
print('r2 scores', r2train,r2test, 'transormed',r2trainTransf,r2testTransf)

In [None]:
# Transformed training target against row position.
#plt.scatter(x=range(quantiles.shape[0]),y=quantiles, s=1,label='quantiles')
#plt.scatter(x=range(values.shape[0]),y=values, s=1,label='values')
plt.scatter(x=range(train['ptend_q0002_26_transf'].shape[0]),y=train['ptend_q0002_26_transf'], s=1,label='x_transformed')

### folded powers transformation

### log transform testing

In [None]:
# Datasets for the q0001 target using all rows.
# NOTE(review): needs a 'ptend_q0001_26_transf' column in train/val; no cell visible here
# creates it on these frames — confirm where it comes from. The next cell overwrites
# f0, f, valSet and train_set.
f0 = 'ptend_q0001_26'
f = 'ptend_q0001_26_transf'

valSet = lgb.Dataset(val[combinedF], label=val[f], free_raw_data=False)
train_set = lgb.Dataset(train[combinedF], train[f], free_raw_data=False)

In [None]:
# Datasets for the q0002 target restricted to rows whose raw target is non-zero.
f0 = 'ptend_q0002_26'
f = 'ptend_q0002_26_transf'

subV = val[val[f0] != 0]
subT = train[train[f0] != 0]

valSet = lgb.Dataset(subV[combinedF], label=subV[f], free_raw_data=False)
train_set = lgb.Dataset(subT[combinedF], label=subT[f], free_raw_data=False)

In [None]:
# Train on the log-transformed target (non-zero rows only) and score in both spaces.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    #'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': -1
}


print('processing ',f)
fileName = 'individualLGBMs_feat/model_'+f+'.txt'
gbm = None #lgb.Booster(model_file=fileName) if i != 0 else None

gbm = lgb.train(params,
            train_set,
            num_boost_round=100, 
            valid_sets=valSet,
            init_model=gbm)

predTrain0 = gbm.predict(subT[combinedF])
predVal0 = gbm.predict(subV[combinedF])
# NOTE(review): the second positional argument is meant as `minValue` of the three-argument
# inv_custom_log; if the later two-argument redefinition (y, offset) is the active one,
# the value is bound to `offset` instead. Confirm which definition is in scope.
predTrain = inv_custom_log(predTrain0, minDict[f0]['min'])
predVal = inv_custom_log(predVal0, minDict[f0]['min'])
r2train =r2_score(subT[f0], predTrain)
r2test =r2_score(subV[f0], predVal)
#r2ScoreDict[f][i] = {'train':r2train,'test':r2test}
print('r2 scores', r2train,r2test, 'transormed',r2_score(subT[f], predTrain0),r2_score(subV[f], predVal0))
#gbm.save_model(fileName)
#gbm.save_model('individualLGBMs_feat/checkpoints/model_'+f+'_'+str(i)+'_'+str(round(r2test,3))+'.txt')



In [None]:
# Re-print the scores of the last training run.
print('r2 scores', r2train,r2test, 'transormed',r2_score(subT[f], predTrain0),r2_score(subV[f], predVal0))

In [None]:
#r2 scores 0.0002870120951724564 -2.0425050317740556e-05    q0002
#r2 scores -7.53548410945837e+19 -7.067963969118257e+19     q0001

In [None]:
# TODO: magnitude and sign probably need to be predicted separately. With the log mapping
#       the model can resolve very small changes, but zeros are a problem: they are all
#       encoded with one sign (-), so the dynamics no longer fit together and a small
#       change in state space becomes a big change in the output (- to +, e.g. -200 -> +200).
# -> test and evaluate this!!

# -> the target needs a continuous form; the GBM needs some values between 0 and non-zero.

# -> even when predicting only targets != 0 the R² value is not good, maybe because the
#       small changes in the system are highly relevant -> a different mapping is needed.
#       In the transformed domain the fit is very good, just not after transforming back.

In [None]:
# Best iteration recorded on the last trained booster.
gbm.best_iteration

### visualization in transformed space

In [None]:
# Transformed space, non-zero training rows: ground truth against prediction.
sampleIdx = range(predTrain0.shape[0])
gtTransformed = train[train[f0] != 0][f]
plt.scatter(x=sampleIdx, y=gtTransformed, s=1, label='gt', alpha=0.5)
plt.scatter(x=sampleIdx, y=predTrain0, s=1, label='pred', alpha=0.5)

plt.legend()
plt.show()

In [None]:
# Transformed space, non-zero validation rows: prediction against ground truth.
sampleIdx = range(predVal0.shape[0])
gtTransformed = val[val[f0] != 0][f]
plt.scatter(x=sampleIdx, y=predVal0, s=1, label='pred')
plt.scatter(x=sampleIdx, y=gtTransformed, s=1, label='gt')
plt.legend()
plt.show()

### visualization in plotly

In [None]:
import plotly.graph_objects as go
# Interactive version of the transformed-space comparison on the non-zero training rows.
x = list(range(predTrain0.shape[0]))
y_gt = train[train[f0] != 0][f].values
y_pred = predTrain0

# Ground truth is drawn with larger green markers so the smaller red predictions stay
# visible on top of it.
trace_gt = go.Scatter(x=x, y=y_gt, mode='markers', name='gt',
                      marker=dict(size=4, color='green'))
trace_pred = go.Scatter(x=x, y=y_pred, mode='markers', name='pred',
                        marker=dict(size=2, color='red'))

fig = go.Figure()
for trace in (trace_gt, trace_pred):
    fig.add_trace(trace)

fig.update_layout(legend=dict(title="Legend"), showlegend=True)
fig.show()

In [None]:
# Interactive comparison in the original target space on the non-zero training rows.
x = list(range(predTrain.shape[0]))
y_gt = train[train[f0] != 0][f0].values
y_pred = predTrain

# Ground truth is drawn with larger green markers so the smaller red predictions stay
# visible on top of it.
trace_gt = go.Scatter(x=x, y=y_gt, mode='markers', name='gt',
                      marker=dict(size=4, color='green'))
trace_pred = go.Scatter(x=x, y=y_pred, mode='markers', name='pred',
                        marker=dict(size=2, color='red'))

fig = go.Figure()
for trace in (trace_gt, trace_pred):
    fig.add_trace(trace)

fig.update_layout(legend=dict(title="Legend"), showlegend=True)
fig.show()

In [None]:
# Display the predictions plotted above.
y_pred

### visualization in target space

In [None]:
# Original target space: prediction against ground truth on the training rows.
# predTrain was computed on the non-zero rows only, so the ground truth has to be
# restricted to the same rows; plotting the full train[f0] against
# range(predTrain.shape[0]) fails with a size mismatch as soon as train contains zeros.
nonZeroTrain = train[f0] != 0
plt.scatter(x=range(predTrain.shape[0]),y=predTrain, s=1,label='pred')
plt.scatter(x=range(predTrain.shape[0]),y=train.loc[nonZeroTrain, f0], s=1,label='gt')
plt.legend()
plt.show()

In [None]:
# Original target space: prediction against ground truth on the training rows.
# NOTE(review): this cell repeats the previous one; possibly the validation split
# (predVal / val) was intended here — confirm.
# predTrain was computed on the non-zero rows only, so the ground truth has to be
# restricted to the same rows; plotting the full train[f0] against
# range(predTrain.shape[0]) fails with a size mismatch as soon as train contains zeros.
nonZeroTrain = train[f0] != 0
plt.scatter(x=range(predTrain.shape[0]),y=predTrain, s=1,label='pred')
plt.scatter(x=range(predTrain.shape[0]),y=train.loc[nonZeroTrain, f0], s=1,label='gt')
plt.legend()
plt.show()

# baseline

In [None]:
# All target column names; targets60 and target1 are not defined in this notebook —
# presumably they come from the star imports at the top (confirm).
allTargets = targets60+target1

In [None]:
# Standardise the targets column-wise with the training mean / standard deviation.
# NOTE(review): the target matrices `y` and `y_val` are not built in any cell visible
# here (the only earlier `y` is the 1-D output of the log demo) — confirm their origin.
mean = np.mean(y, axis=0)
std = np.std(y, axis=0)
std[std==0] = 1  # constant columns: avoid dividing by zero

yn = (y - mean) / std
yn_val = (y_val - mean) / std

In [None]:
# Baseline: one LightGBM regressor per target column, trained on standardised targets
# and scored with R² after undoing the standardisation.
# NOTE(review): X, X_val, y, y_val and yn are not built in any cell visible here.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': -1
}


# r2ScoreDict[target][round] -> {'train': ..., 'test': ...}
r2ScoreDict = {f: {} for f in allTargets}
# Round index; it stays 0 throughout this cell, so no saved booster is ever loaded below.
i=0
for idx,f in enumerate(allTargets):
    print('processing ',f)
    fileName = 'individualLGBMs_feat/model_'+f+'.txt'
    gbm = lgb.Booster(model_file=fileName) if i != 0 else None

    valSet = lgb.Dataset(X_val, label=yn_val[:,idx], free_raw_data=False)
    train_set = lgb.Dataset(X, yn[:,idx], free_raw_data=False)
    gbm = lgb.train(params,
                train_set,
                num_boost_round=20, 
                valid_sets=valSet,
                init_model=gbm)
    
    predTrain = gbm.predict(X)
    predVal = gbm.predict(X_val)

    # Undo the standardisation before scoring against the raw targets.
    predTrain = predTrain*std[idx] + mean[idx]
    predVal = predVal *std[idx] + mean[idx]
    r2train =r2_score(y[:,idx], predTrain)
    r2test =r2_score(y_val[:,idx], predVal)
    r2ScoreDict[f][i] = {'train':r2train,'test':r2test}
    print('r2 scores', r2train,r2test)
    # Save the latest model plus a checkpoint tagged with round and validation R².
    gbm.save_model(fileName)
    gbm.save_model('individualLGBMs_feat/checkpoints/model_'+f+'_'+str(i)+'_'+str(round(r2test,3))+'.txt')



In [None]:
# q0002_26 -> outlier removal; there seem to be some cases where the prediction is off.
# Regenerate the plots; the index probably needs to be reset first.
# -> better: use exp(target) to get a more distinguishable target.

# MULTIPLY by time! dt = 1200 sec. Maybe transform to the absolute value: instead of
# predicting the flux, predict the absolute value,
# e.g. T1 = t0 + flux -> flux = (t1 - t0) * 1200

# scaled baseline

In [None]:
# Scaled baseline: the raw target is divided by its smallest non-zero magnitude.
f = 'ptend_q0002_26'

valSet = lgb.Dataset(val[combinedF], label=val[f]/minDict[f]['min'], free_raw_data=False)
# Note the name: this cell defines `trainSet`, not `train_set`.
trainSet = lgb.Dataset(train[combinedF], train[f]/minDict[f]['min'], free_raw_data=False)

In [None]:
# Magnitude statistics of the current target column.
minDict[f]

In [None]:
# Validation labels scaled back to the original units, as a column vector.
# np.asarray is applied first because the label may be a pandas Series, and np.reshape
# on a Series is not reliable across pandas/NumPy versions.
np.asarray(valSet.label).reshape(-1, 1) * minDict[f]['min']

In [None]:
# Validation labels scaled back to the original units, against row position.
rescaledLabel = valSet.label * minDict[f]['min']
plt.scatter(x=range(val.shape[0]), y=rescaledLabel, s=1, label=f)
plt.legend()
plt.show()

In [None]:
# Train and score the scaled baseline (target divided by its smallest non-zero magnitude).
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    #'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': -1
}

# Train on `trainSet`, the scaled dataset built for this experiment. The previous call
# passed `train_set`, which is the dataset left over from an earlier experiment and
# carries a differently transformed label, while predictions and scores below use trainSet.
gbm = lgb.train(params,
            trainSet,
            num_boost_round=100,
            valid_sets=valSet,
            init_model=None)

predTrain0 = gbm.predict(trainSet.data)
predVal0 = gbm.predict(valSet.data)
# Undo the scaling before scoring in the original units.
predTrain = predTrain0*minDict[f]['min']
predVal = predVal0 *minDict[f]['min']
r2train =r2_score(trainSet.label*minDict[f]['min'], predTrain)
r2test =r2_score(valSet.label*minDict[f]['min'], predVal)
print('r2 scores', r2train,r2test)



# try with multiple part