In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster for this notebook: six single-threaded workers,
# each running in its own process (processes=True), and attach a client to it
# so that subsequent Dask computations are scheduled on these workers.
cluster = LocalCluster(
    n_workers=6,
    threads_per_worker=1,
    processes=True,
)
client = Client(cluster)
import sys
from data_helpers import *
from metrics import *

In [None]:
# Directories holding the Parquet shards of the training data
# (presumably split by percentage range of the full set -- confirm with the
# code that produced them).
folders = ['train0_25', 'train25_50', 'train50_75', 'train75_100']

# Open every folder as its own Dask DataFrame.
dfs = list(map(dd.read_parquet, folders))

# Stack the per-folder frames into one Dask DataFrame used by all later cells.
data = dd.concat(dfs)

In [None]:
# Partition-level train/test split: shuffle the partition indices with a fixed
# seed, then give the first 95% to training and the remainder to testing.
np.random.seed(42)

orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)  # shuffles the list in place

trainSep = int(0.95 * data.npartitions)  # first index that is NOT training
valEnd = data.npartitions  # test slice runs to the last partition

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

# try with just one dataset of 50 partitions

In [None]:
# Validation data: materialise one batch of held-out partitions.
# partPerLoop is passed to the helper as the batch size (presumably the number
# of partitions to load per call -- confirm in data_helpers).
partPerLoop = 35

# Only a single batch is loaded for now; the loop scaffold is kept so that the
# range can be widened later.
for i in range(1):
    startPartIdx = partPerLoop * i
    (X_val, y_val, combinedF, _) = getTensorDataFlattend(
        data,
        partPerLoop,
        startPartIdx,
        sampledPartIdxTest,
    )

In [None]:
# Training data: materialise one batch of training partitions.
# NOTE: this rebinds partPerLoop, startPartIdx, combinedF and i that the
# validation cell also sets.
partPerLoop = 50

# Only a single batch is loaded for now; the loop scaffold is kept so that the
# range can be widened later.
for i in range(1):
    startPartIdx = partPerLoop * i
    (X, y, combinedF, _) = getTensorDataFlattend(
        data,
        partPerLoop,
        startPartIdx,
        sampledPartIdxTrain,
    )

In [None]:
# Full collection of target names, iterated over by the training loop below.
# NOTE(review): targets60 and target1 are not defined in this notebook --
# presumably they come from one of the star imports (data_helpers / metrics);
# the order here is assumed to match the column order of y -- TODO confirm.
allTargets = targets60+target1

In [None]:
# Standardise the targets column-wise using statistics from the training
# targets only; the same mean/std are applied to the validation targets.
mean = np.mean(y, axis=0)
std = np.std(y, axis=0)
std[std == 0] = 1  # constant columns: avoid dividing by zero

yn, yn_val = ((targets - mean) / std for targets in (y, y_val))

In [None]:
# Train one LightGBM regressor per target column on the standardised targets,
# score it (R^2) on the original target scale, and persist the model.

# Hyper-parameters shared by every per-target booster. The commented-out keys
# are tuning candidates that currently fall back to LightGBM's defaults.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': -1
}

# Output locations. They are created up front so that save_model() does not
# fail on a fresh checkout after the first model has already been trained.
MODEL_DIR = 'individualLGBMs_feat'
CHECKPOINT_DIR = MODEL_DIR + '/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# r2ScoreDict[target][stage] -> {'train': r2, 'test': r2}
r2ScoreDict = {f: {} for f in allTargets}

# Training-stage counter. With i == 0 every model is trained from scratch;
# for any other value the previously saved model is loaded and boosted further.
i = 0

# NOTE(review): column idx of yn / yn_val is assumed to correspond to
# allTargets[idx] -- confirm against getTensorDataFlattend.
for idx, f in enumerate(allTargets):
    print('processing ',f)
    fileName = MODEL_DIR + '/model_' + f + '.txt'
    gbm = lgb.Booster(model_file=fileName) if i != 0 else None

    # Build the training set first so the validation set can reference it
    # (the documented pattern for validation data in LightGBM).
    train_set = lgb.Dataset(X, yn[:, idx], free_raw_data=False)
    valSet = lgb.Dataset(X_val, label=yn_val[:, idx], reference=train_set,
                         free_raw_data=False)
    gbm = lgb.train(params,
                    train_set,
                    num_boost_round=20,
                    valid_sets=[valSet],  # documented type is a list of Dataset
                    init_model=gbm)

    predTrain = gbm.predict(X)
    predVal = gbm.predict(X_val)

    # Undo the standardisation so R^2 is computed on the original target scale.
    predTrain = predTrain * std[idx] + mean[idx]
    predVal = predVal * std[idx] + mean[idx]
    r2train = r2_score(y[:, idx], predTrain)
    r2test = r2_score(y_val[:, idx], predVal)
    r2ScoreDict[f][i] = {'train': r2train, 'test': r2test}
    print('r2 scores', r2train,r2test)

    # Latest model (overwritten each stage) plus a per-stage checkpoint whose
    # file name records the stage index and the rounded validation R^2.
    gbm.save_model(fileName)
    gbm.save_model(CHECKPOINT_DIR + '/model_' + f + '_' + str(i) + '_' + str(round(r2test, 3)) + '.txt')



In [None]:
# q0002_26 -> outlier removal; there seem to be some cases where it's off
# regenerate plots; the index probably needs to be reset
# -> better: use exp(targ) to get a more distinguishable target

# MULTIPLY by time! dt = 1200 sec; maybe transform to absolute values, i.e. predict the absolute value instead of the flux
#e.g. T1 = t0+flux -> flux = (t1-t0)*1200

# try with multiple partitions