In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# The imports cell already starts a LocalCluster and connects a Client. Starting a
# second cluster unconditionally would leave the first cluster's six worker processes
# running next to six new ones, so a cluster is only (re)created when no running
# client is available (e.g. after the previous one was closed).
if "client" not in globals() or client.status != "running":
    cluster = LocalCluster(processes=True, n_workers=6, threads_per_worker=1)
    client = Client(cluster)

In [None]:
# Parquet dataset folders that together make up the input data.
folders = ['train0_25', 'train25_50', 'train50_75', 'train75_100']

# Open every folder as its own Dask DataFrame ...
dfs = list(map(dd.read_parquet, folders))

# ... and stack them into one Dask DataFrame.
data = dd.concat(dfs)

In [None]:
# Fixed seed so the partition shuffle below is reproducible.
np.random.seed(42)

# Shuffle the partition indices of `data` (np.random.shuffle works in place).
orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)

# The first 70% of the shuffled partitions go to training, the remainder to testing.
trainSep = int(0.7 * data.npartitions)
valEnd = data.npartitions

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

# min Values

In [None]:
# Per-target magnitude statistics. For every column in allT2 this stores the smallest
# non-zero magnitude and the largest magnitude, separately for negative and positive
# values and combined ('min' / 'max').
minDict = {}
for f in allT2:
    a = data[f].compute()  # pull one target column into memory

    # Vectorised reductions, evaluated once each. The builtin min()/max() iterate the
    # Series element by element in Python and were previously called twice per column.
    # NOTE: pandas reductions skip NaN, whereas builtin min()/max() give
    # order-dependent results when NaN is present.
    colMax = a.max()
    colMin = a.min()
    hasPos = colMax > 0
    hasNeg = colMin < 0

    # 1e10 is the placeholder used when a column has no values of that sign, so that
    # min(minNeg, minPos) falls back to the sign that does exist.
    minNeg = a[a < 0].abs().min() if hasNeg else 1e10
    minPos = a[a > 0].min() if hasPos else 1e10
    maxPos = colMax
    maxNeg = abs(colMin)

    minDict[f] = {
        'minNeg': minNeg,
        'minPos': minPos,
        'min': min(minNeg, minPos),
        'maxPos': maxPos,
        'maxNeg': maxNeg,
        'max': max(maxPos, maxNeg),
    }

In [None]:
# Persist the per-target statistics so they do not have to be recomputed.
with open('minVal_allT2.pkl', 'wb') as f:
    f.write(pickle.dumps(minDict))

In [None]:
# Restore the per-target statistics written by the dump cell.
# Only unpickle files you created yourself: pickle can execute arbitrary code on load.
with open('minVal_allT2.pkl', 'rb') as f:
    minDict = pickle.loads(f.read())

# load dicts

In [None]:
# Load the per-target lookup dictionaries used below (mean, std and min/max stats).
# Only unpickle files you created yourself: pickle can execute arbitrary code on load.
with open('meanDict_allT.pkl', 'rb') as f:
    meanDict = pickle.loads(f.read())

with open('stdDict_allT.pkl', 'rb') as f:
    stdDict = pickle.loads(f.read())

with open('minVal_allT2.pkl', 'rb') as f:
    minDict = pickle.loads(f.read())

# data setup

In [None]:
# Base training frame: the first 15 shuffled training partitions, computed into memory.
baseTrain = data.partitions[sampledPartIdxTrain[:15]].compute()

# Pre-extracted large-value rows, read from disk.
largeV = pd.read_parquet('large_training_df_0001')
largeV_26 = pd.read_csv('large_ptend_q0002_26.csv')

# Observations when ALL large-value rows are added to the training data:
#   - the log transform performs much worse
#   - the quantile transform stays stable but cannot fit the training data well
#     (values are too extreme)

In [None]:
# Validation frame: entries 15..29 of the shuffled test partitions, computed into memory.
val = data.partitions[sampledPartIdxTest[slice(15, 30)]].compute()

# LGBM for ptend_q0002_26 with log transform
- Looks really good with the log transform plus sample weights.
- The quantile transform plus sample weights has a hard time resolving extreme values, so it does not perform as well.


Looks really good if:
- the training rows are specifically filtered for large values per feature (> mean). Caution: this can leak validation data into training (all large values in the data are used), which would make the validation score overly optimistic.
- the overall feature weighting is used.


In [None]:
# Display (std, mean) of target ptend_q0002_55.
(stdDict['ptend_q0002_55'], meanDict['ptend_q0002_55'])

In [None]:
# Display (std, mean) of target ptend_q0002_26.
(stdDict['ptend_q0002_26'], meanDict['ptend_q0002_26'])

In [None]:
f = 'ptend_q0002_55'

# All training partitions (still lazy), reduced to rows where |target| > |mean|.
sub = data.partitions[sampledPartIdxTrain]
largeV_f = sub[sub[f].abs() > abs(meanDict[f])].compute()
# For target 55 this selects too many samples to load into memory
# -> use the large values from the large-impact file instead.

In [None]:
# Train LightGBM on the custom-log-transformed target and report R^2 in both the
# original and the transformed space.
f = 'ptend_q0002_26'
minValue = minDict[f]['min']
transfF = f + '_transf'

# Training data = base sample + the large-value rows with |target| >= |mean|.
filtered = largeV.loc[abs(largeV[f]) >= abs(meanDict[f])]
print(filtered.shape, largeV.shape)
train = pd.concat([baseTrain, filtered], axis=0)

# Adds the transformed target as a new column; note that `val` is modified in place.
train[transfF] = custom_log(train[f].copy(), minValue=minValue)
val[transfF] = custom_log(val[f].copy(), minValue=minValue)

valSet = lgb.Dataset(val[allF], label=val[transfF], free_raw_data=False)
# NOTE(review): the model is weighted with the existing `train['weight']` column.
# A per-target weight ((train[f] - mean) / std) ** 2 used to be computed here but was
# never passed to the Dataset, so the dead computation was removed. If the per-target
# weight was the intended one, pass it via `weight=` instead - confirm.
trainSet = lgb.Dataset(
    train[allF],
    label=train[transfF],
    weight=train['weight'],
    free_raw_data=False,
)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'verbose': 1,
}

# Training always starts from scratch (the former `init_model=None` was a no-op).
gbm = lgb.train(
    params,
    trainSet,
    num_boost_round=200,
    valid_sets=[valSet],
)

# Predictions in transformed space (suffix 0) and mapped back to the original space.
predTrain0 = gbm.predict(train[allF])
predVal0 = gbm.predict(val[allF])
predTrain = inv_custom_log(predTrain0, minValue)
predVal = inv_custom_log(predVal0, minValue)
r2train = r2_score(train[f], predTrain)
r2test = r2_score(val[f], predVal)
print('r2 scores', r2train, r2test, 'transormed', r2_score(train[transfF], predTrain0), r2_score(val[transfF], predVal0))



In [None]:
""" in feature space """
# Train: true target vs. back-transformed prediction, plotted against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(train)), train[f], s=1, label=f)
ax.scatter(range(len(train)), predTrain, s=1, label='pred_train')
ax.legend()
plt.show()

# Validation: same comparison.
fig, ax = plt.subplots()
ax.scatter(range(len(val)), val[f], s=1, label=f)
ax.scatter(range(len(val)), predVal, s=1, label='pred_test')
ax.legend()
plt.show()

In [None]:
""" in transformed space """
# Train: transformed target vs. raw model output, plotted against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(train)), train[transfF], s=1, label=f)
ax.scatter(range(len(train)), predTrain0, s=1, label='pred_train')
ax.legend()
plt.show()

# Validation: same comparison.
fig, ax = plt.subplots()
ax.scatter(range(len(val)), val[transfF], s=1, label=f)
ax.scatter(range(len(val)), predVal0, s=1, label='pred_test')
ax.legend()
plt.show()

In [None]:
# Re-evaluate the current `gbm`: predict in transformed space, map back with the
# inverse custom log, and print R^2 in both spaces.
predTrain0, predVal0 = gbm.predict(train[allF]), gbm.predict(val[allF])
predTrain, predVal = (
    inv_custom_log(rawPred, minDict[f]['min']) for rawPred in (predTrain0, predVal0)
)
r2train = r2_score(train[f], predTrain)
r2test = r2_score(val[f], predVal)
print(
    'r2 scores', r2train, r2test,
    'transormed', r2_score(train[transfF], predTrain0), r2_score(val[transfF], predVal0),
)

# LGBM for ptend_q0002_55
- The distribution becomes badly distorted by the custom log transform.

In [None]:
# Apply the custom log transform to target 55 on both train and val (adds a column
# to each frame in place).
f = 'ptend_q0002_55'
transfF = f'{f}_transf'
minValue = minDict[f]['min']

val[transfF] = custom_log(val[f].copy(), minValue=minValue)
train[transfF] = custom_log(train[f].copy(), minValue=minValue)

In [None]:
# Raw target values against row position.
fig, ax = plt.subplots()
ax.scatter(range(train.shape[0]), train[f], s=1, label=f)
ax.set_title(f + ' (raw)')
ax.legend()
plt.show()

# Transformed target values against row position. The legend uses the transformed
# column name; previously both figures were labelled with the raw name and could not
# be told apart.
fig, ax = plt.subplots()
ax.scatter(range(train.shape[0]), train[transfF], s=1, label=transfF)
ax.set_title(transfF + ' (transformed)')
ax.legend()
plt.show()

In [None]:
f = 'ptend_q0002_55'
transfF = f'{f}_transf'

# Quantile-transform the target to a uniform distribution: fitted on train only,
# then applied unchanged to val.
qt = QuantileTransformer(output_distribution='uniform', n_quantiles=10000, random_state=0)
train[transfF] = qt.fit_transform(train[[f]])
val[transfF] = qt.transform(val[[f]])

# Overlay transformed train and val values against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(train)), train[transfF], s=1)
ax.scatter(range(len(val)), val[transfF], s=1)

In [None]:
# Author's note: fails for extreme cases.
f = 'ptend_q0002_26'
transfF = f'{f}_transf'

# Power-transform the target: fitted on train only, then applied unchanged to val.
# The fitted transformer is kept in `qt` because later cells call qt.inverse_transform.
qt = PowerTransformer()
train[transfF] = qt.fit_transform(train[[f]])
val[transfF] = qt.transform(val[[f]])

# Overlay transformed train and val values against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(train)), train[transfF], s=1)
ax.scatter(range(len(val)), val[transfF], s=1)

In [None]:
# Round-trip check: inverse-transform the transformed validation target and show it
# next to the original values. The column is reshaped through its underlying ndarray;
# calling np.reshape directly on a pandas Series depends on how numpy re-wraps the
# Series into a 2-D result, which is not reliable across versions.
qt.inverse_transform(val[transfF].to_numpy().reshape(-1, 1)), val[f]

In [None]:
"""------ BASE -----"""
# Baseline: fit LightGBM directly on the untransformed target `f`, without weights.
valSet = lgb.Dataset(val[allF], label=val[f], free_raw_data=False)
trainSet = lgb.Dataset(train[allF], train[f])

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'verbose': 1,
}

# Training always starts from scratch (the former `init_model=None` was a no-op).
gbm = lgb.train(
    params,
    trainSet,
    num_boost_round=200,
    valid_sets=[valSet],
)

predTrain = gbm.predict(train[allF])
predVal = gbm.predict(val[allF])
r2train = r2_score(train[f], predTrain)
r2test = r2_score(val[f], predVal)
# No transformed-space scores exist for the baseline, so the dangling 'transormed'
# label that used to end this line was removed.
print('r2 scores', r2train, r2test)

In [None]:
"""------ transformed test -----"""
# Fit LightGBM on the transformed target with per-target sample weights.
# Hidden state: this cell uses `f`, `transfF` and the fitted `qt` left behind by
# whichever transform cell ran last - run the matching transform cell first.
valSet = lgb.Dataset(val[allF], label=val[transfF], free_raw_data=False)

# Weight = squared distance from the target mean, rescaled so the largest weight is 100.
# Series.max() replaces the builtin max(), which iterates the Series in Python.
weight = (train[f] - meanDict[f]) ** 2
weight = 100 * weight / weight.max()
trainSet = lgb.Dataset(train[allF], label=train[transfF], weight=weight)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'verbose': 1,
}

# Training always starts from scratch (the former `init_model=None` was a no-op).
gbm = lgb.train(
    params,
    trainSet,
    num_boost_round=200,
    valid_sets=[valSet],
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)],
)

# Predictions in transformed space (suffix 0) and mapped back with the fitted transformer.
# inverse_transform needs a 2-D column and returns one; ravel() brings the result back
# to 1-D so it has the same shape as the target it is compared and plotted against.
predTrain0 = gbm.predict(train[allF])
predVal0 = gbm.predict(val[allF])
predTrain = qt.inverse_transform(np.reshape(predTrain0, (-1, 1))).ravel()
predVal = qt.inverse_transform(np.reshape(predVal0, (-1, 1))).ravel()
r2train = r2_score(train[f], predTrain)
r2test = r2_score(val[f], predVal)
print('r2 scores', r2train, r2test, 'transormed', r2_score(train[transfF], predTrain0), r2_score(val[transfF], predVal0))

In [None]:
""" in feature space """
# Train: true target vs. back-transformed prediction, plotted against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(train)), train[f], s=1, label=f)
ax.scatter(range(len(train)), predTrain, s=1, label='pred_train')
ax.legend()
plt.show()

# Validation: same comparison.
fig, ax = plt.subplots()
ax.scatter(range(len(val)), val[f], s=1, label=f)
ax.scatter(range(len(val)), predVal, s=1, label='pred_test')
ax.legend()
plt.show()

In [None]:
""" in transformed space """
# Train: transformed target vs. raw model output, plotted against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(train)), train[transfF], s=1, label=f)
ax.scatter(range(len(train)), predTrain0, s=1, label='pred_train')
ax.legend()
plt.show()

# Validation: same comparison.
fig, ax = plt.subplots()
ax.scatter(range(len(val)), val[transfF], s=1, label=f)
ax.scatter(range(len(val)), predVal0, s=1, label='pred_test')
ax.legend()
plt.show()

# Where does the biggest contribution to the weight come from?

In [None]:
train = pd.concat([baseTrain, largeV], axis=0)

# For every target: squared z-score divided by the row's existing `weight` column.
# All columns are built first and attached with a single concat; inserting them one
# at a time in a loop fragments the frame (pandas PerformanceWarning) when allT2 is
# long. Values are passed as plain arrays on train's own index, so no label alignment
# takes place even though the concatenated frame may contain repeated index labels.
weightContribution = pd.DataFrame(
    {
        target + '_weightContribution': (
            ((train[target] - meanDict[target]) / stdDict[target]) ** 2 / train['weight']
        ).to_numpy()
        for target in allT2
    },
    index=train.index,
)
train = pd.concat([train, weightContribution], axis=1)

In [None]:
# Limit the rendered frame to 10 columns, then display it.
pd.options.display.max_columns = 10
train

In [None]:
# The 20 largest weight contributions for ptend_q0002_26, in descending order.
(
    train['ptend_q0002_26_weightContribution']
    .sort_values(ascending=False)
    .iloc[:20]
)