In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster made of 6 single-threaded worker processes and
# attach a client to it; later Dask computations in this notebook run on it.
cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Re-attach to the Dask client that is already running instead of blindly
# starting another LocalCluster: every extra cluster spawns 6 more worker
# processes and leaves the previous ones running in the background.
try:
    client = Client.current()  # raises ValueError when no client exists yet
    cluster = client.cluster
except ValueError:
    # Nothing is running (e.g. fresh kernel, or the cluster was shut down).
    cluster = LocalCluster(processes=True, n_workers=6, threads_per_worker=1)
    client = Client(cluster)

In [None]:
import tensorflow as tf

from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Concatenate,BatchNormalization, Reshape
from tensorflow.keras.models import Model

In [None]:
# Load the two pre-built training subsets from their parquet datasets.
large_tr, small_tr = (
    pd.read_parquet(dataset_path)
    for dataset_path in ('large_training_df_0001', 'small_training_df_0001')
)

In [None]:
# Build a 50/50 training sample: the same number of rows from each subset.
# All draws are seeded so that a kernel restart reproduces the same sample.
ROWS_PER_SUBSET = 70000
SAMPLE_SEED = 42

# Alternative filter that was tried and disabled:
#large_tr = large_tr.loc[large_tr.norm_weight > 0.0005]

# `large_tr` is drawn WITH replacement (rows may repeat); `small_tr` is drawn
# without replacement, so it must hold at least ROWS_PER_SUBSET rows.
large_tr = large_tr.sample(n=ROWS_PER_SUBSET, replace=True, random_state=SAMPLE_SEED)
small_tr = small_tr.sample(n=ROWS_PER_SUBSET, random_state=SAMPLE_SEED)

sample_train = pd.concat([large_tr, small_tr], axis=0)
# frac=1 returns every row in random order, i.e. a shuffle of the stacked frame.
sample_train = sample_train.sample(frac=1, random_state=SAMPLE_SEED)

# Free the intermediates. Note this cell cannot be re-run without re-running
# the loading cell first, because the source frames are gone afterwards.
del large_tr, small_tr

In [None]:
# Parquet folders for the lazy Dask frames: the first three quarters of the
# data form `train`, the last quarter is held out as `test`.
folders_tr = ['train0_25', 'train25_50', 'train50_75']
folders_te = ['train75_100']

# Nothing is read eagerly here; each folder becomes a lazy Dask DataFrame.
train = dd.concat(list(map(dd.read_parquet, folders_tr)))

dfs = list(map(dd.read_parquet, folders_te))
test = dd.concat(dfs)

In [None]:
# Pull a seeded random subset of the held-out Dask frame into pandas.
TEST_SAMPLE_ROWS = 30000

test_size = test.shape[0].compute()  # triggers a pass over the test data to count rows

# Dask samples by fraction, so the resulting row count is only approximately
# TEST_SAMPLE_ROWS. The fraction is clamped to 1.0 because sampling without
# replacement cannot return more rows than exist.
test_frac = min(1.0, TEST_SAMPLE_ROWS / test_size)
sample_test = test.sample(frac=test_frac, random_state=42).compute()

In [None]:
# Display the sampled test frame as the cell output for a quick visual check.
sample_test

In [None]:
# add features
# `addFeatures` is not defined in this notebook (presumably star-imported from
# data_helpers - verify). It is called on each frame and returns a pair that is
# unpacked into the replacement frame and `addF`.
# NOTE(review): both frames are rebound to the helper's output, so re-running
# this cell feeds already-augmented frames back in - confirm that is safe.
sample_train, addF = addFeatures(sample_train)
# `addF` is overwritten here; the value from the test call is the one kept.
sample_test, addF = addFeatures(sample_test)

# Feature list for models that use the extra columns: added features followed by `allF`.
trainF = addF + allF

# Try a fully connected network (FC)

In [None]:
# Fully connected baseline: BatchNorm -> two widening Dense layers -> two
# narrowing Dense layers -> linear output with one unit per target.
# Only the `allF` columns are fed to this network (not the extra `addF` ones).
tf.random.set_seed(42)

numF = len(allF)   # number of input feature columns
numT = len(allT2)  # number of target columns

# `shape` has to be a tuple: `(numF)` is just the int `numF`, `(numF,)` is the
# 1-tuple Keras documents for a flat feature vector.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# other cells may still refer to it.
input = Input(shape=(numF,))

x = BatchNormalization()(input)

print(x.shape)
# Widening stack: numF units, then 2 * numF units.
for i in range(2):
    x = Dense((i + 1) * numF, activation='relu')(x)
    print(x.shape)
# Narrowing stack: same width as the previous layer, then half of it.
# Integer division keeps `units` an int; the former `1/(i+1)*x.shape[1]`
# produced a float, which Dense either truncates implicitly or rejects
# depending on the Keras version.
for i in range(2):
    x = Dense(x.shape[1] // (i + 1), activation='relu')(x)
    print(x.shape)
x = Dense(numT, activation='linear', name='output')(x)

print(x.shape)
output = x

model = Model(inputs=input, outputs=output)
# `RSquaredMetric` is not defined in this notebook (presumably from the
# star-imported `metrics` module - verify).
model.compile(optimizer='adam', loss='mse', metrics=[RSquaredMetric()])

hist = model.fit(
    sample_train[allF],
    sample_train[allT2],
    epochs=15,
    batch_size=512,
    validation_data=(sample_test[allF], sample_test[allT2]),
)

In [None]:
# Run the fitted network on the `allF` columns of the sampled test frame;
# `p` holds the raw predictions returned by Keras.
p = model.predict(sample_test[allF])

In [None]:
#loss: 8.8343 - r_squared: -25891405363985514496.0000 - val_loss: 10.8290 - val_r_squared: -39792953086422548480.0000
# with norm_weight
#loss: 0.0076 - r_squared: -189225829431496491027386073088.0000 - val_loss: 118.8812 - val_r_squared: -14895986782100527251456.0000
# with weight
#loss: 152517.7969 - r_squared: -732297370498595883287838720000.0000 - val_loss: 117.0078 - val_r_squared: -13273999613704347320320.0000
# shuffled data with "weight"
#loss: 157569.0312 - r_squared: -360110852060544499712.0000 - val_loss: 118.2304 - val_r_squared: -196862049473160990949376.0000

# Try a LightGBM (LGBM) model

In [None]:
# Train one LightGBM regressor per target column, score it with R^2 on the
# sampled train/test frames and save the model to disk.
# Unsampled training behaved too erratically; train on the 50/50 sample
# (0.0001 threshold) instead -> high weight on the far-away samples.
MODEL_DIR = 'individualLGBMs_sampled'
os.makedirs(MODEL_DIR, exist_ok=True)  # make sure the save target exists before training

# Identical for every target, so it is built once outside the loop.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    # Tuning knobs that were tried and left at their defaults:
    #'num_leaves': 15,
    #'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': -1
}

r2dict = {}  # target name -> {'train': r2, 'test': r2}
# Only one target is trained for now; iterate over `allT2` to cover all of
# them. `f`, `predTrain` and `predVal` keep the values of the last target and
# are read by the plotting cell.
for f in ['ptend_q0002_26']:
    # Sample weights were tried and disabled: weight=sample_train['weight']
    train_set = lgb.Dataset(sample_train[trainF], label=sample_train[f], free_raw_data=False)
    # The validation set references the training set, as the LightGBM docs
    # recommend for validation data.
    valid_set = lgb.Dataset(sample_test[trainF], label=sample_test[f],
                            reference=train_set, free_raw_data=False)

    gbm = lgb.train(
        params,
        train_set,
        num_boost_round=100,
        valid_sets=[valid_set],
        # Early stopping was tried and disabled:
        #callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    predTrain = gbm.predict(sample_train[trainF])
    predVal = gbm.predict(sample_test[trainF])
    r2train = r2_score(sample_train[f], predTrain)
    r2test = r2_score(sample_test[f], predVal)
    print('r2 scores', r2train, r2test, f)
    r2dict[f] = {'train': r2train, 'test': r2test}

    # Saved twice: once tagged with the test score, once under a stable name.
    gbm.save_model(f'{MODEL_DIR}/model_{f}_{round(r2test, 3)}.txt')
    gbm.save_model(f'{MODEL_DIR}/model_{f}.txt')

In [None]:
#ptend_t_0
#base: r2 scores -3.594 ptend_t_0
#addF: r2 scores 0.9354909909280742 -5.574277330887769 ptend_t_0
#with callback: r2 scores 0.17447142147913053 0.10560585670117484 ptend_t_0 ->doesn't fit well at all

#ptend_q0002_26
#with callback: r2 scores 0.999897657944176 -7095520.075311637 ptend_q0002_26 -> doesn't fit well, we overpredict too large values
#without weight: r2 scores 0.8236448689419253 -0.5606788450609883 ptend_q0002_26
#without weight, 200 rounds: r2 scores 0.9269528462506769 -102.56223446349581 ptend_q0002_26
# Overlay the true values of target `f` and the model's predictions against
# the row position, one figure per split: test first, then train.
for split_frame, split_pred, pred_label in (
    (sample_test, predVal, 'pred_test'),
    (sample_train, predTrain, 'pred_train'),
):
    row_positions = range(split_frame.shape[0])
    plt.scatter(x=row_positions, y=split_frame[f], s=1, label=f)
    plt.scatter(x=row_positions, y=split_pred, s=1, label=pred_label)
    plt.legend()
    plt.show()