In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb

import warnings

# Silence only the UserWarnings whose message matches "Sending large graph"
# (presumably Dask's notice about shipping big task graphs -- confirm).
warnings.filterwarnings(
    "ignore",
    message=".*Sending large graph.*",
    category=UserWarning,
)

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Local cluster made of six worker processes with one thread each; the client
# attaches this notebook session to it.
cluster = LocalCluster(n_workers=6, threads_per_worker=1, processes=True)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Folders that are each read as a Parquet dataset.
folders = ['train0_25', 'train25_50', 'train50_75', 'train75_100']

# One Dask DataFrame per folder, in the order listed above.
dfs = list(map(dd.read_parquet, folders))

# Stack the per-folder frames into a single Dask DataFrame.
data = dd.concat(dfs)

In [None]:
# Seed NumPy's global RNG so the partition shuffle below is repeatable.
np.random.seed(42)

# Partition indices 0..npartitions-1, shuffled in place.
orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)

# The first 95% of the shuffled partitions go to training; the rest are held out.
trainSep = int(0.95 * data.npartitions)
valEnd = data.npartitions  # alternative bound: int(0.05 * data.npartitions) + trainSep

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

In [None]:
# Validation data: at most the first 30 held-out partitions, computed eagerly.
val_partition_ids = sampledPartIdxTest[:30]
val = data.partitions[val_partition_ids].compute()

In [None]:
# Scatter the validation target against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(val)), val['ptend_q0002_26'], s=1)
plt.show()
# Author's note on the plot: [-8*e-15:0] (presumably a value range of -8e-15 to 0 -- confirm)

In [None]:
# Split the training partitions by target magnitude relative to the mean.
#   tr_large: every training row with |target| >  separator * |mean|
#   tr_small: a random sample of about 10000 rows with |target| < separator * |mean|
# NOTE(review): the mean is taken over all of `data`, which includes the
# held-out partitions; rows exactly at the threshold land in neither subset.
f = 'ptend_q0002_26'
mean_f = data[f].mean().compute()

separator=1
small_target_rows = 10000  # approximate number of "small" rows to keep

threshold = separator * abs(mean_f)
train = data.partitions[sampledPartIdxTrain]
abs_target = abs(train[f])

tr_large = train.loc[abs_target > threshold]
tr_large = tr_large.compute()

tr_small = train.loc[abs_target < threshold]
size = tr_small.shape[0].compute()
if size > small_target_rows:
    # Seeded so the subset is the same on every Restart & Run All.
    tr_small = tr_small.sample(frac=small_target_rows / size, random_state=42).compute()
else:
    # Fewer rows than requested (possibly zero): keep them all. This avoids
    # frac > 1, which is invalid without replacement, and a division by zero.
    tr_small = tr_small.compute()

In [None]:
# Scatter the target of the large-magnitude training rows against row position.
fig, ax = plt.subplots()
ax.scatter(range(len(tr_large)), tr_large['ptend_q0002_26'], s=1)
plt.show()
# Author's note on the plot: [1.4e-16:8e-10]

In [None]:
# Export the large-magnitude rows. The separator is part of the file name so
# runs with a different threshold are not saved under a misleading "sep1"
# label; with separator=1 the name is unchanged.
tr_large.to_csv(f'{f}_largeV_sep{separator}_all.csv')

In [None]:
import sys

# Size of each training subset as reported by sys.getsizeof, in MB.
tuple(sys.getsizeof(frame) / 1e6 for frame in (tr_large, tr_small))

In [None]:
# Build a balanced training frame: 100000 rows drawn with replacement from
# each of the large- and small-magnitude subsets.
tr_large0 = tr_large.sample(n=100000, random_state=42, replace=True)
tr_small0 = tr_small.sample(n=100000, random_state=42, replace=True)

tr_new = pd.concat([tr_large0, tr_small0], axis=0)
# Shuffle the rows; seeded like the draws above so the row order is reproducible.
tr_new = tr_new.sample(frac=1, random_state=42).reset_index(drop=True)
del tr_large0, tr_small0

# NOTE(review): `allF` is not defined in the visible cells; presumably it is the
# feature-column list supplied by one of the star imports -- verify.
# free_raw_data=False keeps the raw frames on the Datasets so they can be read
# back later through `.data`.
trainSet = lgb.Dataset(tr_new[allF], label=tr_new[f], free_raw_data=False)
# The validation set references the training set, as the LightGBM docs
# recommend for validation data.
valSet = lgb.Dataset(val[allF], label=val[f], reference=trainSet, free_raw_data=False)

In [None]:
# LightGBM configuration: gradient-boosted trees, regression objective, L2 metric.
# Hyperparameters tried earlier and currently disabled:
#   num_leaves=15, learning_rate=0.05, feature_fraction=0.9,
#   bagging_fraction=0.8, bagging_freq=5
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='l2',
    verbose=-1,
)

# Train for up to 100 rounds with an early-stopping callback (stopping_rounds=10)
# evaluated on valSet.
early_stop = lgb.early_stopping(stopping_rounds=10)
gbm = lgb.train(
    params,
    trainSet,
    num_boost_round=100,
    valid_sets=valSet,
    callbacks=[early_stop],
    init_model=None,
)

# Predict on the raw frames stored on each Dataset and score with R^2.
predTrain = gbm.predict(trainSet.data)
r2train = r2_score(trainSet.label, predTrain)

predVal = gbm.predict(valSet.data)
r2test = r2_score(valSet.label, predVal)

print('r2 scores', r2train,r2test)

# Experiment log (train R^2, validation R^2):
#sep 3
# r2 scores 0.9476652354318174 -6196156161.0836525 : 10k/10k not shuffled, same if shuffled
# r2 scores 0.9999999992944911 -17.38356548018086  : 100k/100k
# r2 scores 0.9999999992944553 -17.78288642970076  : 100k/100k with 15leaves max
#sep 1
# r2 scores 0.9995889016184228 -90207697.06780742

# without features, but all large values

In [None]:
# Overlay training predictions and ground-truth labels by row position.
fig, ax = plt.subplots()
train_rows = range(len(predTrain))
for series, tag in ((predTrain, 'pred'), (trainSet.label, 'gt')):
    ax.scatter(train_rows, series, s=1, label=tag)
ax.legend()
plt.show()

In [None]:
# Overlay validation predictions and ground-truth labels by row position.
fig, ax = plt.subplots()
val_rows = range(len(predVal))
for series, tag in ((predVal, 'pred'), (valSet.label, 'gt')):
    ax.scatter(val_rows, series, s=1, label=tag)
ax.legend()
plt.show()