In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster with six single-threaded worker processes and
# connect a client to it.
cluster = LocalCluster(
    n_workers=6,
    threads_per_worker=1,
    processes=True,
)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# Parquet folders that together make up the training data.
# NOTE(review): the names suggest four 25% slices of the full set -- confirm.
folders = ['train0_25', 'train25_50', 'train50_75', 'train75_100']

# Open every folder as its own Dask DataFrame.
dfs = list(map(dd.read_parquet, folders))

# Stack the per-folder frames into one Dask DataFrame.
data = dd.concat(dfs)

In [None]:
# Seed NumPy's global RNG so the partition shuffle below is reproducible.
np.random.seed(42)

# Shuffle the partition indices of `data`, then cut them 70/30 into a training
# pool and a held-out pool.
orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)  # shuffles in place

trainSep = int(0.7 * data.npartitions)
# The held-out pool currently runs to the last partition (an earlier variant
# capped it at trainSep + int(0.05 * data.npartitions)).
valEnd = data.npartitions

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The file handle is kept local to this function on purpose: the notebook
    uses the global name ``f`` for the target column, and opening the files
    with ``as f`` at cell level would overwrite it with a closed file object.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this on
    files that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed lookup tables; `meanDict` is indexed by column name further down.
# NOTE(review): presumably all four are keyed by column name -- confirm.
meanDict = _load_pickle('meanDict_allT.pkl')
stdDict = _load_pickle('stdDict_allT.pkl')
minDict = _load_pickle('minVal_allT2.pkl')
zscoreDict = _load_pickle('zScore_allT2.pkl')

In [None]:
# Materialise a fixed number of the shuffled partitions as pandas frames.
# Partitions are computed five at a time and the pieces are concatenated.

# Training sample: the first 30 partitions of the shuffled training pool.
nPartitions = 30
baseTrain = pd.concat([
    data.partitions[sampledPartIdxTrain[5 * i:5 * i + 5]].compute()
    for i in range(nPartitions // 5)
])

# Validation sample: the first 15 partitions of the shuffled held-out pool.
nPartitions = 15
baseVal = pd.concat([
    data.partitions[sampledPartIdxTest[5 * i:5 * i + 5]].compute()
    for i in range(nPartitions // 5)
])

In [None]:
# Load the supplementary table stored at 'large_training_df_0001' into pandas.
# NOTE(review): the name suggests rows with large target values -- confirm
# what selection produced this file.
largeV = pd.read_parquet('large_training_df_0001')

In [None]:
# Split the supplementary rows 70/30 by row order and append each part to the
# corresponding base sample.
#
# This cell used to start by computing `filtered` from `largeV[f]`, but `f` is
# only given a target-column name in the next cell, so that statement failed on
# a fresh top-to-bottom run. Its result was not used here and `filtered` is
# rebuilt where the target is chosen, so the statement was removed.
sep = int(largeV.shape[0] * 0.7)
# Exclusive upper bound for the slice below. The previous `shape[0] - 1`
# silently dropped the last row from the validation part.
end = largeV.shape[0]
train = pd.concat([baseTrain, largeV.iloc[0:sep]])
val = pd.concat([baseVal, largeV.iloc[sep:end]])

In [None]:
# Target column for this run. 'ptend_q0001_17' and 'ptend_q0002_26' were also
# listed here, but only the last assignment ever took effect.
f = 'ptend_q0002_55'

# Keep the supplementary rows whose target magnitude exceeds |meanDict[f]|.
filtered = largeV.loc[largeV[f].abs() > abs(meanDict[f])]
sep = int(0.7 * len(filtered))
end = len(filtered) - 1

# All filtered rows go into training; validation is the base sample only
# (an earlier variant split `filtered` at `sep` between the two).
train = pd.concat([baseTrain, filtered])
val = baseVal

valSet = lgb.Dataset(data=val[allF], label=val[f], free_raw_data=False)

# Distance of each target from its mean. Stored as a column, but not passed to
# the training Dataset (sample weighting is currently switched off).
train['weight'] = (train[f] - meanDict[f]).abs()
trainSet = lgb.Dataset(data=train[allF], label=train[f])

# num_leaves, learning_rate, feature_fraction, bagging_fraction and
# bagging_freq are deliberately not set here.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='l2',
    verbose=1,
)

# No previous booster: training starts from scratch.
gbm = None
gbm = lgb.train(
    params=params,
    train_set=trainSet,
    num_boost_round=100,
    valid_sets=valSet,
    init_model=gbm,
)

# Score the fitted model on both splits.
predTrain, predVal = (gbm.predict(frame[allF]) for frame in (train, val))
r2train, r2test = r2_score(train[f], predTrain), r2_score(val[f], predVal)
print('r2 scores', r2train, r2test)

In [None]:
def _scatter_target_vs_pred(frame, preds, pred_label):
    """Draw one figure overlaying the true target and the predictions.

    Both series are plotted against row position. The true values come from
    column ``f`` of ``frame`` (``f`` is the notebook-level target name) and
    are labelled with that name; ``preds`` is labelled ``pred_label``.
    """
    positions = range(frame.shape[0])
    plt.scatter(x=positions, y=frame[f], s=1, label=f)
    plt.scatter(x=positions, y=preds, s=1, label=pred_label)
    plt.legend()
    plt.show()


# One figure for the training rows, one for the validation rows.
_scatter_target_vs_pred(train, predTrain, 'pred_train')
_scatter_target_vs_pred(val, predVal, 'pred_test')