In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)
import sys
import pickle 

from data_helpers import *
from metrics import *

In [None]:
# The import cell already started a LocalCluster and bound `cluster`/`client`.
# Shut those down before creating a fresh cluster; otherwise the first
# cluster's six worker processes keep running unused next to the new ones.
client.close()
cluster.close()

cluster = LocalCluster(processes=True, n_workers=6, threads_per_worker=1)
client = Client(cluster)

In [None]:
# Parquet dataset directories to load (the names suggest consecutive slices
# of the training data -- TODO confirm).
folders = ['train0_25', 'train25_50', 'train50_75', 'train75_100']

# One lazy Dask DataFrame per directory, in the order listed above.
dfs = list(map(dd.read_parquet, folders))

# Stack the per-directory frames into a single lazy DataFrame.
data = dd.concat(dfs)

In [None]:
# Column names 'ptend_q0002_0' ... 'ptend_q0002_59'.
ptend_q002 = ['ptend_q0002_' + str(level) for level in range(60)]

# Target families whose first 12 suffixes (_0 ... _11) are excluded below.
targetsToDrop12 = ['ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']

# Author's note kept from the original cell: 'ptend_q0002_12', '_13' and '_14'
# were once listed here too; "attention, I think i also need to predict _15".
dropT = [family + '_' + str(level) for family in targetsToDrop12 for level in range(12)]

# Every name in allT (defined outside the visible cells) that is not dropped,
# in allT's original order.
allT2 = [target for target in allT if target not in dropT]

# Mean, standard deviation & max computation

In [None]:
# Mean of every column in allT, computed in a single pass: reducing the
# multi-column frame builds one task graph instead of launching a separate
# `.compute()` over the dataset for each column.
# `.to_dict()` yields {column name: mean}, the same mapping the per-column loop built.
meanDict = data[list(allT)].mean().compute().to_dict()

# Persist the statistics so later sessions can reload them instead of recomputing.
with open('meanDict_allT.pkl', 'wb') as mean_file:
    pickle.dump(meanDict, mean_file)

In [None]:
# Standard deviation of every column in allT, computed in a single pass:
# reducing the multi-column frame builds one task graph instead of launching
# a separate `.compute()` over the dataset for each column.
# `.to_dict()` yields {column name: std}, the same mapping the per-column loop built.
stdDict = data[list(allT)].std().compute().to_dict()

# Persist the statistics so later sessions can reload them instead of recomputing.
with open('stdDict_allT.pkl', 'wb') as std_file:
    pickle.dump(stdDict, std_file)

In [None]:
# Largest absolute value of every column in allT, computed in a single pass:
# reducing the multi-column frame builds one task graph instead of launching
# a separate `.compute()` over the dataset for each column.
# `.to_dict()` yields {column name: max |value|}, the same mapping the per-column loop built.
maxDict = data[list(allT)].abs().max().compute().to_dict()

# Persist the statistics so later sessions can reload them instead of recomputing.
with open('maxDict_allT.pkl', 'wb') as max_file:
    pickle.dump(maxDict, max_file)

In [None]:
# Restore the per-column means written to 'meanDict_allT.pkl'.
with open('meanDict_allT.pkl', 'rb') as mean_file:
    meanDict = pickle.load(mean_file)

# Restore the per-column standard deviations written to 'stdDict_allT.pkl'.
with open('stdDict_allT.pkl', 'rb') as std_file:
    stdDict = pickle.load(std_file)

# transform df

In [None]:
def calculate_squared_error(row, mean_values, std_values, columns):
    """Sum of squared standardised values of one row.

    For every name in `columns`, the row's value is centred with
    `mean_values[name]`, scaled by `std_values[name]`, squared and added up.

    Parameters:
        row: mapping-like object indexable by column name (e.g. a pandas row).
        mean_values: mapping from column name to its mean.
        std_values: mapping from column name to its standard deviation.
        columns: iterable of column names to include.

    Returns:
        The accumulated sum; 0 when `columns` is empty.
    """
    total = 0
    for name in columns:
        standardised = (row[name] - mean_values[name]) / std_values[name]
        total = total + standardised ** 2
    return total

In [None]:
def _row_squared_error(row):
    """Squared standardised error of one row over the allT2 columns."""
    return calculate_squared_error(row, meanDict, stdDict, allT2)


# Lazy per-row series; meta declares a float64 result named 'x'.
squared_error_per_row = data.apply(_row_squared_error, axis=1, meta=('x', 'f8'))

# Materialise the values as a pandas object (this runs the whole graph).
result = squared_error_per_row.compute()

In [None]:
# Save the computed per-row values to CSV so the expensive compute above
# does not have to be repeated.
result.to_csv('sumSquaredError.csv')

In [None]:
# Reload the saved values; `result` is now whatever pandas parses from the CSV.
# NOTE(review): the cells below select the column labelled '0' -- confirm that
# matches the header actually written to the file.
result = pd.read_csv('sumSquaredError.csv')

In [None]:
# Divide the '0' column by its own maximum.
normRes = result['0']/max(result['0'])

In [None]:
# Replace the current index with a fresh one; the trailing drop of the
# resulting 'index' column is disabled.
# NOTE(review): re-running this cell applies reset_index() again.
normRes = normRes.reset_index()#.drop('index',axis=1)

In [None]:
# Show the rows whose normalised value exceeds 0.0001.
# NOTE(review): this uses the integer label 0 while the column was selected
# with the string '0' earlier -- confirm which label normRes really carries.
normRes.loc[normRes[0] > 0.0001]

In [None]:
# Disabled attempt: attach the normalised values directly as a new column.
#data = data.assign(norm_weight = normRes[0].values)
# Turn normRes into a Dask collection with as many partitions as `data`.
# NOTE(review): an equal partition count does not by itself mean the partitions
# line up row-for-row with `data` -- verify before combining the two.
dask_series = dd.from_pandas(normRes, npartitions=data.npartitions)#,chunks=data.npartitions)

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import dask.array as da

# Wrap normRes[0] in a Dask array whose chunk size is the row count of the
# first partition of `data` (obtaining that count runs a Dask computation).
dask_array = da.from_array(normRes[0], chunks=data.partitions[0].shape[0].compute())

## by assign function

In [None]:
# NOTE(review): the built-in min()/max() iterate over the object they are given.
# If normRes / result are DataFrames at this point, that means their column
# labels rather than their values -- confirm this is the intended check.
min(normRes), max(result)

In [None]:
# Lazily add a 'weight' column: calculate_squared_error applied to every row
# over the allT2 columns, using meanDict and stdDict.
data1 = data.assign(weight=data.apply(lambda row: calculate_squared_error(row, meanDict, stdDict, allT2), axis=1, meta=('x', 'f8')))
# normWeight = weight divided by the maximum of result['0'] (taken from the
# values computed earlier, not from data1 itself).
data1['normWeight'] = data1['weight'] / max(result['0'])

In [None]:
# Lazy selection of the rows whose normWeight exceeds 0.0001.
largeSamp = data1.loc[(data1['normWeight'] > 0.0001)] 

In [None]:
# Materialise the selection; this rebinds largeSamp from the lazy Dask object
# to its computed result, so the cell is not meant to be run twice in a row.
largeSamp = largeSamp.compute()

## add new index

In [None]:
# Display normRes as it currently stands before using it to build an index.
normRes

In [None]:
# Convert normRes to a Dask collection with the same number of partitions as `data`.
# NOTE(review): if normRes is a DataFrame here (after reset_index), new_index is
# a DataFrame too, not a single column -- confirm before assigning it below.
new_index = dd.from_pandas(normRes, npartitions=data.npartitions)

In [None]:
# Attach new_index to `data` as a column named 'new_index'.
# NOTE(review): this rebinds `data`, so re-running the cell operates on the
# already-modified frame.
data = data.assign(new_index=new_index)

In [None]:
# Make 'new_index' the index of `data`. sorted=True declares the column to be
# already sorted -- presumably to avoid a shuffle; verify that the values
# really are sorted, otherwise the resulting divisions are wrong.
data = data.set_index('new_index', sorted=True)

## Compute the weight per partition and write it back to the source folders

In [None]:
# Parquet directories that get a 'weight' column added in place below.
# 'testPar' is currently disabled.
folders = [
    #'testPar',
    'train0_25',
    'train25_50',
    'train50_75',
    'train75_100'
]

def calculate_squared_errorPart(partition, mean_values, std_values, columns):
    """Return a copy of `partition` with a per-row 'weight' column added.

    weight = sum over `columns` of
             ((partition[col] - mean_values[col]) / std_values[col]) ** 2

    The computation is vectorised over the whole partition instead of looping
    over rows in Python, and the input frame is left untouched: functions
    handed to `map_partitions` should not modify the partition they receive.

    Parameters:
        partition: pandas DataFrame containing every name in `columns`.
        mean_values: mapping from column name to its mean.
        std_values: mapping from column name to its standard deviation.
        columns: iterable of column names that contribute to the weight.

    Returns:
        A new DataFrame: `partition` plus a float 'weight' column.
    """
    cols = list(columns)
    means = pd.Series({col: mean_values[col] for col in cols}, dtype='float64')
    stds = pd.Series({col: std_values[col] for col in cols}, dtype='float64')

    # DataFrame-minus-Series aligns the Series index with the column labels.
    standardised = (partition[cols] - means) / stds

    # skipna=False: a missing input yields a NaN weight (as the row-wise
    # Python sum did) rather than being silently ignored.
    weight = (standardised ** 2).sum(axis=1, skipna=False)
    return partition.assign(weight=weight)

# For every folder: read it lazily, add the 'weight' column partition by
# partition, and write the result back.
# NOTE(review): the output path is the same directory that is being read;
# confirm this read-then-overwrite is safe with the Dask version in use
# (writing to a new directory and swapping afterwards would be safer).
for folder in folders:
    data = dd.read_parquet(folder)

    # No `meta` is passed (the attempt below is disabled), so the output
    # schema is left for Dask to work out.
    data = data.map_partitions(
        calculate_squared_errorPart,
        mean_values=meanDict,
        std_values=stdDict,
        columns=allT2,
        #meta=('squared_error', 'float64')
        )
    
    data.to_parquet(folder)



In [None]:
# The normalisation constant must be the largest weight across ALL folders.
# After the loop above `data` only refers to the last folder processed, so
# read the stored 'weight' column back from every folder instead (this also
# avoids re-running the per-partition weight computation).
maxVal = dd.concat(
    [dd.read_parquet(folder, columns=['weight']) for folder in folders]
)['weight'].max().compute()

In [None]:
# Parquet directories that get a 'norm_weight' column added in place below.
# NOTE(review): 'testPar' is included here but was disabled when 'weight' was
# written -- confirm that folder actually has a 'weight' column.
folders = [
    'testPar',
    'train0_25',
    'train25_50',
    'train50_75',
    'train75_100'
]

def calculate_norm_weight(partition, maxVal):
    """Return a copy of `partition` with 'norm_weight' = 'weight' / maxVal.

    The input frame is not modified: functions handed to `map_partitions`
    should not mutate the partition they receive.

    Parameters:
        partition: pandas DataFrame that has a 'weight' column.
        maxVal: scalar divisor applied to every weight.

    Returns:
        A new DataFrame: `partition` plus the 'norm_weight' column.
    """
    return partition.assign(norm_weight=partition['weight'] / maxVal)


# For every folder: read it lazily, add 'norm_weight' (= weight / maxVal)
# partition by partition, and write the result back.
# NOTE(review): the output path is the same directory that is being read;
# confirm this read-then-overwrite is safe with the Dask version in use.
for folder in folders:
    data = dd.read_parquet(folder)

    data = data.map_partitions(
        calculate_norm_weight,
        maxVal = maxVal
        )
    
    data.to_parquet(folder)



# get large values per feature

In [None]:
# A value counts as "large" when |value| > separator * |column mean|.
separator = 10

# Features ptend_q0002_20 ... ptend_q0002_26.
subF = ['ptend_q0002_' + str(level) for level in range(20, 27)]

# OR together the per-feature "large" masks: a row is selected as soon as
# any one of the features exceeds its threshold.
condition = None
for f in subF:
    is_large = abs(data[f]) > separator*abs(meanDict[f])
    condition = is_large if condition is None else (condition | is_large)

In [None]:
# Keep only the rows where `condition` holds, then materialise them with
# .compute().
tr_large = data.loc[condition]
tr_large = tr_large.compute()
# Author's timing note:
# 5.5 min for sep1

In [None]:
# Threshold multiplier applied to |column mean| for the counts below.
a = 1
trShape = len(tr_large)

# For every feature print: name, number of selected rows whose |value|
# exceeds a * |mean|, and that number as a share of tr_large.
for f in subF:
    above_threshold = tr_large[f].abs() > a*abs(meanDict[f])
    count26 = int(above_threshold.sum())
    print(f, count26, round(count26/trShape, 2)*100)

In [None]:
# Rows of tr_large where ptend_q0002_26 OR ptend_q0002_25 exceeds
# separator * |mean| in absolute value (uses the current value of `separator`).
tr_large.loc[(abs(tr_large['ptend_q0002_26']) > separator*abs(meanDict['ptend_q0002_26'])) | (abs(tr_large['ptend_q0002_25']) > separator*abs(meanDict['ptend_q0002_25']))]

In [None]:
# Rows of `data` where |ptend_q0002_25| > 3 * |its mean|, materialised with
# .compute().
# NOTE(review): this rebinds the global `separator` used by other cells.
separator=3
tr_large_25 = data.loc[(abs(data['ptend_q0002_25']) > separator*abs(meanDict['ptend_q0002_25']))]
tr_large_25 = tr_large_25.compute()

In [None]:
# Rows of `data` where |ptend_q0002_27| > 3 * |its mean|, materialised with
# .compute().
# NOTE(review): same logic as the ptend_q0002_25 cell -- a helper taking the
# feature name would remove the duplication.
separator=3
tr_large_27 = data.loc[(abs(data['ptend_q0002_27']) > separator*abs(meanDict['ptend_q0002_27']))]
tr_large_27 = tr_large_27.compute()

In [None]:
# Scatter each feature of the selected rows against the row position;
# the plotting order (26, 25, 24, 27) decides which points end up on top.
row_positions = range(tr_large.shape[0])
for feature in ('ptend_q0002_26', 'ptend_q0002_25', 'ptend_q0002_24', 'ptend_q0002_27'):
    plt.scatter(x=row_positions, y=tr_large[feature], s=1, label=feature)
plt.legend()
plt.show()

# analyze norm weight

In [None]:
# Folders used for training and the folder held out as the test set.
folders_tr = ['train0_25', 'train25_50', 'train50_75']
folders_te = ['train75_100']

# Lazy training frame: the training folders stacked in order.
dfs = list(map(dd.read_parquet, folders_tr))
train = dd.concat(dfs)

# Lazy test frame, built the same way from the held-out folder.
dfs = list(map(dd.read_parquet, folders_te))
test = dd.concat(dfs)

In [None]:
# Materialise the training rows whose norm_weight exceeds 0.0001.
# Author's notes on runtime and row count per threshold:
large_tr = train.loc[train['norm_weight'] > 0.0001].compute()
# 0.001 ~6min, 1250 samples
# 0.0005 ~8.5min, 4166 samples (min weight = 13126)
# 0.0001 ~11.5min, 90485 (min weight 2625) -> too many unimportant samples I guess

In [None]:
# Persist the high-weight rows to 'large_training_df_0001'.
large_tr.to_parquet('large_training_df_0001')

In [None]:
# Draw roughly 100k rows at random from the training set.
# `small_sample_size` is the total number of training rows; the fraction is
# capped at 1.0 in case the training set has fewer than 100k rows.
# A fixed random_state makes the sample reproducible after Restart & Run All.
small_sample_size = train.shape[0].compute()
small_tr = train.sample(frac=min(1.0, 100000/small_sample_size), random_state=0).compute()

In [None]:
# Persist the random sample to 'small_training_df_0001'.
small_tr.to_parquet('small_training_df_0001')