In [None]:
# !pip install pandarallel
# !pip install swifter

In [None]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/mshoush/5th/common_files')  


from DatasetManager import DatasetManager
import gc
import pandas as pd
import numpy as np
from sklearn.pipeline import FeatureUnion
import os
import EncoderFactory


# ---------------------------------------------------------------------------
# Column-name constants.
# ---------------------------------------------------------------------------
case_id_col = "case_id"
activity_col = "activity"
resource_col = "resource"
timestamp_col = "timestamp"
label_col = "label"
treatment_col = "Treatment1"

# Default output directory; the driver loop further down rebinds this name to
# a per-task / per-dataset sub-directory before anything is written.
results_dir = "./../prepared_data/"

# Values iterated by the driver loop and handed to the encoders as 'model'.
cls_methods = ["catboost", "other"]

# Dataset reference -> list of dataset names (each reference maps to itself).
dataset_ref_to_datasets = {
    ref: [ref] for ref in ("bpic2012", "bpic2017")
}

# Encoding name -> encoder method names combined into one FeatureUnion.
# Insertion order is the order in which the encodings are processed.
encoding_dict = dict(
    index=["static", "index"],
    laststate=["static", "last"],
    agg=["static", "agg"],
    combined=["static", "last", "agg"],
)

task_types = ["regression", "classification"]

# Missing-value filling flag for the encoders (see the encoder arguments in
# the driver loop).
fillna = True

# Split ratios. Only train_ratio and val_ratio are passed to split_data in
# this file; test_ratio documents the remainder.
train_ratio, val_ratio, test_ratio = 0.5, 0.3, 0.2

    
def save_file_to_parquet(file, results_dir, name, dataset_name, cat_cols=None):
    """Write ``file`` to ``<results_dir>/<name>_<dataset_name>.parquet``.

    Note that ``file`` is modified in place before it is written: the
    columns selected by ``cat_cols`` and the case-id column (when present)
    are converted to ``str``. Callers that reuse the frame afterwards see
    those conversions.

    Args:
        file: DataFrame to persist.
        results_dir: Target directory; created (with parents) if missing.
        name: File-name prefix.
        dataset_name: Dataset identifier appended to the file name.
        cat_cols: Optional positional column indexer (as accepted by
            ``DataFrame.iloc``) of columns to cast to ``str``.
    """
    # exist_ok=True replaces the exists()/makedirs() pair and avoids failing
    # when the directory appears between the check and the creation.
    os.makedirs(results_dir, exist_ok=True)

    if cat_cols is not None:
        file.iloc[:, cat_cols] = file.iloc[:, cat_cols].astype(str)

    # Encoded feature frames may not carry the case-id column; skip the cast
    # explicitly instead of hiding every possible error behind a bare except.
    if case_id_col in file.columns:
        file[case_id_col] = file[case_id_col].astype(str)

    file.to_parquet(os.path.join(results_dir, name + "_%s.parquet" % dataset_name))
    
    
    
def transform_chunked(feature_combiner, prefixes, chunk_size, case_id_col="case_id"):
    """Transform ``prefixes`` in batches of cases and stack the results.

    The distinct values of the case-id column are split into consecutive
    groups of at most ``chunk_size`` IDs (in order of first appearance).
    For each group, all rows belonging to those cases are passed to
    ``feature_combiner.transform`` and the outputs are stacked vertically.

    NOTE(review): the output rows follow the chunk order, which may differ
    from the row order of a single ``transform(prefixes)`` call -- confirm
    that labels computed separately line up with this ordering.

    Args:
        feature_combiner: Object exposing ``transform(frame)``; each result
            must be stackable with ``np.vstack``.
        prefixes: DataFrame containing the case-id column.
        chunk_size: Maximum number of distinct cases per batch (>= 1).
        case_id_col: Name of the case-id column. Defaults to ``"case_id"``.

    Returns:
        The vertically stacked transform outputs. For input without any
        cases, the result of a single ``transform(prefixes)`` call.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    unique_groups = prefixes[case_id_col].unique()
    print(len(unique_groups))

    # np.vstack rejects an empty sequence, so an input without cases goes
    # through one plain transform call instead of the chunk loop.
    if len(unique_groups) == 0:
        return feature_combiner.transform(prefixes)

    transformed_chunks = []
    for start_idx in range(0, len(unique_groups), chunk_size):
        # Slicing past the end of the array is safe, so no explicit clamp.
        chunk_groups = unique_groups[start_idx:start_idx + chunk_size]
        in_chunk = prefixes[case_id_col].isin(chunk_groups)
        transformed_chunks.append(feature_combiner.transform(prefixes[in_chunk]))

    return np.vstack(transformed_chunks)

    



def encode_data(feature_combiner, prefixes, task_type, cls_encoding='index'):
    """Encode prefixes and append the target and treatment columns.

    Args:
        feature_combiner: Fitted transformer exposing ``transform``.
        prefixes: Prefix DataFrame to encode.
        task_type: ``"classification"`` or ``"regression"``; selects which
            label getter of the module-level ``dataset_manager`` is used.
        cls_encoding: When ``'index'`` (the default) the transform runs in
            batches of 1000 cases via ``transform_chunked``; any other value
            transforms ``prefixes`` in a single call.

    Returns:
        A DataFrame with default integer column labels holding the encoded
        features, followed by the label column and the treatment column.
        All three parts pass through one ``np.column_stack`` call, so the
        frame is built from a single homogeneous array.

    Raises:
        ValueError: If ``task_type`` is not a supported value.
    """
    # Fail fast: previously an unknown task type only surfaced as an
    # UnboundLocalError on ``y`` after the expensive transform had run.
    if task_type not in ("classification", "regression"):
        raise ValueError(
            f"Unsupported task_type {task_type!r}; expected 'classification' or 'regression'"
        )

    if cls_encoding == 'index':
        x = transform_chunked(feature_combiner, prefixes, chunk_size=1000)
    else:
        x = feature_combiner.transform(prefixes)

    if task_type == "classification":
        y = dataset_manager.get_label_numeric(prefixes)
    else:
        y = dataset_manager.get_label_regression(prefixes)

    t = dataset_manager.get_treatment_numeric(prefixes)

    return pd.DataFrame(np.column_stack((x, y, t)))

  
  
def process_data(dataset, data_type):
    """Generate, persist and encode the prefix data of one split.

    Relies on module-level state set by the driver loop: ``dataset_manager``,
    ``min_prefix_length``, ``max_prefix_length``, ``results_dir``,
    ``dataset_name``, ``feature_combiner`` and ``task_type``.

    NOTE(review): ``cls_encoding`` is not forwarded to ``encode_data``, so
    its default (``'index'``, i.e. the chunked transform) applies to every
    encoding -- confirm this is intended.

    Args:
        dataset: Split (train/val/test) to build prefixes from.
        data_type: Split name used as the prefix-file name prefix.

    Returns:
        The frame produced by ``encode_data`` for this split.
    """
    gc.collect()

    prefix_frame = dataset_manager.generate_prefix_data(
        dataset, min_prefix_length, max_prefix_length
    )
    prefix_file_stem = "{}_prefixes".format(data_type)
    save_file_to_parquet(prefix_frame, results_dir, prefix_file_stem, dataset_name)

    result = encode_data(feature_combiner, prefix_frame, task_type)

    # Drop the (potentially large) prefix frame before handing back the result.
    del prefix_frame
    gc.collect()
    return result


def read_encoded_data(cls_encoding, dataset_name, results_dir):
    """Load the stored per-encoder frames and join them column-wise.

    Files are named ``dt_transformed_<part>_<cls_method>_<dataset_name>.parquet``
    inside ``results_dir``; ``cls_method`` is read from module scope.

    Args:
        cls_encoding: ``"combined"`` loads the static, laststate and agg
            parts; any other value loads the static part plus the part
            named after ``cls_encoding``.
        dataset_name: Dataset identifier used in the file names.
        results_dir: Directory holding the parquet files.

    Returns:
        The parts concatenated along ``axis=1`` with the static columns
        first (static, laststate, agg for ``"combined"``).
    """
    def _load(part):
        file_name = 'dt_transformed_%s_%s_%s.parquet' % (part, cls_method, dataset_name)
        return pd.read_parquet(os.path.join(results_dir, file_name))

    gc.collect()

    # read_order is the order files are opened in; concat_positions indexes
    # into it to give the left-to-right column order of the result.
    if cls_encoding == "combined":
        read_order, concat_positions = ("agg", "static", "laststate"), (1, 2, 0)
    else:
        read_order, concat_positions = (cls_encoding, "static"), (1, 0)

    loaded = [_load(part) for part in read_order]
    static_agg_df = pd.concat([loaded[pos] for pos in concat_positions], axis=1)

    del loaded
    gc.collect()
    return static_agg_df

def save_encoded_data(encoded_data, results_dir, dataset_name, cls_encoding, data_type):
    """Label, type and persist an encoded split.

    Column names and dtypes are taken from the stored encoder frames (see
    ``read_encoded_data``); the two trailing columns are named after
    ``dataset_manager.label_col`` and ``"Treatment"``. The result is written
    as ``<data_type>_<cls_method>_<cls_encoding>_encoded_<dataset_name>.parquet``.
    ``dataset_manager`` and ``cls_method`` are read from module scope.

    Side effect: the ``columns`` of the frame passed in are reassigned.

    Args:
        encoded_data: Frame produced by ``encode_data``.
        results_dir: Directory to read the schema from and write into.
        dataset_name: Dataset identifier used in the file names.
        cls_encoding: Encoding name; for ``'index'`` the object-typed
            feature columns are additionally cast to ``str`` on save.
        data_type: Split name used as the file-name prefix.
    """
    gc.collect()

    # Only names and dtypes are needed, so keep just the head of the frame.
    schema_sample = read_encoded_data(cls_encoding, dataset_name, results_dir).head()
    target_names = [str(dataset_manager.label_col), "Treatment"]

    encoded_data.columns = list(schema_sample.columns) + target_names
    encoded_data = encoded_data.astype(schema_sample.dtypes.to_dict())
    del schema_sample
    gc.collect()

    # Keep the first occurrence of every column name (this builds a new frame).
    first_occurrence = ~encoded_data.columns.duplicated()
    encoded_data = encoded_data.loc[:, first_occurrence]

    # Positions of object-typed feature columns, excluding the two targets.
    is_object = encoded_data.dtypes == 'object'
    is_target = encoded_data.columns.isin(target_names)
    cat_feat_idx = np.where(is_object & ~is_target)[0]

    save_file_to_parquet(
        encoded_data,
        results_dir,
        f"{data_type}_{cls_method}_{cls_encoding}_encoded",
        dataset_name,
        cat_cols=cat_feat_idx if cls_encoding == 'index' else None,
    )
    gc.collect()

        

  
    
# Driver: for every (method, task, dataset) combination, split the event log,
# persist the raw splits, then encode and persist each split once per encoding.
# The loop variables (cls_method, task_type, dataset_name, results_dir,
# dataset_manager, min/max_prefix_length, feature_combiner) are read as
# module-level state by the helper functions above.
for cls_method in cls_methods:
    gc.collect()

    for task_type in task_types:
        gc.collect()

        for dataset_name in dataset_ref_to_datasets:
            gc.collect()
            results_dir = "./../prepared_data/%s/%s/" % (task_type, dataset_name)
            dataset_manager = DatasetManager(dataset_name, task_type)
            data = dataset_manager.read_dataset()
            train, val, test = dataset_manager.split_data(
                data, train_ratio, val_ratio, split_type="temporal", seed=22
            )
            save_file_to_parquet(train, results_dir, "train", dataset_name)
            save_file_to_parquet(val, results_dir, "val", dataset_name)
            save_file_to_parquet(test, results_dir, "test", dataset_name)

            min_prefix_length = 1
            max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
            # The full log is no longer needed once the splits exist.
            del data

            cls_encoder_args = {
                'case_id_col': dataset_manager.case_id_col,
                'static_cat_cols': dataset_manager.static_cat_cols,
                'static_num_cols': dataset_manager.static_num_cols,
                'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                # Honour the module-level setting instead of a hard-coded True.
                'fillna': fillna,
                'model': cls_method,
                'dataset_name': dataset_name,
                "results_dir": results_dir,
            }

            for cls_encoding, methods in encoding_dict.items():
                gc.collect()
                print(f"Dataset: {dataset_name}, Encoding: {cls_encoding}, Method: {cls_method}, Task: {task_type}")

                # One encoder per method name, combined column-wise.
                feature_combiner = FeatureUnion(
                    [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]
                )

                # Same processing order as before: train, then test, then val.
                for split_name, split_frame in (("train", train), ("test", test), ("val", val)):
                    encoded_split = process_data(split_frame, split_name)
                    save_encoded_data(encoded_split, results_dir, dataset_name, cls_encoding, data_type=split_name)
                    del encoded_split
                    gc.collect()
                



            
        