# Experiment pipeline v2

## Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from ylearn import Why
import networkx as nx
from CI_Experiments.config import PROJECT_DIR
import time as t
from func_timeout import func_timeout, FunctionTimedOut

2023-04-20 16:49:38,684 - /Users/lukas/opt/anaconda3/envs/ci_experiments/lib/python3.8/site-packages/castle/algorithms/__init__.py[line:36] - INFO: You are using ``pytorch`` as the backend.


## Parameters

In [2]:
# Name of the dataset sub-directory under DATASETS_PATH.
dataset = 'BPIC_17'
# Sub-directory names of the available encodings of the dataset.
# Only `target_encoded_activities` is referenced by the visible experiment
# cells (it selects the training file); the other three are defined for
# switching the encoding by hand.
one_hot_encoded_activities = 'activities'
one_hot_encoded_activities_numeric = 'activities_numeric'
target_encoded_activities = 'activities2'
target_encoded_activities_numeric = 'activities_numeric2'

## Definitions

### Constants

In [3]:
# NOTE(review): this list is not referenced anywhere in the visible code, so
# 'GES' and 'ANMNonlinear' are excluded only by being absent from ALGORITHMS.
not_used_algorithms = ['ANMNonlinear', 'GES']
# Upper bound, in seconds, passed to func_timeout for a single discovery run.
TIMEOUT_SEC = 1800 
# Input directory of the prepared logs and output directory for per-run CSVs.
DATASETS_PATH = PROJECT_DIR / 'data/prepared_process_logs'
RESULTS_PATH = PROJECT_DIR / 'results/activities_discovery_results'
TRAIN_FILE_NAME = 'train.csv'
TEST_FILE_NAME = 'test.csv'
# Name of the outcome column handed to Why.identify.
OUTCOME = 'Outcome'
# Learner names per identifier backend. The 'gcastle' and 'pgm' lists are
# iterated by discover_gcastle / discover_pgm; the 'ylearn' entry is not read
# by the visible code (discover_ylearn hard-codes 'notears').
ALGORITHMS = {
    'ylearn': ['notears'], 
    'gcastle': [ 
        'TTPM', 
        'DirectLiNGAM', 
        'ICALiNGAM', 
        'PC', 
        'Notears', 
        'DAG_GNN',
        'NotearsLowRank', 
        'RL', 
        'CORL', 
        'GraNDAG', 
        'NotearsNonlinear', 
        'GOLEM', 
        'MCSL', 
        'GAE'
    ],
    # NOTE(review): several of these names look like base classes, scores or
    # parameter estimators rather than structure learners - presumably copied
    # from the estimator package's export list; verify which ones the 'pgm'
    # identifier actually accepts.
    'pgm': [
        'BaseEstimator',
        'ParameterEstimator',
        'MaximumLikelihoodEstimator',
        'BayesianEstimator',
        'StructureEstimator',
        'ExhaustiveSearch',
        'HillClimbSearch',
        'TreeSearch',
        'StructureScore',
        'K2Score',
        'BDeuScore',
        'BDsScore',
        'BicScore',
        'AICScore',
        'ScoreCache',
        'SEMEstimator',
        'IVEstimator',
        'MmhcEstimator',
        'PC',
        'ExpectationMaximization'
    ]
}

### Classes

In [4]:
class DiscoverResult:
    """Result of a single causal-discovery run.

    Parameters
    ----------
    identifier_name : str
        Label of the run; also used as the CSV file stem in ``log``.
    r_tuple : sequence
        Four-element sequence unpacked positionally as
        ``(treatment, adjustment, covariate, instrument)``.
    duration : float
        Elapsed time of the identification step.
    causal_graph : object or None
        Object exposing the discovered graph through a ``dag`` attribute,
        or ``None`` when no graph could be obtained.
    """

    def __init__(self, identifier_name, r_tuple, duration, causal_graph):
        self.identifier_name = identifier_name
        self.treatment = r_tuple[0]
        self.adjustment = r_tuple[1]
        self.covariate = r_tuple[2]
        self.instrument = r_tuple[3]
        self.duration = duration
        self.causal_graph = causal_graph

    def to_df(self):
        """Return the result as a one-row ``pandas.DataFrame``.

        Each identified role is exported under its own column name. The
        ``causal_graph`` column holds the adjacency matrix of the graph as a
        DataFrame, or ``None`` when no graph is available.
        """
        data = {
            'identifier_name': [self.identifier_name],
            'treatment': [self.treatment],
            # Each role maps to its own attribute; exporting the covariates
            # as 'adjustment' and the graph object as 'instrument' would
            # silently lose the real adjustment and instrument sets.
            'adjustment': [self.adjustment],
            'covariate': [self.covariate],
            'instrument': [self.instrument],
            'duration': [self.duration],
        }
        if self.causal_graph is None:
            data['causal_graph'] = [None]
        else:
            # Current networkx exposes the adjacency export as
            # to_pandas_adjacency; to_pandas_dataframe only exists in old
            # releases, so it is used purely as a fallback.
            to_adjacency = getattr(nx, 'to_pandas_adjacency', None)
            if to_adjacency is None:
                to_adjacency = nx.to_pandas_dataframe
            data['causal_graph'] = [to_adjacency(self.causal_graph.dag)]
        return pd.DataFrame(data=data)

    def log(self, path):
        """Write ``to_df()`` to ``path / '<identifier_name>.csv'``.

        ``path`` must support the ``/`` operator (e.g. ``pathlib.Path``).
        """
        df = self.to_df()
        df.to_csv(path / f'{self.identifier_name}.csv')
        

In [5]:
class DiscoverParams:
    """Mutable bundle of settings for one discovery run.

    Holds the input ``data``, the ``discrete`` flag, the ``identifier`` name
    with its ``identifier_options`` and the ``random_state``. The driver
    functions overwrite ``identifier`` / ``identifier_options`` in place
    before each run.
    """

    def __init__(self, data, discrete, identifier=None, identifier_options=None, random_state=0):
        self.data, self.discrete = data, discrete
        self.identifier, self.identifier_options = identifier, identifier_options
        self.random_state = random_state

    def log(self, msg):
        """Print a separator, ``msg`` and the current identifier settings."""
        report_lines = [
            '**************************',
            f'{msg}',
            f'identifier: {self.identifier}',
            f'identifier_options: {self.identifier_options}',
        ]
        print('\n'.join(report_lines))

### Functions

In [6]:
def _create_identifier_name(discover_params):
    if discover_params.identifier_options is not None:
        learner_name = discover_params.identifier_options.get('learner')
        return f'{discover_params.identifier}_{learner_name}'
    else:
        return str(discover_params.identifier)

def _discover(discover_params):
    """Run ``discover`` under a ``TIMEOUT_SEC`` time limit.

    Returns the ``discover`` result, or ``None`` after printing a notice when
    ``func_timeout`` raises ``FunctionTimedOut``. Any other exception raised
    by the run propagates to the caller.
    """
    try:
        # args must be a tuple: func_timeout unpacks it into discover(*args).
        return func_timeout(TIMEOUT_SEC, discover, args=(discover_params,))
    except FunctionTimedOut:
        run_name = _create_identifier_name(discover_params)
        print(f"{run_name} identify could not complete within {TIMEOUT_SEC} seconds and was terminated.\n")
    return None
    
def discover(discover_params):
    """Identify causal roles for ``OUTCOME`` and persist the result.

    Builds a ``Why`` instance from the identifier settings in
    ``discover_params``, runs ``identify`` on a copy of the data while timing
    it, tries to fetch the causal graph (failures are printed and leave the
    graph as ``None``), writes the result CSV into ``RESULTS_PATH`` and
    returns the ``DiscoverResult``.
    """
    engine = Why(
        identifier=discover_params.identifier,
        identifier_options=discover_params.identifier_options,
        random_state=discover_params.random_state,
    )
    # Work on a copy so the caller's frame is never handed to the identifier.
    frame = discover_params.data.copy()

    # Only the identify call itself is timed.
    started = t.time()
    identified = engine.identify(frame, OUTCOME)
    elapsed = t.time() - started

    try:
        graph = engine.causal_graph()
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as error:
        # Best effort: a missing graph must not discard the identified roles.
        print("Unexpected error:", error)
        graph = None

    discover_params.log('Identify finished')
    print(f'Duration: {elapsed}')

    result = DiscoverResult(
        identifier_name=_create_identifier_name(discover_params),
        r_tuple=identified,
        duration=elapsed,
        causal_graph=graph,
    )
    result.log(RESULTS_PATH)
    return result

In [7]:
def discover_ylearn(discover_params):
    """Run one time-limited discovery with the ``'notears'`` identifier.

    Overwrites ``discover_params.identifier`` in place; the existing
    ``identifier_options`` are left untouched. Returns whatever ``_discover``
    returns (a result, or ``None`` on timeout).
    """
    params = discover_params
    params.identifier = 'notears'
    outcome = _discover(params)
    return outcome

In [8]:
def discover_gcastle(discover_params):
    """Run every learner in ``ALGORITHMS['gcastle']`` and collect the results.

    ``discover_params`` is mutated in place: the identifier is set to
    ``'gcastle'`` and ``identifier_options`` is replaced for each learner.
    A learner that times out (``None`` result) or raises is skipped; errors
    are printed and the loop continues with the next learner.
    """
    learner_names = ALGORITHMS['gcastle']
    discover_params.identifier = 'gcastle'
    collected = []
    for learner_name in learner_names:
        try:
            discover_params.identifier_options = {'learner': learner_name}
            outcome = _discover(discover_params)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as error:
            print("Unexpected error:", error)
            continue
        if outcome is not None:
            collected.append(outcome)
    return collected

In [9]:
def _create_pc_identifier_options(discover_params):
    ci_test = 'chi_square'
    max_cond_vars = 50
    if not discover_params.discrete:
        ci_test = 'pearsonr'
    identifier_options = {
        'learner': 'PC',
        'ci_test': ci_test,
        'max_cond_vars': max_cond_vars
    }
    discover_params.identifier_options = identifier_options
    
    
def _set_identifier_options(discover_params, learner):
    if learner == 'PC':
        _create_pc_identifier_options(discover_params)
    else:
        discover_params.identifier_options = {'learner': learner}
    
    
def discover_pgm(discover_params):
    """Run every learner in ``ALGORITHMS['pgm']`` and collect the results.

    ``discover_params`` is mutated in place: the identifier is set to
    ``'pgm'`` and the options are rebuilt per learner through
    ``_set_identifier_options``. A learner that times out (``None`` result)
    or raises is skipped; errors are printed and the loop continues.
    """
    learner_names = ALGORITHMS['pgm']
    collected = []
    discover_params.identifier = 'pgm'
    for learner_name in learner_names:
        try:
            _set_identifier_options(discover_params, learner_name)
            outcome = _discover(discover_params)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as error:
            print("Unexpected error:", error)
            continue
        if outcome is not None:
            collected.append(outcome)
    return collected

In [10]:
def discover_pipeline(data, discrete=True, random_state=0):
    """Run the gcastle and pgm discovery stages on ``data``.

    One ``DiscoverParams`` instance is shared by both stages, each of which
    overwrites its identifier settings. Returns a single list with the
    gcastle results followed by the pgm results.
    """
    params = DiscoverParams(data=data, discrete=discrete, random_state=random_state)
    # NOTE: the ylearn stage (discover_ylearn) is currently not part of the run.
    gcastle_found = discover_gcastle(params)
    pgm_found = discover_pgm(params)
    return [*gcastle_found, *pgm_found]

## Experiment

In [11]:
# Training split of the selected dataset in its target-encoded variant.
TRAIN_DATA_PATH = DATASETS_PATH / dataset / target_encoded_activities / TRAIN_FILE_NAME
# Bare expression: lets the notebook display the resolved path.
TRAIN_DATA_PATH

PosixPath('/Users/lukas/Desktop/CI_Experiments/CI_Experiments/data/prepared_process_logs/BPIC_17/activities2/train.csv')

In [12]:
# Load the training split; the CSV's unnamed first column is kept as a
# regular column ('Unnamed: 0' in the preview below).
# NOTE(review): that column looks like a saved row index and is passed on to
# the discovery algorithms as a feature - confirm this is intended.
data = pd.read_csv(TRAIN_DATA_PATH)
# Bare expression: lets the notebook display the first rows.
data.head()

Unnamed: 0,Outcome,Activity_A_Accepted,Activity_A_Cancelled,Activity_A_Complete,Activity_A_Concept,Activity_A_Create Application,Activity_A_Denied,Activity_A_Incomplete,Activity_A_Pending,Activity_A_Submitted,...,Activity_O_Sent (mail and online),Activity_O_Sent (online only),Activity_W_Assess potential fraud,Activity_W_Call after offers,Activity_W_Call incomplete files,Activity_W_Complete application,Activity_W_Handle leads,Activity_W_Personal Loan collection,Activity_W_Shortened completion,Activity_W_Validate application
0,0.0,0.0,0.0,0.0,0.0,0.400061,0.0,0.400061,0.0,0.0,...,0.0,0.0,0.0,0.0,0.409135,0.0,0.0,0.0,0.0,0.400061
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.576641,0.0,0.0,...,0.0,0.0,0.0,0.0,0.742512,0.0,0.0,0.0,0.0,0.576641
2,0.0,0.566506,0.0,0.394822,0.536738,0.536738,0.0,0.576641,0.568225,0.315888,...,0.03513,0.315888,0.0,0.400061,0.576641,0.576641,0.536738,0.0,0.0,0.568225
3,1.0,0.400061,0.566506,0.566506,0.394822,0.566506,0.0,0.0,0.0,0.0,...,0.400061,0.0,0.0,0.742512,0.40722,0.400061,0.0,0.0,0.0,0.0
4,0.0,0.400061,0.566506,0.568225,0.566506,0.566506,0.0,0.0,0.394822,0.315888,...,0.576641,0.0,0.0,0.566506,0.40722,0.566506,0.400061,0.0,0.0,0.0


In [13]:
# Seed forwarded to every Why instance created by the pipeline.
RANDOM_STATE = 23
# discrete=False makes the pgm 'PC' learner use the 'pearsonr' CI test.
results = discover_pipeline(data, discrete=False, random_state=RANDOM_STATE)

Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be an iterable, not DiscoverParams
Unexpected error: discover() argument after * must be 

In [None]:
# Convert every collected DiscoverResult into its one-row DataFrame.
dfs = []
for result in results:
    df = result.to_df()
    dfs.append(df)

In [None]:
# Stack the per-run rows into one table. pd.concat raises ValueError on an
# empty list, which is what happens when no discovery run produced a result,
# so fall back to an empty frame instead of crashing the cell.
final_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

In [None]:
# Write the combined table next to the project's other outputs. The location
# is derived from PROJECT_DIR rather than a user-specific absolute path so the
# notebook also runs on other machines.
# NOTE(review): the file name spells 'activites2' (sic); it is kept unchanged
# so existing consumers of the file keep working.
path = PROJECT_DIR / 'other' / 'activites2_results.csv'
final_df.to_csv(path)