# Notebook for Finding Process Outcome Explanations

## Main Configuration

In [42]:
import pandas as pd
import numpy as np
import re

from rulelearn.algorithms.ripper import RipperExplainer
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score
from dtree_helper import get_rules

In [43]:
STRING_IMPUTE_VAL = 'missing'
FLOAT_IMPUTE_VAL = np.finfo(np.float32).min  # Imputation value for floats
TRAIN_TEST_SPLIT = 33  # Percentage of data to use for testing

BINA, ALGO =  'NATIVE', 'RIPPER'   # alternative: 'LENC', 'CART:4'  
# for other options, see: https://github.com/hvoelzer/rulebenchmarking

In [44]:

INPUT_FILE_NAME_BERLIN = 'cases_berlin-preprocessed-gesetzgebung-2006-2020_with_context.csv'
INPUT_FILE_NAME_BRANDENBURG = 'cases_brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context.csv'
INPUT_FILE_NAME_BAWUE = 'cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context.csv'
'''

INPUT_FILE_NAME_BERLIN = 'cases_berlin-preprocessed-gesetzgebung-2006-2020_with_context_passed_bills.csv'
INPUT_FILE_NAME_BRANDENBURG = 'cases_brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context_passed_bills.csv'
INPUT_FILE_NAME_BAWUE = 'cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context_passed_bills.csv'
'''

def read_and_sanitize(file_name):
    df = pd.read_csv(file_name, dtype={'is_passed_bill': object})
    print('Read', len(df), 'rows from', file_name)
    df.columns = df.columns.str.replace(r'[()]', '_', regex=True)
    df = df.dropna(axis=1, how='all')  # drop all completely empty columns
    return df

df_berlin = read_and_sanitize(INPUT_FILE_NAME_BERLIN)
df_brandenburg = read_and_sanitize(INPUT_FILE_NAME_BRANDENBURG)
df_baWue = read_and_sanitize(INPUT_FILE_NAME_BAWUE)

bawue_average_cycle_time = df_baWue["duration"].mean()
bawue_average_cycle_time_upper_bound = bawue_average_cycle_time + 0.1*bawue_average_cycle_time
bawue_average_cycle_time_lower_bound = bawue_average_cycle_time - 0.1*bawue_average_cycle_time
print(bawue_average_cycle_time_upper_bound)

print('Berlin:', len(df_berlin), 'Brandenburg:', len(df_brandenburg), 'BaWue:', len(df_baWue))
print('Mean duration values:')
print('Berlin:', df_berlin['duration'].mean())
print('Brandenburg:', df_brandenburg['duration'].mean())
print('BaWue:', df_baWue['duration'].mean())

outcome = lambda row: row['duration'] > bawue_average_cycle_time_upper_bound

Read 731 rows from cases_berlin-preprocessed-gesetzgebung-2006-2020_with_context.csv
Read 718 rows from cases_brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context.csv
Read 1005 rows from cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context.csv
67.05950248756218
Berlin: 731 Brandenburg: 718 BaWue: 1005
Mean duration values:
Berlin: 176.33789329685362
Brandenburg: 119.7883008356546
BaWue: 60.96318407960199


In [45]:
# Prep data: cast all (remaining) Boolean columns as object and all Integer columns as float
for df in [df_berlin, df_brandenburg, df_baWue]:
    for column in df.select_dtypes(include=['bool']).columns:
        print(column, 'bool->object')
        df[column] = df[column].astype('object')
    for column in df.select_dtypes(include=['int64']).columns:
        print(column, 'int->float')
        df[column] = df[column].astype('float64')
    df.info(verbose=True)

case_id int->float
event_count int->float
start_time_rel int->float
duration int->float
case:start_month int->float
case:start_weekday int->float
case:WIP_during_start int->float
case:pdf_bytes int->float
case:pdf_word_count int->float
case:plenDays int->float
case:is_election_year int->float
case:yearly_frequency int->float
case:yearly_variants int->float
case:author_first_activity_bündnis 90/die grünen _grüne_ int->float
case:author_first_activity_nan int->float
case:author_first_activity_ausschuss int->float
case:author_first_activity_sozialdemokratische partei deutschlands _spd_ int->float
case:author_first_activity_partei des demokratischen sozialismus _die linkspartei.pds_ int->float
case:author_first_activity_freie demokratische partei _fdp_ int->float
case:author_first_activity_christlich demokratische union deutschlands _cdu_ int->float
case:author_first_activity_die linke fraktion berlin _die linke_ int->float
case:author_first_activity_piraten _piraten_ int->float
case:autho

In [46]:
# Prep data: Imputation whenever needed
for df in [df_berlin, df_brandenburg, df_baWue]:
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].fillna(STRING_IMPUTE_VAL)  # normalize missing string values
    if ALGO in ['CART:4'] or BINA in ['TREES']:
        for column in df.select_dtypes(include=['float64']).columns:
            df[column] = df[column].fillna(FLOAT_IMPUTE_VAL)

## Configuration of the Explanation Problem

In [47]:
POS_CLASS = True
df_berlin['outcome'] = df_berlin.apply(outcome, axis=1)
df_brandenburg['outcome'] = df_brandenburg.apply(outcome, axis=1)
df_baWue['outcome'] = df_baWue.apply(outcome, axis=1)

### Attribute Hiding

In [48]:
# Use attribute names or regular expressions to hide
HIDE = [
    
    # hide what we want to explain
    'duration',
    r'.*cycle.*',
    
    # hide meaningless columns
    'case_id',
    r'.*none.*',
    r'.*nan.*',
    

        
    # temporal columns
    'start_time_rel',
    'start_time', 
    r'.*start.*', 
    #r'.*delay.*',
    
    r'.*Geset.*',

    # experiments
    #r'.*count.*',

    #"is_passed_bill",
    
    #r'.*:[0-9].*',

    ]

for df in [df_berlin, df_brandenburg, df_baWue]:
    for pattern in HIDE:
        for col in df.columns:
            if re.fullmatch(pattern, col):
                # print('Dropping {} on pattern "{}".'.format(col, pattern))
                df.drop(columns=[col], inplace=True)


## Run Rule Induction

In [49]:
stateParliaments = ['Berlin', 'Brandenburg', 'BaWue']
for index, df in enumerate([df_berlin, df_brandenburg, df_baWue]):    
    print(f"\nHandling dataset: {stateParliaments[index]}\n")
    
    # Prep data: Split into training and test set
    if TRAIN_TEST_SPLIT > 0.0:
        x_train, x_test, y_train, y_test = train_test_split(
            df.drop(columns=['outcome']),
            df['outcome'],
            test_size=TRAIN_TEST_SPLIT,
            random_state=3) 
    else:
        x_train = df.drop(columns=['outcome'])
        y_train = df['outcome']
        x_test = x_train
        y_test = y_train
    
    
    # Run Binarizer / Encoding
    if BINA == 'LENC':
        x_train_bin = x_train
        x_test_bin = x_test
        categorical_features = x_train_bin.select_dtypes(include=['object']).columns
        for col in categorical_features:
            label_encoder = LabelEncoder()
            label_encoder = label_encoder.fit(df[col])
            x_train_bin[col] = label_encoder.transform(x_train_bin[col])
            x_test_bin[col] = label_encoder.transform(x_test_bin[col])
            print(col, label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
    elif BINA == 'NATIVE':
        x_train_bin = x_train.copy() # RIPPER implementation messes with the training set
        x_test_bin = x_test
        
    if ALGO == 'RIPPER':
        estimator = RipperExplainer()
        estimator.fit(x_train_bin, y_train, target_label=POS_CLASS)
    else:
        print('Not supported.')
        
    
    # Evaluation
    y_predicted = estimator.predict(x_test_bin)
    # print('Accuracy:', round(accuracy_score(y_test, y_predicted), 2))
    print('Predictive:')
    print('Precision:', round(precision_score(y_test, y_predicted, pos_label=POS_CLASS, zero_division=1), 6))
    print('Recall:', round(recall_score(y_test, y_predicted, pos_label=POS_CLASS), 6))
    y_predicted_train = estimator.predict(x_train_bin)
    print('Descriptive:')
    print('Precision:', round(precision_score(y_train, y_predicted_train, pos_label=POS_CLASS, zero_division=1), 6))
    print('Recall:', round(recall_score(y_train, y_predicted_train, pos_label=POS_CLASS), 6))
    
    

    # Model export
    print("bawue_average_cycle_time_upper_bound:", bawue_average_cycle_time_upper_bound)
    if ALGO in ('RIPPER'):
        rule_set = estimator.explain()
        print(rule_set)
        
            
    


Handling dataset: Berlin

Predictive:
Precision: 0.952381
Recall: 0.769231
Descriptive:
Precision: 0.988235
Recall: 0.919037
bawue_average_cycle_time_upper_bound: 67.05950248756218
if
([II. Lesung:Ausschussberatung.delay <= -45.0]) v
([Beschlussempfehlung:I. Lesung.delay <= -24.0] ^ [II. Lesung:I. Lesung.delay <= -49.0]) v
([event_count >= 8.0] ^ [Änderungsantrag.count <= 0.0] ^ [Ausschussberatung.count <= 2.0]) v
([event_count >= 7.0] ^ [Ausschussberatung.count <= 1.0] ^ [Änderungsantrag.count <= 0.0])
then
True

Handling dataset: Brandenburg

Predictive:
Precision: 1.0
Recall: 0.952381
Descriptive:
Precision: 0.961538
Recall: 0.988372
bawue_average_cycle_time_upper_bound: 67.05950248756218
if
([case:author_first_activity == Landesregierung] ^ [case:pdf_bytes >= 1056500.0]) v
([event_count >= 5.0] ^ [Beschlussempfehlung und Bericht:Sitzung.delay <= -65.0]) v
([2. Lesung:1. Lesung.delay <= -27.0] ^ [Bekanntmachung.count >= 1.0]) v
([2. Lesung:1. Lesung.delay <= -42.0]) v
([Beschlussem