# Notebook for testing hypotheses

## Load enriched case log

In [380]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score

In [381]:

# Enriched case logs, one CSV per region; the relative names are handed to pd.read_csv unchanged.
INPUT_FILE_NAME_BRANDENBURG = "cases_brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context.csv"
INPUT_FILE_NAME_BERLIN = "cases_berlin-preprocessed-gesetzgebung-2006-2020_with_context.csv"
INPUT_FILE_NAME_BAWUE = "cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context.csv"

# Inert string literal (never assigned, never executed): alternative "_passed_bills" inputs kept for reference.
'''
INPUT_FILE_NAME = 'cases_brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context_passed_bills.csv'
#INPUT_FILE_NAME = 'cases_berlin-preprocessed-gesetzgebung-2006-2020_with_context_passed_bills.csv'
INPUT_FILE_NAME_COMPARISON = 'cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context_passed_bills.csv'
'''

# Load every case log into its own DataFrame.
df_brandenburg = pd.read_csv(INPUT_FILE_NAME_BRANDENBURG)
df_berlin = pd.read_csv(INPUT_FILE_NAME_BERLIN)
df_baWue = pd.read_csv(INPUT_FILE_NAME_BAWUE)

def sanitize_column_names(df):
    """Replace every '(' and ')' in the column labels of ``df`` with '_'.

    Only parentheses are rewritten; spaces and all other characters are left
    as they are. The new labels are assigned on ``df`` itself (the frame is
    modified in place) and the very same DataFrame object is returned, so the
    call can be used on the right-hand side of an assignment.
    """
    cleaned_labels = df.columns.str.replace(r'[()]', '_', regex=True)
    df.columns = cleaned_labels
    return df

# Normalise the column labels of every case log (parentheses become underscores).
df_brandenburg = sanitize_column_names(df_brandenburg)
df_berlin = sanitize_column_names(df_berlin)
df_baWue = sanitize_column_names(df_baWue)

# Report how many rows each input file contributed.
for loaded_frame, source_file in (
    (df_brandenburg, INPUT_FILE_NAME_BRANDENBURG),
    (df_berlin, INPUT_FILE_NAME_BERLIN),
    (df_baWue, INPUT_FILE_NAME_BAWUE),
):
    print('Read', len(loaded_frame), 'rows from', source_file)

# Discard every column that holds no value in any row.
df_brandenburg = df_brandenburg.dropna(axis='columns', how='all')
df_berlin = df_berlin.dropna(axis='columns', how='all')
df_baWue = df_baWue.dropna(axis='columns', how='all')

Read 718 rows from cases_brandenburg-preprocessed-gesetzgebung-2006-2020_processed_with_context.csv
Read 731 rows from cases_berlin-preprocessed-gesetzgebung-2006-2020_with_context.csv
Read 1005 rows from cases_baden-württemberg-preprocessed-gesetzgebung-gesetz-2006-2020_processed_with_context.csv


In [382]:
# Mean of the 'duration' column in the BaWue log, with a +/-10 % band around it.
bawue_average_cycle_time = df_baWue["duration"].mean()
bawue_average_cycle_time_upper_bound = bawue_average_cycle_time + 0.1*bawue_average_cycle_time
bawue_average_cycle_time_lower_bound = bawue_average_cycle_time - 0.1*bawue_average_cycle_time
# Note: the value printed here is the upper bound of the band, not the plain mean.
print("BaWue average cycle time with threshold 10%:", bawue_average_cycle_time_upper_bound)


def outcome(row):
    """Return whether the row's 'duration' lies above the upper bound of the BaWue band."""
    return row['duration'] > bawue_average_cycle_time_upper_bound


# Column name -> dtype mapping; every listed column maps to `object`.
DATA_TYPES = dict.fromkeys(
    (
        'slower_than_bawue_avg_tolerance',
        'faster_than_bawue_avg_tolerance',
        'in_bawue_avg_tolerance',
    ),
    object,
)

BaWue average cycle time with threshold 10%: 67.05950248756218


## Evaluate an explanation against the enriched case log

### Set outcome and explanation

In [383]:
# Label every case of each log with the boolean outcome and print the class balance.
for headline, case_log in (
    ('Outcome distribution for Berlin:', df_berlin),
    ('Outcome distribution for Brandenburg:', df_brandenburg),
    ('Outcome distribution for BaWue:', df_baWue),
):
    case_log['outcome'] = case_log.apply(outcome, axis=1)
    print(headline)
    print(case_log['outcome'].value_counts())

# Ground-truth label series, one per log.
y_true_berlin = df_berlin['outcome']
y_true_brandenburg = df_brandenburg['outcome']
y_true_baWue = df_baWue['outcome']



Outcome distribution for Berlin:
outcome
True     483
False    248
Name: count, dtype: int64
Outcome distribution for Brandenburg:
outcome
True     451
False    267
Name: count, dtype: int64
Outcome distribution for BaWue:
outcome
False    719
True     286
Name: count, dtype: int64


In [384]:

##### Berlin


### from inducer
'''
if
([Gesetz- und Verordnungsblatt.count >= 1.0] ^ [case:staff >= 84028.2265625] ^ [case:pdf_word_count >= 44450.0]) v
([Gesetz- und Verordnungsblatt.count >= 1.0] ^ [event_count >= 7.0] ^ [case:pdf_bytes <= 171156.0] ^ [case:commDays >= 164.0] ^ [Änderungsantrag.count >= 1.0]) v
([Gesetz- und Verordnungsblatt.count >= 2.0] ^ [case:squire >= 0.2511136314821711]) v
([event_count >= 4.0] ^ [Gesetz- und Verordnungsblatt.count <= 0.0]) v
([Ausschussberatung.count >= 3.0]) v
([Gesetz- und Verordnungsblatt.count >= 1.0] ^ [case:plenDays >= 16.0] ^ [case:pdf_bytes <= 119670.0] ^ [case:pdf_word_count >= 1733.0]) v
([case:pdf_word_count >= 2107.0] ^ [case:yearly_arrival_rate >= 6.86] ^ [case:pdf_bytes >= 128785.0] ^ [case:pdf_bytes <= 278085.0]) v
([case:squire <= 0.269069859161224] ^ [case:DokTypLFirstDoc == Vorlage zur Beschlussfassung (Gesetzentwurf)] ^ [case:staff >= 105634.8828125] ^ [case:pdf_bytes <= 83146.0]) v
([Gesetz- und Verordnungsblatt.count >= 1.0] ^ [case:squire >= 0.269069859161224] ^ [case:author_first_activity_count <= 4.0] ^ [case:yearly_variants <= 24.0])
then
True
'''

# Each function encodes one conjunction of the induced Berlin rule quoted above.
# `row` is a single case, indexable by column name.

def rule1_berlin(row):
    """Conjunction 1 of the induced Berlin rule."""
    return (
        row["Gesetz- und Verordnungsblatt.count"] >= 1.0
        and row["case:staff"] >= 84028.2265625
        and row["case:pdf_word_count"] >= 44450.0
    )


def rule2_berlin(row):
    """Conjunction 2 of the induced Berlin rule."""
    return (
        row["Gesetz- und Verordnungsblatt.count"] >= 1.0
        and row["event_count"] >= 7.0
        and row["case:pdf_bytes"] <= 171156.0
        and row["case:commDays"] >= 164.0
        and row["Änderungsantrag.count"] >= 1.0
    )


def rule3_berlin(row):
    """Conjunction 3 of the induced Berlin rule."""
    return (
        row["Gesetz- und Verordnungsblatt.count"] >= 2.0
        and row["case:squire"] >= 0.2511136314821711
    )


def rule4_berlin(row):
    """Conjunction 4 of the induced Berlin rule."""
    return (
        row["event_count"] >= 4.0
        and row["Gesetz- und Verordnungsblatt.count"] <= 0.0
    )


def rule5_berlin(row):
    """Conjunction 5 of the induced Berlin rule (single condition)."""
    return row["Ausschussberatung.count"] >= 3.0


def rule6_berlin(row):
    """Conjunction 6 of the induced Berlin rule."""
    return (
        row["Gesetz- und Verordnungsblatt.count"] >= 1.0
        and row["case:plenDays"] >= 16.0
        and row["case:pdf_bytes"] <= 119670.0
        and row["case:pdf_word_count"] >= 1733.0
    )


def rule7_berlin(row):
    """Conjunction 7 of the induced Berlin rule."""
    return (
        row["case:pdf_word_count"] >= 2107.0
        and row["case:yearly_arrival_rate"] >= 6.86
        and row["case:pdf_bytes"] >= 128785.0
        and row["case:pdf_bytes"] <= 278085.0
    )


def rule8_berlin(row):
    """Conjunction 8 of the induced Berlin rule."""
    return (
        row["case:squire"] <= 0.269069859161224
        and row["case:DokTypLFirstDoc"] == "Vorlage zur Beschlussfassung (Gesetzentwurf)"
        and row["case:staff"] >= 105634.8828125
        and row["case:pdf_bytes"] <= 83146.0
    )


def rule9_berlin(row):
    """Conjunction 9 of the induced Berlin rule."""
    return (
        row["Gesetz- und Verordnungsblatt.count"] >= 1.0
        and row["case:squire"] >= 0.269069859161224
        and row["case:author_first_activity_count"] <= 4.0
        and row["case:yearly_variants"] <= 24.0
    )


def combined_rule_berlin(row):
    """Full induced Berlin rule: true as soon as any of the nine conjunctions holds."""
    return (
        rule1_berlin(row) or rule2_berlin(row) or rule3_berlin(row)
        or rule4_berlin(row) or rule5_berlin(row) or rule6_berlin(row)
        or rule7_berlin(row) or rule8_berlin(row) or rule9_berlin(row)
    )


##### BRANDENBURG

### from inducer

'''
if
([case:VSysL == ['Öffentlicher Haushalt', 'Landesregierung']] ^ [case:pdf_word_count >= 21053.0] ^ [event_count >= 16.0]) v
([event_count >= 7.0] ^ [2. Lesung.count >= 1.0]) v
([event_count >= 6.0] ^ [case:author_first_activity == Landesregierung]) v
([Sitzung.count >= 4.0] ^ [case:plenDays <= 19.0]) v
([case:salary <= 4730.81982421875] ^ [Gesetzentwurf.count <= 0.0] ^ [event_count <= 10.0] ^ [event_count >= 5.0]) v
([case:VSysL == ['Öffentlicher Haushalt', 'Landesregierung']] ^ [case:author_first_activity == Ausschuss für Wissenschaft, Forschung und Kultur]) v
([case:af_score >= -0.102261976147379] ^ [event_count >= 7.0]) v
([case:VSysL == ['Öffentlicher Haushalt', 'Parlament']]) v
([case:author_first_activity_sozialdemokratische partei deutschlands _spd_ >= 1.0] ^ [case:af_score <= -0.462958637408005] ^ [case:pdf_bytes >= 7702.0] ^ [Gesetzentwurf.count <= 1.0]) v
([case:author_first_activity == Landesregierung] ^ [case:pdf_bytes >= 1056500.0]) v
([case:VorgangsDeskriptoren == ['Bauvorhaben', 'Einzelplan 13 - Landesrechnungshof', 'Landeshaushalt', 'Landesrechnungshof', 'Öffentliche Ausgaben', 'Öffentliche Einnahmen', 'Öffentliche Finanzplanung', 'Öffentliche Mittel', 'Öffentlicher Haushalt', 'Personalkosten', 'Personalplanung']]) v
([case:VorgangsDeskriptoren == ['Biber', 'Einzelplan 10 - Ministerium für Ländliche Entwicklung, Umwelt und Landwirtschaft', 'EU-Mittel', 'Gemeinschaftsaufgabe Verbesserung der Agrarstruktur und des Küstenschutzes', 'Hochwasserschutz', 'Immissionsschutz', 'Internationale Naturausstellung Lieberoser Heide', 'Landesbetrieb Forst Brandenburg', 'Landeshaushalt', 'Nachhaltige Entwicklung', 'Naturschutz', 'Öffentliche Ausgaben', 'Öffentliche Einnahmen', 'Öffentliche Mittel', 'Öffentlicher Haushalt', 'Personalplanung', 'Schleuse', 'Tierhaltung', 'Wasser- und Bodenverband']]) v
([event_count >= 27.0] ^ [event_count <= 27.0])
then
True
'''
# Each function encodes one conjunction of the induced Brandenburg rule quoted above.
# `row` is a single case, indexable by column name. List-valued attributes are
# compared against the string form of the list.

def rule1_brandenburg(row):
    """Conjunction 1 of the induced Brandenburg rule."""
    return (
        row["case:VSysL"] == "['Öffentlicher Haushalt', 'Landesregierung']"
        and row["case:pdf_word_count"] >= 21053.0
        and row["event_count"] >= 16.0
    )


def rule2_brandenburg(row):
    """Conjunction 2 of the induced Brandenburg rule."""
    return (
        row["event_count"] >= 7.0
        and row["2. Lesung.count"] >= 1.0
    )


def rule3_brandenburg(row):
    """Conjunction 3 of the induced Brandenburg rule."""
    return (
        row["event_count"] >= 6.0
        and row["case:author_first_activity"] == "Landesregierung"
    )


def rule4_brandenburg(row):
    """Conjunction 4 of the induced Brandenburg rule."""
    return (
        row["Sitzung.count"] >= 4.0
        and row["case:plenDays"] <= 19.0
    )


def rule5_brandenburg(row):
    """Conjunction 5 of the induced Brandenburg rule."""
    return (
        row["case:salary"] <= 4730.81982421875
        and row["Gesetzentwurf.count"] <= 0.0
        and row["event_count"] <= 10.0
        and row["event_count"] >= 5.0
    )


def rule6_brandenburg(row):
    """Conjunction 6 of the induced Brandenburg rule."""
    return (
        row["case:VSysL"] == str(['Öffentlicher Haushalt', 'Landesregierung'])
        and row["case:author_first_activity"] == "Ausschuss für Wissenschaft, Forschung und Kultur"
    )


def rule7_brandenburg(row):
    """Conjunction 7 of the induced Brandenburg rule."""
    return (
        row["case:af_score"] >= -0.102261976147379
        and row["event_count"] >= 7.0
    )


def rule8_brandenburg(row):
    """Conjunction 8 of the induced Brandenburg rule (single condition)."""
    return row["case:VSysL"] == str(['Öffentlicher Haushalt', 'Parlament'])


def rule9_brandenburg(row):
    """Conjunction 9 of the induced Brandenburg rule."""
    return (
        row["case:author_first_activity_sozialdemokratische partei deutschlands _spd_"] >= 1.0
        and row["case:af_score"] <= -0.462958637408005
        and row["case:pdf_bytes"] >= 7702.0
        and row["Gesetzentwurf.count"] <= 1.0
    )


def rule10_brandenburg(row):
    """Conjunction 10 of the induced Brandenburg rule."""
    return (
        row["case:author_first_activity"] == "Landesregierung"
        and row["case:pdf_bytes"] >= 1056500.0
    )


def rule11_brandenburg(row):
    """Conjunction 11: exact match on the first descriptor list."""
    return row["case:VorgangsDeskriptoren"] == str([
        'Bauvorhaben', 'Einzelplan 13 - Landesrechnungshof',
        'Landeshaushalt', 'Landesrechnungshof', 'Öffentliche Ausgaben',
        'Öffentliche Einnahmen', 'Öffentliche Finanzplanung',
        'Öffentliche Mittel', 'Öffentlicher Haushalt', 'Personalkosten',
        'Personalplanung',
    ])


def rule12_brandenburg(row):
    """Conjunction 12: exact match on the second descriptor list."""
    return row["case:VorgangsDeskriptoren"] == str([
        'Biber', 'Einzelplan 10 - Ministerium für Ländliche Entwicklung, Umwelt und Landwirtschaft',
        'EU-Mittel', 'Gemeinschaftsaufgabe Verbesserung der Agrarstruktur und des Küstenschutzes',
        'Hochwasserschutz', 'Immissionsschutz', 'Internationale Naturausstellung Lieberoser Heide',
        'Landesbetrieb Forst Brandenburg', 'Landeshaushalt', 'Nachhaltige Entwicklung', 'Naturschutz',
        'Öffentliche Ausgaben', 'Öffentliche Einnahmen', 'Öffentliche Mittel', 'Öffentlicher Haushalt',
        'Personalplanung', 'Schleuse', 'Tierhaltung', 'Wasser- und Bodenverband',
    ])


def rule13_brandenburg(row):
    """Conjunction 13: both bounds are 27.0, so this holds only for event_count == 27.0."""
    return row["event_count"] >= 27.0 and row["event_count"] <= 27.0


def combined_rule_brandenburg(row):
    """Full induced Brandenburg rule: true as soon as any of the thirteen conjunctions holds."""
    return (
        rule1_brandenburg(row) or rule2_brandenburg(row) or rule3_brandenburg(row)
        or rule4_brandenburg(row) or rule5_brandenburg(row) or rule6_brandenburg(row)
        or rule7_brandenburg(row) or rule8_brandenburg(row) or rule9_brandenburg(row)
        or rule10_brandenburg(row) or rule11_brandenburg(row) or rule12_brandenburg(row)
        or rule13_brandenburg(row)
    )




In [385]:
##### Berlin

### with delay but without .*Gesetz.*
'''
if
([II. Lesung:Ausschussberatung.delay <= -45.0]) v
([Beschlussempfehlung:I. Lesung.delay <= -24.0] ^ [II. Lesung:I. Lesung.delay <= -49.0]) v
([event_count >= 8.0] ^ [Änderungsantrag.count <= 0.0] ^ [Ausschussberatung.count <= 2.0]) v
([event_count >= 7.0] ^ [Ausschussberatung.count <= 1.0] ^ [Änderungsantrag.count <= 0.0])
then
True
'''


# Each function encodes one conjunction of the Berlin delay rule quoted above.
# `row` is a single case, indexable by column name.

def dRule1_berlin(row):
    """Conjunction 1 of the Berlin delay rule (single condition)."""
    return row["II. Lesung:Ausschussberatung.delay"] <= -45.0


def dRule2_berlin(row):
    """Conjunction 2 of the Berlin delay rule."""
    return (
        row["Beschlussempfehlung:I. Lesung.delay"] <= -24.0
        and row["II. Lesung:I. Lesung.delay"] <= -49.0
    )


def dRule3_berlin(row):
    """Conjunction 3 of the Berlin delay rule."""
    return (
        row["event_count"] >= 8.0
        and row["Änderungsantrag.count"] <= 0.0
        and row["Ausschussberatung.count"] <= 2.0
    )


def dRule4_berlin(row):
    """Conjunction 4 of the Berlin delay rule."""
    return (
        row["event_count"] >= 7.0
        and row["Ausschussberatung.count"] <= 1.0
        and row["Änderungsantrag.count"] <= 0.0
    )


def dCombined_rule_berlin(row):
    """Full Berlin delay rule: true as soon as any of the four conjunctions holds."""
    return (
        dRule1_berlin(row) or dRule2_berlin(row)
        or dRule3_berlin(row) or dRule4_berlin(row)
    )



###### Brandenburg
'''
if
([case:author_first_activity == Landesregierung] ^ [case:pdf_bytes >= 1056500.0]) v
([event_count >= 5.0] ^ [Beschlussempfehlung und Bericht:Sitzung.delay <= -65.0]) v
([2. Lesung:1. Lesung.delay <= -27.0] ^ [Bekanntmachung.count >= 1.0]) v
([2. Lesung:1. Lesung.delay <= -42.0]) v
([Beschlussempfehlung und Bericht:1. Lesung.delay <= -22.0] ^ [event_count <= 6.0] ^ [2. Lesung:1. Lesung.delay >= -30.0]) v
([Beschlussempfehlung und Bericht:Sitzung.delay <= -63.0]) v
([2. Lesung:1. Lesung.delay <= -34.0] ^ [case:author_first_activity_count <= 2.0]) v
([case:VSysL == ['Abgaben', 'Erneuerbare Energien']]) v
([2. Lesung:Bekanntmachung.delay >= 13.0])
then
True
'''

# Each function encodes one conjunction of the Brandenburg delay rule quoted above.
# `row` is a single case, indexable by column name.

def dRule1_brandenburg(row):
    """Conjunction 1 of the Brandenburg delay rule."""
    return (
        row["case:author_first_activity"] == "Landesregierung"
        and row["case:pdf_bytes"] >= 1056500.0
    )


def dRule2_brandenburg(row):
    """Conjunction 2 of the Brandenburg delay rule."""
    return (
        row["event_count"] >= 5.0
        and row["Beschlussempfehlung und Bericht:Sitzung.delay"] <= -65.0
    )


def dRule3_brandenburg(row):
    """Conjunction 3 of the Brandenburg delay rule."""
    return (
        row["2. Lesung:1. Lesung.delay"] <= -27.0
        and row["Bekanntmachung.count"] >= 1.0
    )


def dRule4_brandenburg(row):
    """Conjunction 4 of the Brandenburg delay rule (single condition)."""
    return row["2. Lesung:1. Lesung.delay"] <= -42.0


def dRule5_brandenburg(row):
    """Conjunction 5 of the Brandenburg delay rule."""
    return (
        row["Beschlussempfehlung und Bericht:1. Lesung.delay"] <= -22.0
        and row["event_count"] <= 6.0
        and row["2. Lesung:1. Lesung.delay"] >= -30.0
    )


def dRule6_brandenburg(row):
    """Conjunction 6 of the Brandenburg delay rule (single condition)."""
    return row["Beschlussempfehlung und Bericht:Sitzung.delay"] <= -63.0


def dRule7_brandenburg(row):
    """Conjunction 7 of the Brandenburg delay rule."""
    return (
        row["2. Lesung:1. Lesung.delay"] <= -34.0
        and row["case:author_first_activity_count"] <= 2.0
    )


def dRule8_brandenburg(row):
    """Conjunction 8: exact match on the string form of the VSysL list."""
    return row["case:VSysL"] == str(['Abgaben', 'Erneuerbare Energien'])


def dRule9_brandenburg(row):
    """Conjunction 9 of the Brandenburg delay rule (single condition)."""
    return row["2. Lesung:Bekanntmachung.delay"] >= 13.0


def dCombined_rule_brandenburg(row):
    """Full Brandenburg delay rule: true as soon as any of the nine conjunctions holds."""
    return (
        dRule1_brandenburg(row) or dRule2_brandenburg(row) or dRule3_brandenburg(row)
        or dRule4_brandenburg(row) or dRule5_brandenburg(row) or dRule6_brandenburg(row)
        or dRule7_brandenburg(row) or dRule8_brandenburg(row) or dRule9_brandenburg(row)
    )

### Get scores

In [386]:
def test(df, y_true, rule):
    """Score a boolean explanation rule against the ground-truth outcome and print the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Case log. ``rule`` is applied to every row; the per-row result is
        stored in ``df['explanation']`` (the column is added or overwritten,
        so ``df`` is modified).
    y_true : pandas.Series
        Boolean ground-truth labels for the rows of ``df``.
    rule : callable
        Receives one row and returns the boolean prediction for it.

    Returns
    -------
    None
        Precision, recall, F1 and the four confusion-matrix cells (absolute
        count plus share of the respective true class) are printed.
    """
    df['explanation'] = df.apply(rule, axis=1)

    y_pred = df['explanation']

    # zero_division=1: precision is reported as 1 when the rule flags no case at all.
    precision = precision_score(y_true, y_pred, pos_label=True, zero_division=1)

    recall = recall_score(y_true, y_pred, pos_label=True)

    f1 = f1_score(y_true, y_pred, pos_label=True)

    print('Precision on case log:', round(precision, 3), ', Recall on case log:', round(recall, 3), ', f1 on case log:', round(f1, 3))

    # Pin the label order so the matrix is always 2x2. Without `labels`, a run in
    # which y_true and y_pred together contain only one class yields a 1x1 matrix
    # and the four-way unpacking below raises ValueError.
    nof_true_neg, nof_false_pos, nof_false_neg, nof_true_pos = confusion_matrix(
        y_true, y_pred, labels=[False, True]
    ).ravel()

    # Each count is followed by its share of the corresponding true class.
    #print("\nTEST RESULTS")
    print('True positives (case delayed and predicted correctly):', nof_true_pos, nof_true_pos/y_true.sum())
    print('False positives (case not delayed, but predicted as such):', nof_false_pos, nof_false_pos/(y_true == False).sum())
    print('True negatives (case not delayed and not predicted as such):', nof_true_neg, nof_true_neg/(y_true == False).sum())
    print('False negatives (case delayed, but not predicted as such):', nof_false_neg, nof_false_neg/y_true.sum())
    print("\n")

### Manually Derived Rules (from the induced rules)

In [387]:
### Hand-written rules, adapted from the induced rules

# Rule set 1: a high event count. This could show that many activities are a
# stronger driver of delay in Brandenburg than in Berlin.
# Motivated by the induced rule with the highest recall for Brandenburg when
# all temporal attributes are hidden. The same rule is used for every region.
rule_berlin_1 = rule_brandenburg_1 = rule_baWue_1 = lambda row: row["event_count"] >= 8

# Rule set 2: same idea as rule set 1, but tied to one specific activity per region.
# Motivated by induced rules with relatively high recall for Berlin and
# Brandenburg when temporal attributes are hidden.
rule_berlin_2 = lambda row: row["Ausschussberatung.count"] >= 3.0
rule_brandenburg_2 = lambda row: row["Sitzung.count"] >= 3.0
rule_baWue_2 = lambda row: row["Beschlussempfehlung und Bericht.count"] >= 3.0

# Rule set 3: what we expect to be a main driver of delay in Berlin.
# Motivated by the rule with the highest recall for Berlin when delay
# attributes are included and the Gesetz... attributes are hidden.
rule_berlin_3 = lambda row: row["Ausschussberatung:I. Lesung.delay"] <= -24.0
rule_brandenburg_3 = lambda row: row["Sitzung:1. Lesung.delay"] <= -24.0
rule_baWue_3 = lambda row: row["Beschlussempfehlung und Bericht:Erste Beratung.delay"] <= -24.0

# Rule set 4: shows that this is also a good indicator of delay in Brandenburg.
# Open question: in Berlin not much seems to happen in between, while in
# Brandenburg sessions (Sitzungen) take place.
# Motivated by the Brandenburg rules when delay attributes are allowed and the
# Gesetz... attributes are hidden.
rule_berlin_4 = lambda row: row["II. Lesung:I. Lesung.delay"] <= -30.0
rule_brandenburg_4 = lambda row: row["2. Lesung:1. Lesung.delay"] <= -30.0
rule_baWue_4 = lambda row: row["Zweite Beratung:Erste Beratung.delay"] <= -30.0

# Rule set 5: passed bills. This motivates a further comparison restricted to
# passed bills (after a quick logistic regression). The inducer produced no
# rule on this attribute, but the descriptive statistics showed a big
# difference; processes that simply stop after the first reading, for
# example, would not take much time. The same rule is used for every region.
rule_berlin_5 = rule_brandenburg_5 = rule_baWue_5 = lambda row: row["is_passed_bill"] == 1


rules_berlin = [rule_berlin_1, rule_berlin_2, rule_berlin_3, rule_berlin_4, rule_berlin_5]
rules_brandenburg = [rule_brandenburg_1, rule_brandenburg_2, rule_brandenburg_3, rule_brandenburg_4, rule_brandenburg_5]
rules_baWue = [rule_baWue_1, rule_baWue_2, rule_baWue_3, rule_baWue_4, rule_baWue_5]

# Score the i-th rule of every region side by side: Berlin, Brandenburg, BaWue.
for i, (berlin_rule, brandenburg_rule, baWue_rule) in enumerate(
        zip(rules_berlin, rules_brandenburg, rules_baWue)):
    print(f"-----------Testing rule set {i+1}--------------------")
    test(df_berlin, y_true_berlin, berlin_rule)
    test(df_brandenburg, y_true_brandenburg, brandenburg_rule)
    test(df_baWue, y_true_baWue, baWue_rule)



-----------Testing rule set 1--------------------
Precision on case log: 0.841 , Recall on case log: 0.373 , f1 on case log: 0.516
True positives (case delayed and predicted correctly): 180 0.37267080745341613
False positives (case not delayed, but predicted as such): 34 0.13709677419354838
True negatives (case not delayed and not predicted as such): 214 0.8629032258064516
False negatives (case delayed, but not predicted as such): 303 0.6273291925465838


Precision on case log: 0.826 , Recall on case log: 0.714 , f1 on case log: 0.766
True positives (case delayed and predicted correctly): 322 0.7139689578713969
False positives (case not delayed, but predicted as such): 68 0.2546816479400749
True negatives (case not delayed and not predicted as such): 199 0.7453183520599251
False negatives (case delayed, but not predicted as such): 129 0.2860310421286031


Precision on case log: 0.56 , Recall on case log: 0.486 , f1 on case log: 0.521
True positives (case delayed and predicted correctly

In [388]:
# Second family of hand-written rules. The rule_* and rules_* names bound here
# replace the ones bound in the previous cell.
# NOTE(review): the per-region thresholds in rule sets 1-3 look like per-region
# summary statistics of the respective attribute -- confirm their origin.

# Rule set 1: 'case:squire' at or below a region-specific threshold.
rule_berlin_1 = lambda row: row["case:squire"] <= 0.273447
rule_brandenburg_1 = lambda row: row["case:squire"] <= 0.363853
rule_baWue_1 = lambda row: row["case:squire"] <= 0.401770

# Rule set 2: 'case:staff' at or below a region-specific threshold.
rule_berlin_2 = lambda row: row["case:staff"] <= 92478
rule_brandenburg_2 = lambda row: row["case:staff"] <= 80399
rule_baWue_2 = lambda row: row["case:staff"] <= 108111

# Rule set 3: 'case:plenDays' at or below a region-specific threshold.
rule_berlin_3 = lambda row: row["case:plenDays"] <= 15.93
rule_brandenburg_3 = lambda row: row["case:plenDays"] <= 19.53
rule_baWue_3 = lambda row: row["case:plenDays"] <= 24.18

# Rule set 4: delay between the second and the first reading of at most -30.
rule_berlin_4 = lambda row: row["II. Lesung:I. Lesung.delay"] <= -30.0
rule_brandenburg_4 = lambda row: row["2. Lesung:1. Lesung.delay"] <= -30.0
rule_baWue_4 = lambda row: row["Zweite Beratung:Erste Beratung.delay"] <= -30.0

# Rule set 5: passed bills; the same rule is used for every region.
rule_berlin_5 = rule_brandenburg_5 = rule_baWue_5 = lambda row: row["is_passed_bill"] == 1

rules_berlin = [rule_berlin_1, rule_berlin_2, rule_berlin_3, rule_berlin_4, rule_berlin_5]
rules_brandenburg = [rule_brandenburg_1, rule_brandenburg_2, rule_brandenburg_3, rule_brandenburg_4, rule_brandenburg_5]
rules_baWue = [rule_baWue_1, rule_baWue_2, rule_baWue_3, rule_baWue_4, rule_baWue_5]

# Score the i-th rule of every region side by side: Berlin, Brandenburg, BaWue.
for i, (berlin_rule, brandenburg_rule, baWue_rule) in enumerate(
        zip(rules_berlin, rules_brandenburg, rules_baWue)):
    print(f"-----------Testing rule set {i+1}--------------------")
    test(df_berlin, y_true_berlin, berlin_rule)
    test(df_brandenburg, y_true_brandenburg, brandenburg_rule)
    test(df_baWue, y_true_baWue, baWue_rule)




-----------Testing rule set 1--------------------
Precision on case log: 0.608 , Recall on case log: 0.36 , f1 on case log: 0.453
True positives (case delayed and predicted correctly): 174 0.36024844720496896
False positives (case not delayed, but predicted as such): 112 0.45161290322580644
True negatives (case not delayed and not predicted as such): 136 0.5483870967741935
False negatives (case delayed, but not predicted as such): 309 0.639751552795031


Precision on case log: 0.638 , Recall on case log: 0.568 , f1 on case log: 0.601
True positives (case delayed and predicted correctly): 256 0.5676274944567627
False positives (case not delayed, but predicted as such): 145 0.5430711610486891
True negatives (case not delayed and not predicted as such): 122 0.45692883895131087
False negatives (case delayed, but not predicted as such): 195 0.43237250554323725


Precision on case log: 0.233 , Recall on case log: 0.486 , f1 on case log: 0.315
True positives (case delayed and predicted correc