In [1]:
import pandas as pd 
import re, os, glob
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

In [2]:
# Keywords looked up in the verdict section. The dismissal/refusal alternatives
# (`wijst.*?af`, `weiger`, ...) never make the result True, but they are kept in
# the pattern on purpose: `re.findall` returns non-overlapping matches, so a
# phrase such as "wijst de ontruiming af" is consumed as ONE dismissal match and
# the eviction keyword inside it is not counted separately.
_VERDICT_KEYWORDS = re.compile(
    'ontbonden|ontbinding|ontbind|ontruimen|ontruiming|verlat|wijst.*?af|weiger|niet rechtvaardigt|afgewezen',
    re.IGNORECASE | re.DOTALL,
)

# Matches that signal an eviction order (ontruimen / ontruiming / verlaten).
_EVICTION_KEYWORDS = frozenset({'ontruimen', 'ontruiming', 'verlat'})

# Size of the tail that is searched when no "beslissing" heading is present.
_FALLBACK_TAIL_CHARS = 2500


def extract_uitspraak(uitspraak):
    """Return True when the verdict part of a ruling contains an eviction keyword.

    The verdict part is everything from the LAST case-insensitive occurrence of
    "beslissing" to the end of the text; if that word does not occur at all,
    the last 2500 characters are used instead. The verdict part is lowercased
    and scanned for the keywords in ``_VERDICT_KEYWORDS``.

    The original notes on this function listed finer-grained outcome codes
    (obon = ontbinding & ontruiming, on = ontruiming, af = afwijzing,
    ob = ontbinding, von = voorwaardelijke ontruiming). Only a boolean is
    produced here: a termination keyword on its own ("ontbonden",
    "ontbinding", "ontbind") does not yield True.

    Args:
        uitspraak: text of the ruling (the notebook passes the
            ``extracted_text`` column). Any non-string value, e.g. ``None`` or
            the NaN that pandas produces for an empty CSV field, is treated as
            "no eviction found" instead of raising ``TypeError``.

    Returns:
        bool: True if at least one eviction keyword was matched in the verdict
        part, False otherwise.
    """
    if not isinstance(uitspraak, str):
        return False

    # Remember where the last "beslissing" starts; earlier occurrences (for
    # instance in the body of the ruling) are overwritten by later ones.
    verdict_start = None
    for heading in re.finditer(r'beslissing', uitspraak, re.IGNORECASE):
        verdict_start = heading.start()

    if verdict_start is not None:
        verdict = uitspraak[verdict_start:].lower()
    else:
        verdict = uitspraak[-_FALLBACK_TAIL_CHARS:].lower()

    # The pattern has no capture groups, so findall yields the matched strings.
    hits = _VERDICT_KEYWORDS.findall(verdict)
    return any(hit in _EVICTION_KEYWORDS for hit in hits)

In [3]:
def test_model(Xtrain, Ytrain, Xtest, Ytest):
    """Fit the character n-gram SVM on the training data and report test performance.

    A pipeline of a binary character n-gram ``CountVectorizer`` (wrapped in a
    single-member ``FeatureUnion``) followed by ``LinearSVC(C=0.001)`` is fitted
    on ``Xtrain``/``Ytrain``. Its predictions for ``Xtest`` are then passed to
    ``evaluate`` together with ``Ytest``.

    Args:
        Xtrain: training documents handed to the vectorizer.
        Ytrain: labels for ``Xtrain``.
        Xtest: documents to predict.
        Ytest: reference labels for ``Xtest``.

    Returns:
        None. The metrics are printed.
    """
    # Presence/absence of character 1- to 7-grams, capped at 2000 features;
    # n-grams occurring in more than 90% of the documents are dropped.
    char_ngrams = CountVectorizer(
        analyzer='char',
        ngram_range=(1, 7),
        max_features=2000,
        max_df=0.9,
        lowercase=True,
        binary=True,
    )

    model = Pipeline([
        ('features', FeatureUnion([('charvec', char_ngrams)])),
        ('classifier', LinearSVC(C=0.001)),
    ])
    model.fit(Xtrain, Ytrain)

    predictions = model.predict(Xtest)
    print('Performance on a test set:\n')
    evaluate(Ytest, predictions)

In [4]:
def evaluate(Ytest, Ypredict):
    """Print accuracy, classification report and confusion matrix.

    Args:
        Ytest: reference labels (passed as the first argument of each metric).
        Ypredict: predicted labels, passed as the second argument.

    Returns:
        None. All three results are written to stdout.
    """
    # Each metric is computed right before it is printed, so earlier results
    # are still shown if a later metric raises.
    accuracy = accuracy_score(Ytest, Ypredict)
    print("\nAccuracy: ", accuracy, "\n")

    report = classification_report(Ytest, Ypredict)
    print("Classification report:\n\n", report)

    matrix = confusion_matrix(Ytest, Ypredict)
    print("Confusion matrix:\n\n", matrix, "\n")

In [5]:
# Both files are pipe-separated; rows are keyed by the case number column.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
train_set = (
    pd.read_csv('training_set_outcome_identification', sep='|')
    .set_index('Zaaknummer (LJN/ECLI)')
)
test_set = (
    pd.read_csv('test_set_outcome_identification', sep='|')
    .set_index('Zaaknummer (LJN/ECLI)')
)

In [6]:
# The keyword search needs no training, so it is scored on both splits together.
full_set = pd.concat([train_set, test_set])

# Label every ruling in a single pass instead of writing into the frame while
# iterating over it with iterrows(). Besides being slower, the row-by-row
# version breaks when a case number occurs more than once in the combined
# index, because `.loc[index, column]` then returns a Series rather than the
# text of one ruling.
full_set['extracted_verdict'] = full_set['extracted_text'].map(extract_uitspraak)

In [7]:
# Compare the keyword-based labels with the annotated outcomes for all cases.
print('Performance using keyword search:\n')
evaluate(
    Ytest=full_set['Uitspraak'].tolist(),
    Ypredict=full_set['extracted_verdict'].tolist(),
)

Performance using keyword search:


Accuracy:  0.8596491228070176 

Classification report:

               precision    recall  f1-score   support

       False       0.88      0.65      0.75       292
        True       0.85      0.96      0.90       620

    accuracy                           0.86       912
   macro avg       0.87      0.80      0.82       912
weighted avg       0.86      0.86      0.85       912

Confusion matrix:

 [[189 103]
 [ 25 595]] 



In [None]:
# Machine-learning approach: every document is reduced to its last 2500
# characters (after newlines are removed). According to the original note the
# parameters below were optimised using GridSearchCV (cv=3).
Xtrain = [
    text.replace('\n', '')[-2500:]
    for text in train_set['extracted_text'].tolist()
]
Ytrain = train_set['Uitspraak'].tolist()

# Binary character 1- to 7-gram features, same settings as in test_model().
vec3 = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 7),
    max_features=2000,
    max_df=0.9,
    lowercase=True,
    binary=True,
)
svm = Pipeline([
    ('features', FeatureUnion([('charvec', vec3)])),
    ('classifier', LinearSVC(C=0.001)),
])

# Predictions come from 10-fold cross-validation on the training set.
Ypredict = cross_val_predict(svm, Xtrain, Ytrain, cv=10)
print('Cross-validation performance:')
evaluate(Ytrain, Ypredict)

In [None]:
# Give the test documents the same preprocessing as the training documents
# (newlines removed, last 2500 characters kept), then fit on the training
# data and score on the test data.
Xtest = [
    text.replace('\n', '')[-2500:]
    for text in test_set['extracted_text'].tolist()
]
Ytest = test_set['Uitspraak'].tolist()
test_model(Xtrain=Xtrain, Ytrain=Ytrain, Xtest=Xtest, Ytest=Ytest)