In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!tar -xvf  /content/drive/MyDrive/crystal_ball_data.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
test_violations/Article10/001-75591.txt
test_violations/Article13/001-94449.txt
test_violations/Article3/001-115007.txt
test_violations/Article5/001-140912.txt
test_violations/Article6/001-58189.txt
train/Article3/both/001-163101.txt
test_violations/Article6/001-59762.txt
train/Article3/violation/001-98660.txt
test_violations/Article5/001-175493.txt
test_violations/Article6/001-92522.txt
test20/Article6/non-violation/001-122261.txt
test_violations/Article10/001-154839.txt
train/Article5/non-violation/001-105336.txt
test_violations/Article13/001-78223.txt
test_violations/Article5/001-75437.txt
test_violations/Article13/001-118599.txt
test_violations/Article5/001-100281.txt
test_violations/Article6/001-71507.txt
train/Article5/both/001-111428.txt
train/Article6/non-violation/001-59887.txt
test_violations/Article5/001-98882.txt
test_violations/Article2/001-86605.txt
train/Article3/non-violation/001-106771.txt
train/Article8/

In [3]:
#all necessary imports
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import glob,re, os, sys, random
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from nltk.corpus import stopwords
from random import shuffle

In [4]:

def extract_text(starts, ends, cases, violation):
    facts = []
    D = []
    years = []
    for case in cases:
        contline = ''
        year = 0
        with open(case, 'r') as f:
            for line in f:
                dat = re.search('^([0-9]{1,2}\s\w+\s([0-9]{4}))', line)
                if dat != None:
                    year = int(dat.group(2))
                    break
            if year>0:
                years.append(year)
                wr = 0
                for line in f:
                    if wr == 0:
                        if re.search(starts, line) != None:
                            wr = 1
                    if wr == 1 and re.search(ends, line) == None:
                        contline += line
                        contline += '\n'
                    elif re.search(ends, line) != None:
                        break
                facts.append(contline)
    for i in range(len(facts)):
        D.append((facts[i], violation, years[i])) 
    return D


In [5]:
def extract_parts(train_path, violation, part): #extract text from different parts
    cases = glob.glob(train_path)

    facts = []
    D = []
    years = []
    
    if part == 'relevant_law': #seprarte extraction for relevant law
        for case in cases:
            year = 0
            contline = ''
            with open(case, 'r') as f:
                for line in f:
                    dat = re.search('^([0-9]{1,2}\s\w+\s([0-9]{4}))', line)
                    if dat != None:
                        year = int(dat.group(2))
                        break
                if year> 0:
                    years.append(year)
                    wr = 0
                    for line in f:
                        if wr == 0:
                            if re.search('RELEVANT', line) != None:
                                wr = 1
                        if wr == 1 and re.search('THE LAW', line) == None and re.search('PROCEEDINGS', line) == None:
                            contline += line
                            contline += '\n'
                        elif re.search('THE LAW', line) != None or re.search('PROCEEDINGS', line) != None:
                            break
                    facts.append(contline)
        for i in range(len(facts)):
            D.append((facts[i], violation, years[i]))
        
    if part == 'facts':
        starts = 'THE FACTS'
        ends ='THE LAW'
        D = extract_text(starts, ends, cases, violation)
    if part == 'circumstances':
        starts = 'CIRCUMSTANCES'
        ends ='RELEVANT'
        D = extract_text(starts, ends, cases, violation)
    if part == 'procedure':
        starts = 'PROCEDURE'
        ends ='THE FACTS'
        D = extract_text(starts, ends, cases, violation)
    if part == 'procedure+facts':
        starts = 'PROCEDURE'
        ends ='THE LAW'
        D = extract_text(starts, ends, cases, violation)
    return D


In [6]:
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    print('***10-fold cross-validation***')
    pipeline = Pipeline([
        ('features', FeatureUnion(
            [vec],
        )),
        ('classifier', LinearSVC(C=c))
        ])
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=10) #10-fold cross-validation
    evaluate(Ytrain, Ypredict)

In [7]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    pipeline = Pipeline([
        ('features', FeatureUnion([vec]
        )),
        ('classifier', LinearSVC(C=c))
        ])
    pipeline.fit(Xtrain, Ytrain)
    print('***testing on violation testset***')
    Ypredict = pipeline.predict(Xtest_v)
    evaluate(Ytest_v, Ypredict)

In [8]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
        print('Accuracy:', accuracy_score(Ytest, Ypredict) )
        print('\nClassification report:\n', classification_report(Ytest, Ypredict))
        print('\nCR:', precision_recall_fscore_support(Ytest, Ypredict, average='macro'))
        print('\nConfusion matrix:\n', confusion_matrix(Ytest, Ypredict), '\n\n_______________________\n\n')

In [14]:
def run_pipeline(part, vec, c): #run tests
    
    print('Trained on *' + part + '* part of the cases')
    
    v = extract_parts(path+'train/'+article+'/violation/*.txt', 'violation', part)
    nv = extract_parts(path+'train/'+article+'/non-violation/*.txt', 'non-violation', part)
    trainset =v+nv
    print(trainset)
    shuffle(trainset)

    Xtrain = [i[0] for i in trainset]
    Ytrain = [i[1] for i in trainset]
    
    #test set with violations only
    if article == 'Article14':
        test = extract_parts('./test_violations/'+article+'/*.txt', 'non-violation', part)
    else:
        test = extract_parts('./test_violations/'+article+'/*.txt', 'violation', part)
    Xtest_v = [i[0] for i in test]
    Ytest_v = [i[1] for i in test]


    print('Training on', Ytrain.count('violation'),'+', Ytrain.count('non-violation'), '=', Ytrain.count('violation') + Ytrain.count('non-violation'), 'cases', '\nCases available for testing(violation):', Ytest_v.count('violation'))
    #train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c)
    train_model_cross_val(Xtrain, Ytrain, vec, c) #use for cross-validation

In [15]:
if __name__ == "__main__":
    ##INDICATE THE PATH TO THE DATA
    #path = '~/Documents/ECtHR_crystal_ball/'
    path = '/content/'
    articles = ['Article2', 'Article3', 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']
    for article in articles: #the parameters were determined using grid-search
        print (article)
        if article == 'Article2':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (3,4), binary = False, lowercase = True, min_df = 2, norm = 'l2', stop_words = None, use_idf = True))
            c = 0.1
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article3':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (1,1),  norm = None,  stop_words = None,  use_idf = True))
            c = 0.1
            run_pipeline('facts', vec, c)
        if article == 'Article5':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 1
            run_pipeline('facts', vec, c) 
        if article == 'Article6':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 2,  ngram_range = (2,4),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article8':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (3, 3),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 1
            run_pipeline('facts', vec, c)
        if article == 'Article10':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = False,  min_df = 1,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article11':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 2,  ngram_range = (1, 1),  norm = 'l1',  stop_words = 'english',  use_idf = False))
            c = 1
            run_pipeline('procedure', vec, c)
        if article == 'Article13':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 1,  ngram_range = (1, 2),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article14':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = 'english',  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 57 + 57 = 114 cases 
Cases available for testing(violation): 398
***10-fold cross-validation***
Accuracy: 0.7543859649122807

Classification report:
                precision    recall  f1-score   support

non-violation       0.74      0.79      0.76        57
    violation       0.77      0.72      0.75        57

     accuracy                           0.75       114
    macro avg       0.76      0.75      0.75       114
 weighted avg       0.76      0.75      0.75       114


CR: (0.7556449118465822, 0.7543859649122807, 0.7540832049306626, None)

Confusion matrix:
 [[45 12]
 [16 41]] 

_______________________


Article3
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 284 + 284 = 568 cases 
Cases available for testing(violation): 851
***10-fold cross-validation***
Accuracy: 0.7975352112676056

Classification report:
                precision    recall  f1-score   support

non-violation       0.81      0.77      0.79       284
    violation       0.78      0.82      0.80       284

     accuracy                           0.80       568
    macro avg       0.80      0.80      0.80       568
 weighted avg       0.80      0.80      0.80       568


CR: (0.7981599512964827, 0.7975352112676056, 0.7974290986339179, None)

Confusion matrix:
 [[220  64]
 [ 51 233]] 

_______________________


Article5
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 150 + 150 = 300 cases 
Cases available for testing(violation): 1118
***10-fold cross-validation***
Accuracy: 0.7566666666666667

Classification report:
                precision    recall  f1-score   support

non-violation       0.76      0.75      0.75       150
    violation       0.75      0.77      0.76       150

     accuracy                           0.76       300
    macro avg       0.76      0.76      0.76       300
 weighted avg       0.76      0.76      0.76       300


CR: (0.7567693744164332, 0.7566666666666667, 0.7566423308997567, None)

Confusion matrix:
 [[112  38]
 [ 35 115]] 

_______________________


Article6
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 458 + 458 = 916 cases 
Cases available for testing(violation): 4092
***10-fold cross-validation***
Accuracy: 0.8067685589519651

Classification report:
                precision    recall  f1-score   support

non-violation       0.78      0.86      0.82       458
    violation       0.85      0.75      0.80       458

     accuracy                           0.81       916
    macro avg       0.81      0.81      0.81       916
 weighted avg       0.81      0.81      0.81       916


CR: (0.8106201396967605, 0.8067685589519651, 0.8061676958114901, None)

Confusion matrix:
 [[395  63]
 [114 344]] 

_______________________


Article8
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 252
***10-fold cross-validation***
Accuracy: 0.6415094339622641

Classification report:
                precision    recall  f1-score   support

non-violation       0.63      0.68      0.65       106
    violation       0.65      0.60      0.63       106

     accuracy                           0.64       212
    macro avg       0.64      0.64      0.64       212
 weighted avg       0.64      0.64      0.64       212


CR: (0.6423200859291085, 0.6415094339622641, 0.6409982174688056, None)

Confusion matrix:
 [[72 34]
 [42 64]] 

_______________________


Article11
Trained on *procedure* part of the cases
[('PROCEDURE\n\n1.\xa0\xa0The case was referred to the Court by the European Commission of Human Rights (“the Commission”) on 28 October 1996, within the three-month period laid down by Article 32 § 1 and Article 47 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”).

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 1060
***10-fold cross-validation***
Accuracy: 0.839622641509434

Classification report:
                precision    recall  f1-score   support

non-violation       0.84      0.84      0.84       106
    violation       0.84      0.84      0.84       106

     accuracy                           0.84       212
    macro avg       0.84      0.84      0.84       212
 weighted avg       0.84      0.84      0.84       212


CR: (0.839622641509434, 0.839622641509434, 0.839622641509434, None)

Confusion matrix:
 [[89 17]
 [17 89]] 

_______________________


Article14
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Accuracy: 0.7673611111111112

Classification report:
                precision    recall  f1-score   support

non-violation       0.76      0.78      0.77       144
    violation       0.77      0.76      0.76       144

     accuracy                           0.77       288
    macro avg       0.77      0.77      0.77       288
 weighted avg       0.77      0.77      0.77       288


CR: (0.7674772036474165, 0.7673611111111112, 0.7673358654367879, None)

Confusion matrix:
 [[112  32]
 [ 35 109]] 

_______________________




In [16]:
# Decision Tree model 
from sklearn.tree import DecisionTreeClassifier
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    print('***10-fold cross-validation***')
    pipeline = Pipeline([
        ('features', FeatureUnion(
            [vec],
        )),
        ('classifier', DecisionTreeClassifier())
        ])
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=10) #10-fold cross-validation
    evaluate(Ytrain, Ypredict)

In [17]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    pipeline = Pipeline([
        ('features', FeatureUnion([vec]
        )),
        ('classifier', DecisionTreeClassifier)
        ])
    pipeline.fit(Xtrain, Ytrain)
    print('***testing on violation testset***')
    Ypredict = pipeline.predict(Xtest_v)
    evaluate(Ytest_v, Ypredict)

In [18]:
if __name__ == "__main__":
    ##INDICATE THE PATH TO THE DATA
    #path = '~/Documents/ECtHR_crystal_ball/'
    path = '/content/'
    articles = ['Article2', 'Article3', 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']
    for article in articles: #the parameters were determined using grid-search
        print (article)
        if article == 'Article2':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (3,4), binary = False, lowercase = True, min_df = 2, norm = 'l2', stop_words = None, use_idf = True))
            c = 0.1
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article3':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (1,1),  norm = None,  stop_words = None,  use_idf = True))
            c = 0.1
            run_pipeline('facts', vec, c)
        if article == 'Article5':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 1
            run_pipeline('facts', vec, c) 
        if article == 'Article6':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 2,  ngram_range = (2,4),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article8':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (3, 3),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 1
            run_pipeline('facts', vec, c)
        if article == 'Article10':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = False,  min_df = 1,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article11':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 2,  ngram_range = (1, 1),  norm = 'l1',  stop_words = 'english',  use_idf = False))
            c = 1
            run_pipeline('procedure', vec, c)
        if article == 'Article13':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 1,  ngram_range = (1, 2),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article14':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = 'english',  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 57 + 57 = 114 cases 
Cases available for testing(violation): 398
***10-fold cross-validation***
Accuracy: 0.6403508771929824

Classification report:
                precision    recall  f1-score   support

non-violation       0.65      0.60      0.62        57
    violation       0.63      0.68      0.66        57

     accuracy                           0.64       114
    macro avg       0.64      0.64      0.64       114
 weighted avg       0.64      0.64      0.64       114


CR: (0.641439205955335, 0.6403508771929824, 0.6396576979415619, None)

Confusion matrix:
 [[34 23]
 [18 39]] 

_______________________


Article3
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 284 + 284 = 568 cases 
Cases available for testing(violation): 851
***10-fold cross-validation***
Accuracy: 0.6461267605633803

Classification report:
                precision    recall  f1-score   support

non-violation       0.65      0.65      0.65       284
    violation       0.65      0.64      0.65       284

     accuracy                           0.65       568
    macro avg       0.65      0.65      0.65       568
 weighted avg       0.65      0.65      0.65       568


CR: (0.6461285723141776, 0.6461267605633803, 0.6461256637003562, None)

Confusion matrix:
 [[184 100]
 [101 183]] 

_______________________


Article5
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 150 + 150 = 300 cases 
Cases available for testing(violation): 1118
***10-fold cross-validation***
Accuracy: 0.54

Classification report:
                precision    recall  f1-score   support

non-violation       0.54      0.55      0.54       150
    violation       0.54      0.53      0.54       150

     accuracy                           0.54       300
    macro avg       0.54      0.54      0.54       300
 weighted avg       0.54      0.54      0.54       300


CR: (0.5400071123755334, 0.54, 0.5399795546468731, None)

Confusion matrix:
 [[82 68]
 [70 80]] 

_______________________


Article6
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 458 + 458 = 916 cases 
Cases available for testing(violation): 4092
***10-fold cross-validation***
Accuracy: 0.6572052401746725

Classification report:
                precision    recall  f1-score   support

non-violation       0.68      0.59      0.63       458
    violation       0.64      0.72      0.68       458

     accuracy                           0.66       916
    macro avg       0.66      0.66      0.66       916
 weighted avg       0.66      0.66      0.66       916


CR: (0.6597674418604651, 0.6572052401746725, 0.6558253519239146, None)

Confusion matrix:
 [[272 186]
 [128 330]] 

_______________________


Article8
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 229 + 229 = 458 cases 
Cases available for testing(violation): 496
***10-fold cross-validation***
Accuracy: 0.5414847161572053

Classification report:
                precision    recall  f1-score   support

non-violation       0.56      0.41      0.48       229
    violation       0.53      0.67      0.59       229

     accuracy                           0.54       458
    macro avg       0.54      0.54      0.53       458
 weighted avg       0.54      0.54      0.53       458


CR: (0.5443283004258614, 0.5414847161572053, 0.5340116279069768, None)

Confusion matrix:
 [[ 95 134]
 [ 76 153]] 

_______________________


Article10
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Accuracy: 0.5518867924528302

Classification report:
                precision    recall  f1-score   support

non-violation       0.55      0.60      0.57       106
    violation       0.56      0.50      0.53       106

     accuracy                           0.55       212
    macro avg       0.55      0.55      0.55       212
 weighted avg       0.55      0.55      0.55       212


CR: (0.5524516419253261, 0.5518867924528301, 0.5506771077348682, None)

Confusion matrix:
 [[64 42]
 [53 53]] 

_______________________


Article11
Trained on *procedure* part of the cases
[('PROCEDURE\n\n1.\xa0\xa0The case was referred to the Court by the European Commission of Human Rights (“the Commission”) on 28 October 1996, within the three-month period laid down by Article 32 § 1 and Article 47 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”). It originated in an application (no. 19392/92) against the Republic of Turkey lodged with the Commission unde

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 1060
***10-fold cross-validation***
Accuracy: 0.6509433962264151

Classification report:
                precision    recall  f1-score   support

non-violation       0.66      0.63      0.64       106
    violation       0.65      0.67      0.66       106

     accuracy                           0.65       212
    macro avg       0.65      0.65      0.65       212
 weighted avg       0.65      0.65      0.65       212


CR: (0.6511586452762923, 0.6509433962264151, 0.6508190883190883, None)

Confusion matrix:
 [[67 39]
 [35 71]] 

_______________________


Article14
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Accuracy: 0.6215277777777778

Classification report:
                precision    recall  f1-score   support

non-violation       0.62      0.61      0.62       144
    violation       0.62      0.63      0.63       144

     accuracy                           0.62       288
    macro avg       0.62      0.62      0.62       288
 weighted avg       0.62      0.62      0.62       288


CR: (0.621580547112462, 0.6215277777777778, 0.6214867064568639, None)

Confusion matrix:
 [[88 56]
 [53 91]] 

_______________________




In [19]:
# Random Forest model 
from sklearn.ensemble import RandomForestClassifier
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    print('***10-fold cross-validation***')
    pipeline = Pipeline([
        ('features', FeatureUnion(
            [vec],
        )),
        ('classifier', RandomForestClassifier())
        ])
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=10) #10-fold cross-validation
    evaluate(Ytrain, Ypredict)

In [20]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    pipeline = Pipeline([
        ('features', FeatureUnion([vec]
        )),
        ('classifier', RandomForestClassifier())
        ])
    pipeline.fit(Xtrain, Ytrain)
    print('***testing on violation testset***')
    Ypredict = pipeline.predict(Xtest_v)
    evaluate(Ytest_v, Ypredict)

In [21]:
if __name__ == "__main__":
    ##INDICATE THE PATH TO THE DATA
    #path = '~/Documents/ECtHR_crystal_ball/'
    path = '/content/'
    articles = ['Article2', 'Article3', 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']
    for article in articles: #the parameters were determined using grid-search
        print (article)
        if article == 'Article2':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (3,4), binary = False, lowercase = True, min_df = 2, norm = 'l2', stop_words = None, use_idf = True))
            c = 0.1
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article3':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (1,1),  norm = None,  stop_words = None,  use_idf = True))
            c = 0.1
            run_pipeline('facts', vec, c)
        if article == 'Article5':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 1
            run_pipeline('facts', vec, c) 
        if article == 'Article6':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 2,  ngram_range = (2,4),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article8':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (3, 3),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 1
            run_pipeline('facts', vec, c)
        if article == 'Article10':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = False,  min_df = 1,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article11':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 2,  ngram_range = (1, 1),  norm = 'l1',  stop_words = 'english',  use_idf = False))
            c = 1
            run_pipeline('procedure', vec, c)
        if article == 'Article13':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 1,  ngram_range = (1, 2),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article14':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = 'english',  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)

Article2
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 57 + 57 = 114 cases 
Cases available for testing(violation): 398
***10-fold cross-validation***
Accuracy: 0.7105263157894737

Classification report:
                precision    recall  f1-score   support

non-violation       0.68      0.81      0.74        57
    violation       0.76      0.61      0.68        57

     accuracy                           0.71       114
    macro avg       0.72      0.71      0.71       114
 weighted avg       0.72      0.71      0.71       114


CR: (0.7186700767263428, 0.7105263157894737, 0.7078058252427185, None)

Confusion matrix:
 [[46 11]
 [22 35]] 

_______________________


Article3
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 284 + 284 = 568 cases 
Cases available for testing(violation): 851
***10-fold cross-validation***
Accuracy: 0.772887323943662

Classification report:
                precision    recall  f1-score   support

non-violation       0.79      0.74      0.77       284
    violation       0.76      0.80      0.78       284

     accuracy                           0.77       568
    macro avg       0.77      0.77      0.77       568
 weighted avg       0.77      0.77      0.77       568


CR: (0.7738686276705613, 0.772887323943662, 0.7726836986365118, None)

Confusion matrix:
 [[211  73]
 [ 56 228]] 

_______________________


Article5
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 150 + 150 = 300 cases 
Cases available for testing(violation): 1118
***10-fold cross-validation***
Accuracy: 0.6866666666666666

Classification report:
                precision    recall  f1-score   support

non-violation       0.69      0.67      0.68       150
    violation       0.68      0.70      0.69       150

     accuracy                           0.69       300
    macro avg       0.69      0.69      0.69       300
 weighted avg       0.69      0.69      0.69       300


CR: (0.686799501867995, 0.6866666666666666, 0.6866109530583214, None)

Confusion matrix:
 [[101  49]
 [ 45 105]] 

_______________________


Article6
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 458 + 458 = 916 cases 
Cases available for testing(violation): 4092
***10-fold cross-validation***
Accuracy: 0.732532751091703

Classification report:
                precision    recall  f1-score   support

non-violation       0.76      0.67      0.72       458
    violation       0.71      0.79      0.75       458

     accuracy                           0.73       916
    macro avg       0.74      0.73      0.73       916
 weighted avg       0.74      0.73      0.73       916


CR: (0.7356889178806987, 0.732532751091703, 0.7316343137852812, None)

Confusion matrix:
 [[309 149]
 [ 96 362]] 

_______________________


Article8
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 252
***10-fold cross-validation***
Accuracy: 0.6179245283018868

Classification report:
                precision    recall  f1-score   support

non-violation       0.60      0.70      0.65       106
    violation       0.64      0.54      0.58       106

     accuracy                           0.62       212
    macro avg       0.62      0.62      0.62       212
 weighted avg       0.62      0.62      0.62       212


CR: (0.6210377272312049, 0.6179245283018868, 0.6154517971111858, None)

Confusion matrix:
 [[74 32]
 [49 57]] 

_______________________


Article11
Trained on *procedure* part of the cases
[('PROCEDURE\n\n1.\xa0\xa0The case was referred to the Court by the European Commission of Human Rights (“the Commission”) on 28 October 1996, within the three-month period laid down by Article 32 § 1 and Article 47 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”).

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 1060
***10-fold cross-validation***
Accuracy: 0.7830188679245284

Classification report:
                precision    recall  f1-score   support

non-violation       0.76      0.83      0.79       106
    violation       0.81      0.74      0.77       106

     accuracy                           0.78       212
    macro avg       0.79      0.78      0.78       212
 weighted avg       0.79      0.78      0.78       212


CR: (0.7855603448275862, 0.7830188679245282, 0.7825350102577825, None)

Confusion matrix:
 [[88 18]
 [28 78]] 

_______________________


Article14
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Accuracy: 0.6944444444444444

Classification report:
                precision    recall  f1-score   support

non-violation       0.68      0.74      0.71       144
    violation       0.71      0.65      0.68       144

     accuracy                           0.69       288
    macro avg       0.70      0.69      0.69       288
 weighted avg       0.70      0.69      0.69       288


CR: (0.6958041958041958, 0.6944444444444444, 0.693913043478261, None)

Confusion matrix:
 [[106  38]
 [ 50  94]] 

_______________________




In [22]:
from xgboost import XGBClassifier
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    print('***10-fold cross-validation***')
    pipeline = Pipeline([
        ('features', FeatureUnion(
            [vec],
        )),
        ('classifier', XGBClassifier())
        ])
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=10) #10-fold cross-validation
    evaluate(Ytrain, Ypredict)

In [23]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    pipeline = Pipeline([
        ('features', FeatureUnion([vec]
        )),
        ('classifier', XGBClassifier())
        ])
    pipeline.fit(Xtrain, Ytrain)
    print('***testing on violation testset***')
    Ypredict = pipeline.predict(Xtest_v)
    evaluate(Ytest_v, Ypredict)

In [24]:
if __name__ == "__main__":
    ##INDICATE THE PATH TO THE DATA
    #path = '~/Documents/ECtHR_crystal_ball/'
    path = '/content/'
    articles = ['Article2', 'Article3', 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']
    for article in articles: #the parameters were determined using grid-search
        print (article)
        if article == 'Article2':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (3,4), binary = False, lowercase = True, min_df = 2, norm = 'l2', stop_words = None, use_idf = True))
            c = 0.1
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article3':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (1,1),  norm = None,  stop_words = None,  use_idf = True))
            c = 0.1
            run_pipeline('facts', vec, c)
        if article == 'Article5':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 1
            run_pipeline('facts', vec, c) 
        if article == 'Article6':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 2,  ngram_range = (2,4),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article8':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 1,  ngram_range = (3, 3),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 1
            run_pipeline('facts', vec, c)
        if article == 'Article10':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = False,  min_df = 1,  ngram_range = (1, 1),  norm = 'l2',  stop_words = None,  use_idf = False))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article11':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 2,  ngram_range = (1, 1),  norm = 'l1',  stop_words = 'english',  use_idf = False))
            c = 1
            run_pipeline('procedure', vec, c)
        if article == 'Article13':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = False,  lowercase = True,  min_df = 1,  ngram_range = (1, 2),  norm = 'l2',  stop_words = None,  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)
        if article == 'Article14':
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', binary = True,  lowercase = True,  min_df = 3,  ngram_range = (1, 1),  norm = 'l2',  stop_words = 'english',  use_idf = True))
            c = 5
            run_pipeline('procedure+facts', vec, c)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 57 + 57 = 114 cases 
Cases available for testing(violation): 398
***10-fold cross-validation***
Accuracy: 0.6754385964912281

Classification report:
                precision    recall  f1-score   support

non-violation       0.67      0.68      0.68        57
    violation       0.68      0.67      0.67        57

     accuracy                           0.68       114
    macro avg       0.68      0.68      0.68       114
 weighted avg       0.68      0.68      0.68       114


CR: (0.6754926108374384, 0.6754385964912281, 0.6754136206233166, None)

Confusion matrix:
 [[39 18]
 [19 38]] 

_______________________


Article3
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 284 + 284 = 568 cases 
Cases available for testing(violation): 851
***10-fold cross-validation***
Accuracy: 0.7834507042253521

Classification report:
                precision    recall  f1-score   support

non-violation       0.79      0.77      0.78       284
    violation       0.78      0.80      0.79       284

     accuracy                           0.78       568
    macro avg       0.78      0.78      0.78       568
 weighted avg       0.78      0.78      0.78       568


CR: (0.7836230104085253, 0.7834507042253521, 0.7834178098116718, None)

Confusion matrix:
 [[219  65]
 [ 58 226]] 

_______________________


Article5
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 150 + 150 = 300 cases 
Cases available for testing(violation): 1118
***10-fold cross-validation***
Accuracy: 0.7066666666666667

Classification report:
                precision    recall  f1-score   support

non-violation       0.72      0.67      0.69       150
    violation       0.69      0.75      0.72       150

     accuracy                           0.71       300
    macro avg       0.71      0.71      0.71       300
 weighted avg       0.71      0.71      0.71       300


CR: (0.7079978529253892, 0.7066666666666667, 0.7061965811965811, None)

Confusion matrix:
 [[100  50]
 [ 38 112]] 

_______________________


Article6
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 458 + 458 = 916 cases 
Cases available for testing(violation): 4092
***10-fold cross-validation***
Accuracy: 0.7543668122270742

Classification report:
                precision    recall  f1-score   support

non-violation       0.77      0.72      0.75       458
    violation       0.74      0.79      0.76       458

     accuracy                           0.75       916
    macro avg       0.76      0.75      0.75       916
 weighted avg       0.76      0.75      0.75       916


CR: (0.7555375162234259, 0.7543668122270742, 0.754085157410556, None)

Confusion matrix:
 [[330 128]
 [ 97 361]] 

_______________________


Article8
Trained on *facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 252
***10-fold cross-validation***
Accuracy: 0.6226415094339622

Classification report:
                precision    recall  f1-score   support

non-violation       0.62      0.62      0.62       106
    violation       0.62      0.62      0.62       106

     accuracy                           0.62       212
    macro avg       0.62      0.62      0.62       212
 weighted avg       0.62      0.62      0.62       212


CR: (0.6226415094339622, 0.6226415094339622, 0.6226415094339622, None)

Confusion matrix:
 [[66 40]
 [40 66]] 

_______________________


Article11
Trained on *procedure* part of the cases
[('PROCEDURE\n\n1.\xa0\xa0The case was referred to the Court by the European Commission of Human Rights (“the Commission”) on 28 October 1996, within the three-month period laid down by Article 32 § 1 and Article 47 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”).

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 1060
***10-fold cross-validation***
Accuracy: 0.75

Classification report:
                precision    recall  f1-score   support

non-violation       0.73      0.78      0.76       106
    violation       0.77      0.72      0.74       106

     accuracy                           0.75       212
    macro avg       0.75      0.75      0.75       212
 weighted avg       0.75      0.75      0.75       212


CR: (0.7510950210065255, 0.75, 0.7497271411070275, None)

Confusion matrix:
 [[83 23]
 [30 76]] 

_______________________


Article14
Trained on *procedure+facts* part of the cases


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Accuracy: 0.7048611111111112

Classification report:
                precision    recall  f1-score   support

non-violation       0.71      0.70      0.70       144
    violation       0.70      0.71      0.71       144

     accuracy                           0.70       288
    macro avg       0.70      0.70      0.70       288
 weighted avg       0.70      0.70      0.70       288


CR: (0.7048709910778876, 0.7048611111111112, 0.7048575527772085, None)

Confusion matrix:
 [[101  43]
 [ 42 102]] 

_______________________


