In [26]:
#all necessary imports
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import glob,re, os, sys, random
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from nltk.corpus import stopwords
from random import shuffle

In [2]:
def extract_text(starts, ends, cases, violation):
    """Extract one section of text from each case file.

    Each file is read in two phases using the same file iterator:

    1. Lines are scanned until one begins with a date such as
       ``12 March 2015`` (1-2 digits, whitespace, a word, whitespace,
       4 digits). The 4-digit group is taken as the year. Files with no
       such line, or whose year parses to 0, are skipped entirely.
    2. The remaining lines are scanned for the section. Collection begins
       on the first line matching ``starts`` (that line is included) and
       stops at the first line matching ``ends`` (that line is excluded).
       If ``ends`` matches before ``starts`` has been seen, scanning stops
       and an empty text is recorded for the case.

    Every collected line is followed by an extra ``'\\n'``, so lines that
    already end with a newline are separated by a blank line.

    Parameters
    ----------
    starts : str
        Regular expression marking the first line of the section.
    ends : str
        Regular expression marking the line at which the section stops.
    cases : iterable of str
        Paths of the case files to read.
    violation : object
        Label stored unchanged in every returned tuple.

    Returns
    -------
    list of tuple
        One ``(text, violation, year)`` tuple per case that had a date line.
    """
    # Raw string: '\s' and '\w' are invalid escapes in a normal string literal.
    date_re = re.compile(r'^([0-9]{1,2}\s\w+\s([0-9]{4}))')
    start_re = re.compile(starts)
    end_re = re.compile(ends)

    D = []
    for case in cases:
        with open(case, 'r') as f:
            # Phase 1: find the date line and take its year.
            year = 0
            for line in f:
                dat = date_re.search(line)
                if dat is not None:
                    year = int(dat.group(2))
                    break
            if year <= 0:
                continue

            # Phase 2: continue from the line after the date.
            collected = []
            inside = False
            for line in f:
                if not inside and start_re.search(line) is not None:
                    inside = True
                if end_re.search(line) is not None:
                    # The end marker stops the scan even before the start marker.
                    break
                if inside:
                    collected.append(line)
                    collected.append('\n')
        # Text, label and year are appended together so they cannot get out of step.
        D.append((''.join(collected), violation, year))
    return D

In [3]:
def extract_parts(train_path, violation, part): #extract text from different parts
    """Extract the requested part from every case file matching a glob pattern.

    Parameters
    ----------
    train_path : str
        Glob pattern passed to ``glob.glob`` to list the case files.
    violation : object
        Label stored unchanged in every returned tuple.
    part : str
        Which part to extract. One of ``'relevant_law'``, ``'facts'``,
        ``'circumstances'``, ``'procedure'`` or ``'procedure+facts'``.

    Returns
    -------
    list of tuple
        The ``(text, violation, year)`` tuples produced by ``extract_text``.

    Raises
    ------
    ValueError
        If ``part`` is not one of the supported names. An unknown name
        previously yielded an empty list without any warning.
    """
    # part name -> (start marker regex, end marker regex)
    section_markers = {
        # The relevant-law part ends at whichever of the two headings comes first.
        'relevant_law': ('RELEVANT', 'THE LAW|PROCEEDINGS'),
        'facts': ('THE FACTS', 'THE LAW'),
        'circumstances': ('CIRCUMSTANCES', 'RELEVANT'),
        'procedure': ('PROCEDURE', 'THE FACTS'),
        'procedure+facts': ('PROCEDURE', 'THE LAW'),
    }
    if part not in section_markers:
        raise ValueError(
            'unknown part %r; expected one of %s' % (part, sorted(section_markers))
        )

    starts, ends = section_markers[part]
    cases = glob.glob(train_path)
    return extract_text(starts, ends, cases, violation)

In [13]:
def train_model_cross_val(Xtrain, Ytrain, vec, c, cv=10): #Linear SVC model cross-validation
    """Cross-validate a feature-union + LinearSVC pipeline and print the scores.

    Parameters
    ----------
    Xtrain : list
        Training documents.
    Ytrain : list
        Labels aligned with ``Xtrain``.
    vec : tuple
        ``(name, transformer)`` pair used as the only member of the
        ``FeatureUnion``.
    c : float
        Value passed to ``LinearSVC`` as ``C``.
    cv : int, optional
        Number of folds handed to ``cross_val_predict``. Defaults to 10,
        the value that was previously hard-coded.

    Returns
    -------
    None
        The out-of-fold predictions are passed to ``evaluate``, which
        prints the metrics.
    """
    print('***{}-fold cross-validation***'.format(cv))
    pipeline = Pipeline([
        ('features', FeatureUnion([vec])),
        ('classifier', LinearSVC(C=c)),
    ])
    Ypredict = cross_val_predict(pipeline, Xtrain, Ytrain, cv=cv)
    evaluate(Ytrain, Ypredict)

In [14]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    """Fit the pipeline on the training data and score it on a held-out set.

    ``vec`` is a ``(name, transformer)`` pair used as the only member of the
    ``FeatureUnion``; ``c`` is passed to ``LinearSVC`` as ``C``. Predictions
    for ``Xtest_v`` are compared with ``Ytest_v`` by ``evaluate``, which
    prints the metrics. Nothing is returned.
    """
    feature_stage = FeatureUnion([vec])
    svm_stage = LinearSVC(C=c)
    model = Pipeline([
        ('features', feature_stage),
        ('classifier', svm_stage),
    ])
    model.fit(Xtrain, Ytrain)

    print('***testing on violation testset***')
    predicted = model.predict(Xtest_v)
    evaluate(Ytest_v, predicted)

In [30]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
    """Print accuracy, the classification report, macro P/R/F and the confusion matrix.

    ``Ytest`` holds the reference labels and ``Ypredict`` the predicted ones.
    Each metric is computed and printed in turn; nothing is returned.
    """
    accuracy = accuracy_score(Ytest, Ypredict)
    print('Accuracy:', accuracy)

    report = classification_report(Ytest, Ypredict)
    print('\nClassification report:\n', report)

    # Macro-averaged (precision, recall, f-score, support) tuple.
    macro_scores = precision_recall_fscore_support(Ytest, Ypredict, average='macro')
    print('\nCR:', macro_scores)

    matrix = confusion_matrix(Ytest, Ypredict)
    print('\nConfusion matrix:\n', matrix, '\n\n_______________________\n\n')

In [31]:
def run_pipeline(part, vec, c, seed=None): #run tests
    """Load the data for the current article and cross-validate a model on it.

    Reads the module-level names ``path`` (data root, concatenated directly
    with ``'train/'``) and ``article`` (e.g. ``'Article6'``); both must be
    set before this function is called.

    Parameters
    ----------
    part : str
        Part of the case text to train on; forwarded to ``extract_parts``.
    vec : tuple
        ``(name, transformer)`` pair forwarded to the training function.
    c : float
        Value forwarded to the training function for ``LinearSVC``'s ``C``.
    seed : int, optional
        When given, the training set is shuffled with a private
        ``random.Random(seed)`` so the order can be reproduced. When
        omitted, the global ``shuffle`` is used as before.

    Returns
    -------
    None
        Progress and evaluation results are printed.
    """
    print('Trained on *' + part + '* part of the cases')

    v = extract_parts(path+'train/'+article+'/violation/*.txt', 'violation', part)
    nv = extract_parts(path+'train/'+article+'/non-violation/*.txt', 'non-violation', part)
    trainset = v + nv
    # The order matters downstream, so allow it to be pinned with a seed.
    if seed is None:
        shuffle(trainset)
    else:
        random.Random(seed).shuffle(trainset)

    Xtrain = [i[0] for i in trainset]
    Ytrain = [i[1] for i in trainset]

    #test set with violations only
    # For Article14 the held-out cases are labelled 'non-violation'.
    test_label = 'non-violation' if article == 'Article14' else 'violation'
    # NOTE(review): this location is relative to the working directory, not to
    # `path` like the training data -- confirm that is intended.
    test = extract_parts('./test_violations/'+article+'/*.txt', test_label, part)
    Xtest_v = [i[0] for i in test]
    Ytest_v = [i[1] for i in test]

    n_violation = Ytrain.count('violation')
    n_non_violation = Ytrain.count('non-violation')
    # Report the size of the test set: counting the label 'violation' always
    # gave 0 for Article14, whose test cases carry the other label.
    print('Training on', n_violation, '+', n_non_violation, '=', n_violation + n_non_violation, 'cases', '\nCases available for testing(violation):', len(Ytest_v))
    #train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c)
    train_model_cross_val(Xtrain, Ytrain, vec, c) #use for cross-validation

In [32]:
if __name__ == "__main__":
    ##INDICATE THE PATH TO THE DATA
    # `path` is read as a global by run_pipeline, which concatenates it
    # directly with 'train/', so it must end with a slash.
    path = '../../crystal_ball_data/'

    # One row per article: (article, part of the case to train on, C for
    # LinearSVC, TfidfVectorizer options).
    #the parameters were determined using grid-search
    ARTICLE_SETTINGS = [
        ('Article2', 'procedure+facts', 0.1,
         dict(ngram_range=(3, 4), binary=False, lowercase=True, min_df=2, norm='l2', stop_words=None, use_idf=True)),
        ('Article3', 'facts', 0.1,
         dict(ngram_range=(1, 1), binary=True, lowercase=True, min_df=1, norm=None, stop_words=None, use_idf=True)),
        ('Article5', 'facts', 1,
         dict(ngram_range=(1, 1), binary=True, lowercase=True, min_df=3, norm='l2', stop_words=None, use_idf=True)),
        ('Article6', 'procedure+facts', 5,
         dict(ngram_range=(2, 4), binary=True, lowercase=True, min_df=2, norm='l2', stop_words=None, use_idf=True)),
        ('Article8', 'facts', 1,
         dict(ngram_range=(3, 3), binary=True, lowercase=True, min_df=1, norm='l2', stop_words=None, use_idf=False)),
        ('Article10', 'procedure+facts', 5,
         dict(ngram_range=(1, 1), binary=False, lowercase=False, min_df=1, norm='l2', stop_words=None, use_idf=False)),
        ('Article11', 'procedure', 1,
         dict(ngram_range=(1, 1), binary=False, lowercase=True, min_df=2, norm='l1', stop_words='english', use_idf=False)),
        ('Article13', 'procedure+facts', 5,
         dict(ngram_range=(1, 2), binary=False, lowercase=True, min_df=1, norm='l2', stop_words=None, use_idf=True)),
        ('Article14', 'procedure+facts', 5,
         dict(ngram_range=(1, 1), binary=True, lowercase=True, min_df=3, norm='l2', stop_words='english', use_idf=True)),
    ]
    articles = [settings[0] for settings in ARTICLE_SETTINGS]

    # `article` must stay a module-level loop variable: run_pipeline reads it
    # as a global to build the data paths.
    for article, part, c, tfidf_options in ARTICLE_SETTINGS:
        print (article)
        vec = ('wordvec', TfidfVectorizer(analyzer='word', **tfidf_options))
        run_pipeline(part, vec, c)

Article2
Trained on *procedure+facts* part of the cases
Training on 57 + 57 = 114 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.7017543859649122

Classification report:
                precision    recall  f1-score   support

non-violation       0.69      0.74      0.71        57
    violation       0.72      0.67      0.69        57

  avg / total       0.70      0.70      0.70       114


CR: (0.702752861119703, 0.7017543859649122, 0.7013867488443759, None)

Confusion matrix:
 [[42 15]
 [19 38]] 

_______________________


Article3
Trained on *facts* part of the cases
Training on 284 + 284 = 568 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.801056338028169

Classification report:
                precision    recall  f1-score   support

non-violation       0.81      0.78      0.80       284
    violation       0.79      0.82      0.80       284

  avg / total       0.80      0.80      0.80       568


CR: (0.8015086608306947, 0.801056338028169, 0.8009816962942979, None)

Confusion matrix:
 [[222  62]
 [ 51 233]] 

_______________________


Article5
Trained on *facts* part of the cases
Training on 150 + 150 = 300 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.7633333333333333

Classification report:
                precision    recall  f1-score   support

non-violation       0.77      0.75      0.76       150
    violation       0.76      0.77      0.77       150

  avg / total       0.76      0.76      0.76       300


CR: (0.76343870881686, 0.7633333333333333, 0.7633096642997632, None)

Confusion matrix:
 [[113  37]
 [ 34 116]] 

_______________________


Article6
Trained on *procedure+facts* part of the cases
Training on 458 + 458 = 916 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.8165938864628821

Classification report:
                precision    recall  f1-score   support

non-violation       0.79      0.87      0.83       458
    violation       0.86      0.76      0.81       458

  avg / total       0.82      0.82      0.82       916


CR: (0.8204126138644434, 0.8165938864628821, 0.8160457877296917, None)

Confusion matrix:
 [[399  59]
 [109 349]] 

_______________________


Article8
Trained on *facts* part of the cases
Training on 229 + 229 = 458 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.7096069868995634

Classification report:
                precision    recall  f1-score   support

non-violation       0.69      0.76      0.72       229
    violation       0.73      0.66      0.70       229

  avg / total       0.71      0.71      0.71       458


CR: (0.7113846153846153, 0.7096069868995634, 0.7089951892529727, None)

Confusion matrix:
 [[173  56]
 [ 77 152]] 

_______________________


Article10
Trained on *procedure+facts* part of the cases
Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.6320754716981132

Classification report:
                precision    recall  f1-score   support

non-violation       0.63      0.63      0.63       106
    violation       0.63      0.63      0.63       106

  avg / total       0.63      0.63      0.63       212


CR: (0.6320754716981132, 0.6320754716981132, 0.6320754716981132, None)

Confusion matrix:
 [[67 39]
 [39 67]] 

_______________________


Article11
Trained on *procedure* part of the cases
Training on 32 + 32 = 64 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.84375

Classification report:
                precision    recall  f1-score   support

non-violation       0.87      0.81      0.84        32
    violation       0.82      0.88      0.85        32

  avg / total       0.85      0.84      0.84        64


CR: (0.8450980392156863, 0.84375, 0.8435972629521017, None)

Confusion matrix:
 [[26  6]
 [ 4 28]] 

_______________________


Article13
Trained on *procedure+facts* part of the cases
Training on 106 + 106 = 212 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.8254716981132075

Classification report:
                precision    recall  f1-score   support

non-violation       0.82      0.84      0.83       106
    violation       0.83      0.81      0.82       106

  avg / total       0.83      0.83      0.83       212


CR: (0.8257326088892847, 0.8254716981132075, 0.8254367419606097, None)

Confusion matrix:
 [[89 17]
 [20 86]] 

_______________________


Article14
Trained on *procedure+facts* part of the cases
Training on 144 + 144 = 288 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Accuracy: 0.7604166666666666

Classification report:
                precision    recall  f1-score   support

non-violation       0.77      0.75      0.76       144
    violation       0.76      0.77      0.76       144

  avg / total       0.76      0.76      0.76       288


CR: (0.7605297438124186, 0.7604166666666667, 0.7603906673901248, None)

Confusion matrix:
 [[108  36]
 [ 33 111]] 

_______________________




  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
