In [None]:
#all necessary imports
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import glob,re, os, sys, random
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from nltk.corpus import stopwords
from random import shuffle

In [None]:
def extract_text(starts, ends, cases, violation):
    """Extract one section of text from each case file.

    Each file is scanned in two phases over the same file handle:

    1. Lines are read until one begins with a date of the form
       ``<1-2 digits> <word> <4 digits>``; the 4-digit number is taken as
       the year. Files without such a line (or with year 0) are skipped.
    2. The remaining lines are scanned for the section. Collection begins
       on the first line matching ``starts`` (that line is included) and
       stops at the first line matching ``ends`` (that line is excluded).
       If ``ends`` matches before ``starts`` has been seen, scanning stops
       and the extracted text is the empty string.

    Every collected line is followed by an additional ``'\\n'``, so lines
    that already end in a newline come out separated by blank lines.

    Parameters
    ----------
    starts : str
        Regular expression (used with ``re.search``) marking the first
        line of the section.
    ends : str
        Regular expression (used with ``re.search``) marking the line on
        which the section stops.
    cases : iterable of str
        Paths of the case files to read.
    violation : object
        Label stored unchanged in every returned tuple.

    Returns
    -------
    list of tuple
        One ``(text, violation, year)`` tuple per file in which a date
        line was found, in the order of ``cases``.
    """
    # Raw string: '\s' and '\w' are invalid escape sequences in a plain
    # literal. Compiled once instead of once per line.
    date_pattern = re.compile(r'^([0-9]{1,2}\s\w+\s([0-9]{4}))')

    D = []
    for case in cases:
        year = 0
        with open(case, 'r') as f:
            # Phase 1: find the date line; the file iterator is left
            # positioned on the line after it.
            for line in f:
                dat = date_pattern.search(line)
                if dat is not None:
                    year = int(dat.group(2))
                    break
            if year <= 0:
                continue

            # Phase 2: collect the section from the rest of the file.
            chunks = []
            started = False
            for line in f:
                at_end = re.search(ends, line) is not None
                if not started and re.search(starts, line) is not None:
                    started = True
                if started and not at_end:
                    chunks.append(line)
                    chunks.append('\n')
                elif at_end:
                    # Also reached when the end marker precedes the start
                    # marker: nothing is collected for this file.
                    break
        D.append((''.join(chunks), violation, year))
    return D

In [None]:
def extract_parts(train_path, violation, part): #extract text from different parts
    """Extract the requested part of every case file matching a glob.

    Parameters
    ----------
    train_path : str
        Glob pattern; every matching file is processed.
    violation : object
        Label stored unchanged in every returned tuple.
    part : str
        One of ``'relevant_law'``, ``'facts'``, ``'circumstances'``,
        ``'procedure'`` or ``'procedure+facts'``.

    Returns
    -------
    list of tuple
        ``(text, violation, year)`` tuples as produced by ``extract_text``.
        An unrecognised ``part`` yields an empty list.
    """
    # (start marker, end marker) regexes for each supported part.
    # 'relevant_law' stops at whichever of 'THE LAW' / 'PROCEEDINGS' comes
    # first; the alternation lets it share extract_text with the other
    # parts instead of carrying its own copy of the scanning loop.
    section_markers = {
        'relevant_law': ('RELEVANT', 'THE LAW|PROCEEDINGS'),
        'facts': ('THE FACTS', 'THE LAW'),
        'circumstances': ('CIRCUMSTANCES', 'RELEVANT'),
        'procedure': ('PROCEDURE', 'THE FACTS'),
        'procedure+facts': ('PROCEDURE', 'THE LAW'),
    }

    cases = glob.glob(train_path)

    markers = section_markers.get(part)
    if markers is None:
        # Unknown part: keep the historical behaviour of returning nothing.
        return []

    starts, ends = markers
    return extract_text(starts, ends, cases, violation)

In [None]:
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    """Cross-validate a LinearSVC on the training data and print the scores.

    Parameters
    ----------
    Xtrain : sequence
        Training samples fed to the feature extractor.
    Ytrain : sequence
        Labels, one per training sample.
    vec : tuple
        ``(name, transformer)`` pair used as the single member of the
        ``FeatureUnion``.
    c : float
        Value passed to ``LinearSVC`` as ``C``.

    Returns
    -------
    None
        Results are reported through ``evaluate``.
    """
    print('***10-fold cross-validation***')

    feature_step = ('features', FeatureUnion([vec]))
    classifier_step = ('classifier', LinearSVC(C=c))
    model = Pipeline([feature_step, classifier_step])

    # Predictions for every training sample, obtained with cv=10.
    predicted = cross_val_predict(model, Xtrain, Ytrain, cv=10)
    evaluate(Ytrain, predicted)

In [None]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    """Fit a LinearSVC on the training data and score it on a test set.

    Parameters
    ----------
    Xtrain, Ytrain : sequence
        Training samples and their labels.
    Xtest_v, Ytest_v : sequence
        Test samples and their labels.
    vec : tuple
        ``(name, transformer)`` pair used as the single member of the
        ``FeatureUnion``.
    c : float
        Value passed to ``LinearSVC`` as ``C``.

    Returns
    -------
    None
        Results are reported through ``evaluate``.
    """
    feature_step = ('features', FeatureUnion([vec]))
    classifier_step = ('classifier', LinearSVC(C=c))
    model = Pipeline([feature_step, classifier_step])

    model.fit(Xtrain, Ytrain)

    print('***testing on violation testset***')
    predicted = model.predict(Xtest_v)
    evaluate(Ytest_v, predicted)

In [None]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
    """Print accuracy, per-class report, macro P/R/F and the confusion matrix.

    Parameters
    ----------
    Ytest : sequence
        Reference labels.
    Ypredict : sequence
        Predicted labels, aligned with ``Ytest``.
    """
    # Each metric is computed immediately before it is printed, so the
    # output produced before any failure is the same as it always was.
    accuracy = accuracy_score(Ytest, Ypredict)
    print('Accuracy:', accuracy)

    report = classification_report(Ytest, Ypredict)
    print('\nClassification report:\n', report)

    macro_scores = precision_recall_fscore_support(Ytest, Ypredict, average='macro')
    print('\nCR:', macro_scores)

    matrix = confusion_matrix(Ytest, Ypredict)
    print('\nConfusion matrix:\n', matrix, '\n\n_______________________\n\n')

In [None]:
def run_pipeline(part, vec, c, seed=None): #run tests
    """Build the train/test sets for the current article and cross-validate.

    Reads two module-level globals that must be set before calling:
    ``path`` (root of the data directory) and ``article`` (e.g.
    ``'Article3'``).

    Parameters
    ----------
    part : str
        Part of the case text to train on; forwarded to ``extract_parts``.
    vec : tuple
        ``(name, transformer)`` feature extractor passed on to the model.
    c : float
        ``C`` value for the classifier.
    seed : int or None, optional
        When ``None`` (default) the training set is shuffled with the
        global ``random`` state, exactly as before. When given, the
        training set is first sorted and then shuffled with a dedicated
        ``random.Random(seed)``, so the sample order -- and with it the
        cross-validation folds -- is the same on every run.
    """
    print('Trained on *' + part + '* part of the cases')

    train_root = path + 'train/' + article
    v = extract_parts(train_root + '/violation/*.txt', 'violation', part)
    nv = extract_parts(train_root + '/non-violation/*.txt', 'non-violation', part)
    trainset = v + nv

    if seed is None:
        shuffle(trainset)
    else:
        # glob order is filesystem-dependent; sort first so that the same
        # seed always produces the same permutation.
        trainset.sort()
        random.Random(seed).shuffle(trainset)

    Xtrain = [sample[0] for sample in trainset]
    Ytrain = [sample[1] for sample in trainset]

    #test set with violations only
    # Article14's test files are labelled 'non-violation'; all other
    # articles use 'violation'.
    test_label = 'non-violation' if article == 'Article14' else 'violation'
    test = extract_parts('./test_violations/' + article + '/*.txt', test_label, part)
    Xtest_v = [sample[0] for sample in test]
    Ytest_v = [sample[1] for sample in test]

    n_violation = Ytrain.count('violation')
    n_non_violation = Ytrain.count('non-violation')
    print('Training on', n_violation, '+', n_non_violation, '=', n_violation + n_non_violation, 'cases', '\nCases available for testing(violation):', Ytest_v.count('violation'))
    #train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c)
    train_model_cross_val(Xtrain, Ytrain, vec, c) #use for cross-validation

In [None]:
if __name__ == "__main__":
    ##INDICATE THE PATH TO THE DATA
    #path = '~/Documents/ECtHR_crystal_ball/'
    path = '../../crystal_ball_data/'
    articles = ['Article2', 'Article3', 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']

    # Per-article configuration (the parameters were determined using grid-search):
    # article -> (part of the case to train on, C for LinearSVC, TfidfVectorizer options)
    article_settings = {
        'Article2': ('procedure+facts', 0.1,
                     dict(ngram_range=(3, 4), binary=False, lowercase=True, min_df=2, norm='l2', stop_words=None, use_idf=True)),
        'Article3': ('facts', 0.1,
                     dict(binary=True, lowercase=True, min_df=1, ngram_range=(1, 1), norm=None, stop_words=None, use_idf=True)),
        'Article5': ('facts', 1,
                     dict(binary=True, lowercase=True, min_df=3, ngram_range=(1, 1), norm='l2', stop_words=None, use_idf=True)),
        'Article6': ('procedure+facts', 5,
                     dict(binary=True, lowercase=True, min_df=2, ngram_range=(2, 4), norm='l2', stop_words=None, use_idf=True)),
        'Article8': ('facts', 1,
                     dict(binary=True, lowercase=True, min_df=1, ngram_range=(3, 3), norm='l2', stop_words=None, use_idf=False)),
        'Article10': ('procedure+facts', 5,
                      dict(binary=False, lowercase=False, min_df=1, ngram_range=(1, 1), norm='l2', stop_words=None, use_idf=False)),
        'Article11': ('procedure', 1,
                      dict(binary=False, lowercase=True, min_df=2, ngram_range=(1, 1), norm='l1', stop_words='english', use_idf=False)),
        'Article13': ('procedure+facts', 5,
                      dict(binary=False, lowercase=True, min_df=1, ngram_range=(1, 2), norm='l2', stop_words=None, use_idf=True)),
        'Article14': ('procedure+facts', 5,
                      dict(binary=True, lowercase=True, min_df=3, ngram_range=(1, 1), norm='l2', stop_words='english', use_idf=True)),
    }

    # `article` and `path` stay module-level names because run_pipeline
    # reads them as globals.
    for article in articles:
        print(article)
        section, c, tfidf_kwargs = article_settings[article]
        vec = ('wordvec', TfidfVectorizer(analyzer='word', **tfidf_kwargs))
        run_pipeline(section, vec, c)