# Notebook for Everything

### Things to try:
1. Different classifiers 
2. Tweaking the pipeline
3. Tweaking the tfidf
4. Using something other than the tfidf, e.g. the Legal-BERT tokenizer?
5. Preprocessing the text after it has been split into sections? (Currently done in the tokenizer.)
6. Class imbalance
7. Use the holdout data (test20)

In [1]:
#all necessary imports
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import glob,re, os, sys, random
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from nltk.corpus import stopwords
from random import shuffle
import os
from xgboost import XGBClassifier

In [32]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer published under the "nlpaueb/legal-bert-base-uncased"
# checkpoint name.
# NOTE(review): `tokenizer` is not referenced by any of the cells visible
# here; presumably groundwork for the "BERT Legal Tokenizer" idea - confirm.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

In [2]:
# Section extractor shared by the `part` options of extract_parts.
def extract_text(starts, ends, cases, violation, encoding=None):
    """Extract one section of text from each case file.

    Each file is scanned for the first line that begins with a date of the
    form ``<1-2 digits> <word> <4 digits>``; the four-digit group is taken as
    the year of the case. Files without such a line (or whose year is 0) are
    skipped entirely. Scanning then continues from the line after the date:
    collection starts on the first line matching ``starts`` (that line is
    included) and stops at the first line matching ``ends`` (that line is
    excluded). If ``ends`` matches before ``starts`` was ever seen, the scan
    stops immediately and the extracted text is the empty string.

    Args:
        starts: Regular expression (``re.search``) marking the first line of
            the section.
        ends: Regular expression (``re.search``) marking the line at which
            extraction stops.
        cases: Iterable of file paths to read.
        violation: Label stored unchanged in every returned tuple.
        encoding: Text encoding used to open the files. ``None`` (default)
            keeps the historical "mbcs" codec on Windows and uses UTF-8 on
            other platforms, where "mbcs" does not exist.

    Returns:
        A list of ``(text, violation, year)`` tuples, one per file that had a
        date line. Every collected line is followed by an extra ``'\\n'`` in
        addition to its own line ending.
    """
    if encoding is None:
        # "mbcs" is only registered on Windows; hard-coding it made the
        # function raise LookupError everywhere else.
        encoding = "mbcs" if sys.platform == "win32" else "utf-8"

    # Raw string: '\s' / '\w' are invalid escapes in a normal string literal.
    date_pattern = re.compile(r'^([0-9]{1,2}\s\w+\s([0-9]{4}))')

    D = []
    for case in cases:
        year = 0
        with open(case, 'r', encoding=encoding) as f:
            # Pass 1: consume lines up to and including the date line.
            for line in f:
                dat = date_pattern.search(line)
                if dat is not None:
                    year = int(dat.group(2))
                    break
            if year <= 0:
                continue

            # Pass 2: the file iterator resumes right after the date line.
            collected = []
            writing = False
            for line in f:
                if not writing and re.search(starts, line) is not None:
                    writing = True
                if re.search(ends, line) is not None:
                    # The end marker stops the scan even before `starts`
                    # has been seen.
                    break
                if writing:
                    collected.append(line)
                    collected.append('\n')
            D.append((''.join(collected), violation, year))
    return D

In [3]:
def extract_parts(train_path, violation, part): #extract text from different parts
    """Extract the requested part of every case file matching a glob pattern.

    Args:
        train_path: Glob pattern (passed to ``glob.glob``) selecting the case
            files.
        violation: Label attached to every extracted sample.
        part: Name of the part to extract. Supported values and their
            (start marker, end marker) pairs are listed in ``section_markers``
            below.

    Returns:
        The list of ``(text, violation, year)`` tuples produced by
        ``extract_text`` for the selected markers, or an empty list when
        ``part`` is not one of the supported names.
    """
    # part name -> (regex that starts the section, regex that ends it).
    # 'relevant_law' ends at whichever of 'THE LAW' / 'PROCEEDINGS' comes
    # first; expressing that as an alternation lets it share extract_text
    # instead of carrying its own copy of the scanning loop.
    section_markers = {
        'relevant_law': ('RELEVANT', 'THE LAW|PROCEEDINGS'),
        'facts': ('THE FACTS', 'THE LAW'),
        'circumstances': ('CIRCUMSTANCES', 'RELEVANT'),
        'procedure': ('PROCEDURE', 'THE FACTS'),
        'procedure+facts': ('PROCEDURE', 'THE LAW'),
        'facts+circumstances': ('THE FACTS', 'RELEVANT'),
        'facts+circumstances+procedure': ('PROCEDURE', 'THE LAW'),
    }

    cases = glob.glob(train_path)

    if part not in section_markers:
        # Unknown part names have always yielded an empty result.
        return []

    starts, ends = section_markers[part]
    return extract_text(starts, ends, cases, violation)

In [4]:
def train_model_cross_val(Xtrain, Ytrain, vec, c): #Linear SVC model cross-validation
    """Cross-validate a feature-union + LinearSVC pipeline and print the scores.

    Args:
        Xtrain: Training samples handed to the pipeline.
        Ytrain: Labels aligned with ``Xtrain``.
        vec: ``(name, transformer)`` pair used as the only member of the
            ``FeatureUnion``.
        c: Value passed as ``C`` to ``LinearSVC``.

    Returns:
        None. Results are printed by ``evaluate``.
    """
    print('***10-fold cross-validation***')
    feature_step = ('features', FeatureUnion([vec]))
    classifier_step = ('classifier', LinearSVC(C=c))
    model = Pipeline([feature_step, classifier_step])
    # 10 folds; the predictions are then scored against the training labels.
    predictions = cross_val_predict(model, Xtrain, Ytrain, cv=10)
    evaluate(Ytrain, predictions)

In [33]:
def train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c): #test on 'violations' test set
    """Fit a feature-union + XGBClassifier pipeline and score it on a test set.

    Args:
        Xtrain: Training samples.
        Ytrain: Labels aligned with ``Xtrain``.
        Xtest_v: Test samples to predict.
        Ytest_v: Labels aligned with ``Xtest_v``.
        vec: ``(name, transformer)`` pair used as the only member of the
            ``FeatureUnion``.
        c: Not used by this function; the classifier is built with its
            default arguments.

    Returns:
        None. Results are printed by ``evaluate``.
    """
    # NOTE(review): some xgboost releases may reject non-numeric class
    # labels - confirm against the installed version before enabling this.
    steps = [
        ('features', FeatureUnion([vec])),
        ('classifier', XGBClassifier()),
    ]
    model = Pipeline(steps)
    model.fit(Xtrain, Ytrain)
    print('***testing on violation testset***')
    predictions = model.predict(Xtest_v)
    evaluate(Ytest_v, predictions)

In [6]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
    """Print accuracy, classification report, macro P/R/F and confusion matrix.

    Args:
        Ytest: Reference labels.
        Ypredict: Predicted labels, aligned with ``Ytest``.

    Returns:
        None. Everything is written to standard output.
    """
    # Each metric is computed immediately before it is printed, so earlier
    # lines still appear if a later metric raises.
    accuracy = accuracy_score(Ytest, Ypredict)
    print('Accuracy:', accuracy)

    report = classification_report(Ytest, Ypredict)
    print('\nClassification report:\n', report)

    macro_scores = precision_recall_fscore_support(Ytest, Ypredict, average='macro')
    print('\nCR:', macro_scores)

    matrix = confusion_matrix(Ytest, Ypredict)
    print('\nConfusion matrix:\n', matrix, '\n\n_______________________\n\n')

In [34]:
def run_pipeline(part, vec, c): #run tests
    """Build the train/test sets for one part of the cases and cross-validate.

    Reads the module-level names ``path`` (data root) and ``article`` (article
    folder name); both must be defined before calling.

    Args:
        part: Part name forwarded to ``extract_parts``.
        vec: ``(name, transformer)`` pair forwarded to the training function.
        c: Regularisation value forwarded to the training function.

    Returns:
        The tuple ``(v, nv, Xtrain, Ytrain, Xtest_v, Ytest_v, trainset)``:
        the violation samples, the non-violation samples, the shuffled
        training texts and labels, the test texts and labels, and the
        shuffled list of training tuples.
    """
    print('Trained on *' + part + '* part of the cases')

    train_root = path + 'train/' + article
    v = extract_parts(train_root + '/violation/*.txt', 'violation', part)
    nv = extract_parts(train_root + '/non-violation/*.txt', 'non-violation', part)
    trainset = v + nv
    shuffle(trainset)

    Xtrain = [sample[0] for sample in trainset]
    Ytrain = [sample[1] for sample in trainset]

    # The test folder is read relative to the working directory; its cases
    # are labelled 'non-violation' for Article14 and 'violation' otherwise.
    test_label = 'non-violation' if article == 'Article14' else 'violation'
    test = extract_parts('./test_violations/' + article + '/*.txt', test_label, part)
    Xtest_v = [sample[0] for sample in test]
    Ytest_v = [sample[1] for sample in test]

    n_violation = Ytrain.count('violation')
    n_non_violation = Ytrain.count('non-violation')
    print('Training on', n_violation, '+', n_non_violation, '=', n_violation + n_non_violation, 'cases', '\nCases available for testing(violation):', Ytest_v.count('violation'))

    # Alternative: train_model_test(Xtrain, Ytrain, Xtest_v, Ytest_v, vec, c)
    train_model_cross_val(Xtrain, Ytrain, vec, c) #use for cross-validation

    # Debug output: sizes of the second item of each collection (this
    # indexing needs at least two items in v, nv and Xtrain).
    print(len(v[1]), len(nv[1]), len(Xtrain[1]), len(Ytrain))
    return v, nv, Xtrain, Ytrain, Xtest_v, Ytest_v, trainset

In [35]:
# Driver cell: runs the pipeline for every (part, article) combination below.
# `path` and the loop variable `article` are read as globals by run_pipeline.
if __name__ == "__main__":
    #Path to the data
    # NOTE(review): absolute, machine-specific Windows path - adjust per machine.
    path = 'C:\\Users\\35387\\Dropbox\\Documents\\Forecasting\\Final_Project\\data\\'
    
    # Only Article2 is active; the other article names are kept in the trailing comment.
    articles = ['Article2'] # 'Article3', 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']
    for part in ['facts']: #, 'facts+circumstances', 'facts+circumstances+procedure']:
        for article in articles: #the parameters were determined using grid-search
            # Named TF-IDF step over word 3- and 4-grams that occur in at least 2 documents.
            vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (3,4), binary = False, lowercase = True, min_df = 2, norm = 'l2', stop_words = None, use_idf = True))
            # Forwarded by run_pipeline to train_model_cross_val, which passes it to LinearSVC as C.
            c = 0.1
            # The results stay bound at module level after the loop finishes.
            v, nv, Xtrain, Ytrain, Xtest_v, Ytest_v, trainset = run_pipeline(part, vec, c) 

Trained on *facts* part of the cases
Training on 56 + 57 = 113 cases 
Cases available for testing(violation): 0
***10-fold cross-validation***
Accuracy: 0.7079646017699115

Classification report:
                precision    recall  f1-score   support

non-violation       0.70      0.74      0.72        57
    violation       0.72      0.68      0.70        56

     accuracy                           0.71       113
    macro avg       0.71      0.71      0.71       113
 weighted avg       0.71      0.71      0.71       113


CR: (0.7084905660377359, 0.7077067669172932, 0.7075982121853681, None)

Confusion matrix:
 [[42 15]
 [18 38]] 

_______________________


3 3 34059 113
