# Notebook for Everything

### Things to try:
1. Tune the different classifiers
2. Tweak the pipeline
3. Tweak the TF-IDF settings
4. Try an alternative to TF-IDF, e.g. the Legal-BERT tokenizer?
5. After splitting the cases into sections, preprocess the text? (For now this is done in the tokenizer.)
6. Handle class imbalance
7. Use the holdout data (test20)

In [1]:
#all necessary imports
#basics
import glob
import re
import os
import sys
import random
from random import shuffle

#text stuff
from nltk.corpus import stopwords

#sklearn stuff
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

#models
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings('ignore')

In [2]:
from transformers import AutoTokenizer, AutoModel

# Tokenizer for the "nlpaueb/legal-bert-base-uncased" checkpoint.
# NOTE(review): no other cell visible here references `tokenizer` -- presumably
# kept for the planned alternative to TF-IDF; confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

In [3]:
# Helper used by extract_parts: pulls the text between two section headings out of each case file.
def extract_text(starts, ends, cases, violation, encoding="mbcs"):
    """Extract one section of text from each case file.

    Each file is first scanned for a line beginning with a date of the form
    ``<1-2 digits> <word> <4 digits>``; the four-digit group is taken as the
    year. Files in which no such line is found (or whose year is 0) are
    skipped. Scanning for the section then continues from the line *after*
    the date line.

    Parameters
    ----------
    starts : str
        Regular expression; the first line matching it opens the section
        (that line is included in the extracted text).
    ends : str
        Regular expression; the first line matching it stops the scan (that
        line is not included). NOTE: a match stops the scan even if ``starts``
        has not been seen yet, in which case the extracted text is empty.
    cases : iterable of str
        Paths of the case files to read.
    violation : object
        Label stored alongside every extracted text.
    encoding : str, optional
        Codec used to open the files. Defaults to ``"mbcs"``, the value that
        was previously hard-coded; that codec only exists on Windows, so pass
        e.g. ``"utf-8"`` or ``"cp1252"`` on other platforms.

    Returns
    -------
    list of tuple
        One ``(text, violation, year)`` tuple per file that had a date line.
        Every extracted line is followed by an additional ``'\\n'``.
    """
    # Raw string: '\s' and '\w' are invalid escapes in a normal string literal.
    date_pattern = re.compile(r'^([0-9]{1,2}\s\w+\s([0-9]{4}))')

    D = []
    for case in cases:
        year = 0
        with open(case, 'r', encoding=encoding) as f:
            # Pass 1: consume lines up to and including the date line.
            for line in f:
                dat = date_pattern.search(line)
                if dat is not None:
                    year = int(dat.group(2))
                    break
            if year <= 0:
                continue

            # Pass 2: resume on the same file iterator, right after the date.
            pieces = []
            writing = False
            for line in f:
                if not writing and re.search(starts, line) is not None:
                    writing = True
                if re.search(ends, line) is not None:
                    break
                if writing:
                    pieces.append(line)
                    pieces.append('\n')
        D.append((''.join(pieces), violation, year))
    return D

In [4]:
def extract_parts(train_path, violation, part): #extract text from different parts
    """Extract the requested part of every case file matching a glob pattern.

    Parameters
    ----------
    train_path : str
        Glob pattern selecting the case files (passed to ``glob.glob``).
    violation : object
        Label attached to every extracted case.
    part : str
        Which part of the judgment to extract. One of ``'relevant_law'``,
        ``'facts'``, ``'circumstances'``, ``'procedure'``,
        ``'procedure+facts'``, ``'facts+circumstances'`` or
        ``'facts+circumstances+procedure'``.

    Returns
    -------
    list of tuple
        Whatever ``extract_text`` returns for the selected section bounds:
        one ``(text, violation, year)`` tuple per usable case file.

    Raises
    ------
    ValueError
        If ``part`` is not one of the supported names (previously this
        silently produced an empty dataset).
    """
    # part name -> (regex opening the section, regex closing it)
    section_bounds = {
        # The relevant-law section ends at whichever heading comes first.
        'relevant_law': ('RELEVANT', 'THE LAW|PROCEEDINGS'),
        'facts': ('THE FACTS', 'THE LAW'),
        'circumstances': ('CIRCUMSTANCES', 'RELEVANT'),
        'procedure': ('PROCEDURE', 'THE FACTS'),
        'procedure+facts': ('PROCEDURE', 'THE LAW'),
        'facts+circumstances': ('THE FACTS', 'RELEVANT'),
        'facts+circumstances+procedure': ('PROCEDURE', 'THE LAW'),
    }
    if part not in section_bounds:
        raise ValueError(
            'Unknown part %r; expected one of: %s'
            % (part, ', '.join(sorted(section_bounds)))
        )

    starts, ends = section_bounds[part]
    cases = glob.glob(train_path)
    return extract_text(starts, ends, cases, violation)

In [5]:
# Classifiers that train_model_cross_val fits one after another.
models = [
    RandomForestClassifier(),
    LinearSVC(),
    XGBClassifier(),
]

# Parameter grid handed to HalvingGridSearchCV; currently empty.
params = dict()

In [15]:
def train_model_cross_val(Xtrain, Ytrain, Xtest, Ytest, vec, models, cv=3): # model cross-validation and evaluation
    """Fit each model with a halving grid search and report its performance.

    For every estimator in ``models`` a pipeline of ``FeatureUnion([vec])``
    followed by the estimator is tuned with ``HalvingGridSearchCV`` over the
    module-level ``params`` grid (scored by ROC AUC), then evaluated with
    ``evaluate`` on the training data and on the test data.

    Parameters
    ----------
    Xtrain, Ytrain : sequences
        Training texts and their labels.
    Xtest, Ytest : sequences
        Held-out texts and their labels.
    vec : tuple
        ``(name, transformer)`` pair placed inside the ``FeatureUnion``.
    models : iterable
        Estimators to try, in order.
    cv : int, optional
        Number of cross-validation folds for the search. Defaults to 3, the
        value that was previously hard-coded.

    Returns
    -------
    None
        Results are printed only.
    """
    # The banner used to say "10-fold" although the search ran with 3 folds.
    print('***' + str(cv) + '-fold cross-validation***')
    for model in models:
        # Label the output so the per-model reports can be told apart.
        print('Model:', type(model).__name__)
        pipeline = Pipeline([
            ('features', FeatureUnion([vec])),
            ('classifier', model)
        ])

        #cross validate and find the best model
        best_model = HalvingGridSearchCV(pipeline, params, scoring='roc_auc', cv=cv, verbose=3)
        best_model.fit(Xtrain, Ytrain)

        #in sample predictions
        print('***IN SAMPLE***')
        Ypredict_in = best_model.predict(Xtrain)
        evaluate(Ytrain, Ypredict_in)

        #out of sample
        print('***OUT OF SAMPLE***')
        Ypredict_out = best_model.predict(Xtest)
        evaluate(Ytest, Ypredict_out)

In [7]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
    """Print evaluation metrics for a set of predictions.

    Prints, in order: accuracy, the per-class classification report, the
    macro-averaged precision/recall/F-score/support tuple, and the confusion
    matrix followed by a separator line.

    Parameters
    ----------
    Ytest : sequence
        True labels.
    Ypredict : sequence
        Predicted labels, aligned with ``Ytest``.
    """
    acc = accuracy_score(Ytest, Ypredict)
    print('Accuracy:', acc)

    report = classification_report(Ytest, Ypredict)
    print('\nClassification report:\n', report)

    macro_scores = precision_recall_fscore_support(Ytest, Ypredict, average='macro')
    print('\nCR:', macro_scores)

    matrix = confusion_matrix(Ytest, Ypredict)
    print('\nConfusion matrix:\n', matrix, '\n\n_______________________\n\n')

In [16]:
def run_pipeline(part, vec): #run tests
    """Build the train and test sets for one part of the cases and run the models.

    Relies on the module-level names ``path`` (data directory), ``article``
    (article sub-directory) and ``models`` (estimators to evaluate), which
    must be set before calling.

    Parameters
    ----------
    part : str
        Name of the part of the judgments to extract (see ``extract_parts``).
    vec : tuple
        ``(name, transformer)`` pair forwarded to ``train_model_cross_val``.

    Returns
    -------
    tuple
        ``(v, nv, Xtrain, Ytrain, Xtest, Ytest, trainset, testset)`` where
        ``v`` / ``nv`` are the extracted violation / non-violation training
        cases, ``trainset`` / ``testset`` are the shuffled case lists and the
        ``X*`` / ``Y*`` lists are their texts and labels.
    """
    print('Trained on *' + part + '* part of the cases')

    train_dir = os.path.join(path, 'train', article)
    v = extract_parts(os.path.join(train_dir, 'violation', '*.txt'), 'violation', part)
    nv = extract_parts(os.path.join(train_dir, 'non-violation', '*.txt'), 'non-violation', part)
    trainset = v + nv
    shuffle(trainset)

    Xtrain = [i[0] for i in trainset]
    Ytrain = [i[1] for i in trainset]

    #test set
    # The files under test_violations/ are loaded once and labelled
    # 'violation'. They used to be loaded twice -- once per label -- so every
    # test case appeared with both labels and the out-of-sample metrics were
    # meaningless.
    # NOTE(review): if a separate non-violation test directory exists, add it
    # here with its own path.
    testset = extract_parts(os.path.join(path, 'test_violations', article, '*.txt'), 'violation', part)
    shuffle(testset)
    Xtest = [i[0] for i in testset]
    Ytest = [i[1] for i in testset]

    print('Training on', Ytrain.count('violation'),'+', Ytrain.count('non-violation'), '=', Ytrain.count('violation') + Ytrain.count('non-violation'), 'cases', '\nCases available for testing(violation):', Ytest.count('violation'))
    train_model_cross_val(Xtrain, Ytrain, Xtest, Ytest, vec, models) #use for cross-validation
    return v, nv, Xtrain, Ytrain, Xtest, Ytest, trainset, testset

In [17]:
if __name__ == "__main__":
    # Directory holding the data; run_pipeline reads this module-level name.
    path = 'C:\\Users\\35387\\Dropbox\\Documents\\Forecasting\\Final_Project\\data\\'

    # Other articles that can be listed here:
    # 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14'
    articles = ['Article3']

    # Other parts that can be listed here:
    # 'facts+circumstances', 'facts+circumstances+procedure'
    for part in ['facts']:
        # `article` is read by run_pipeline as a module-level name.
        for article in articles:
            # The vectorizer parameters were determined using grid-search.
            vec = (
                'wordvec',
                TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(3, 4),
                    binary=False,
                    lowercase=True,
                    min_df=2,
                    norm='l2',
                    stop_words=None,
                    use_idf=True,
                ),
            )
            run_pipeline(part, vec)

Trained on *facts* part of the cases
Training on 278 + 279 = 557 cases 
Cases available for testing(violation): 833
***10-fold cross-validation***
***IN SAMPLE***
n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 557
max_resources_: 557
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 557
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END ..............., score=(train=1.000, test=0.827) total time=   7.0s
[CV 2/3] END ..............., score=(train=1.000, test=0.844) total time=   6.9s
[CV 3/3] END ..............., score=(train=1.000, test=0.800) total time=   6.9s
Accuracy: 1.0

Classification report:
                precision    recall  f1-score   support

non-violation       1.00      1.00      1.00       279
    violation       1.00      1.00      1.00       278

     accuracy                           1.00       557
    macro avg       1.00      1.00      1.00       557
 weighted avg       1