# Notebook for Everything

### Things to try:
1. Tune the different classifiers
2. Tweaking the pipeline
3. Tweaking the tfidf
4. Doing something instead of the tfidf? Eg. BERT Legal Tokenizer?
5. After the splits into sections do some preprocessing on the text? (doing it in the tokenizer for now)
6. Class imbalance
7. Use the holdout data (test20)

In [101]:
#all necessary imports
#basics
import glob
import re
import os
import sys
import random
from random import shuffle
import warnings

#text stuff
from nltk.corpus import stopwords

#sklearn stuff
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#models
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

SEED = 42

In [2]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

In [3]:
#this 
def extract_text(starts, ends, cases, violation):
    facts = []
    D = []
    years = []
    for case in cases:
        contline = ''
        year = 0
        with open(case, 'r', encoding="mbcs") as f:
            for line in f:
                dat = re.search('^([0-9]{1,2}\s\w+\s([0-9]{4}))', line)
                if dat != None:
                    year = int(dat.group(2))
                    break
            if year>0:
                years.append(year)
                wr = 0
                for line in f:
                    if wr == 0:
                        if re.search(starts, line) != None:
                            wr = 1
                    if wr == 1 and re.search(ends, line) == None:
                        contline += line
                        contline += '\n'
                    elif re.search(ends, line) != None:
                        break
                facts.append(contline)
    for i in range(len(facts)):
        D.append((facts[i], violation, years[i])) 
    return D

In [97]:
def extract_parts(train_path, violation, part): #extract text from different parts
    cases = glob.glob(train_path)

    facts = []
    D = []
    years = []
    
    if part == 'relevant_law': #seprarte extraction for relevant law
        for case in cases:
            year = 0
            contline = ''
            with open(case, 'r', encoding="mbcs") as f:
                for line in f:
                    dat = re.search('^([0-9]{1,2}\s\w+\s([0-9]{4}))', line)
                    if dat != None:
                        year = int(dat.group(2))
                        break
                if year> 0:
                    years.append(year)
                    wr = 0
                    for line in f:
                        if wr == 0:
                            if re.search('RELEVANT', line) != None:
                                wr = 1
                        if wr == 1 and re.search('THE LAW', line) == None and re.search('PROCEEDINGS', line) == None:
                            contline += line
                            contline += '\n'
                        elif re.search('THE LAW', line) != None or re.search('PROCEEDINGS', line) != None:
                            break
                    facts.append(contline)
        for i in range(len(facts)):
            D.append((facts[i], violation, years[i]))
        
    if part == 'facts':
        starts = 'THE FACTS'
        ends ='THE LAW'
        D = extract_text(starts, ends, cases, violation)
    if part == 'circumstances':
        starts = 'CIRCUMSTANCES'
        ends ='RELEVANT'
        D = extract_text(starts, ends, cases, violation)
    if part == 'procedure':
        starts = 'PROCEDURE'
        ends ='THE FACTS'
        D = extract_text(starts, ends, cases, violation)
    if part == 'procedure+facts':
        starts = 'PROCEDURE'
        ends ='THE LAW'
        D = extract_text(starts, ends, cases, violation)
    if part == 'facts+circumstances':
        starts = 'THE FACTS'
        ends = 'RELEVANT'
        D = extract_text(starts, ends, cases, violation)
    if part == 'facts+circumstances+procedure':
        starts = 'PROCEDURE'
        ends = 'THE LAW'
        D = extract_text(starts, ends, cases, violation)
    return D

In [5]:
svc = LinearSVC()
svc.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])

In [110]:
models = [RandomForestClassifier(class_weight = 'balanced')]

#model specific params
'''
svc_params = {'classifier__class_weight':[None, 'balanced'],
              'classifier__probability':[True],
             # 'classifier__C':[0.5, 1],
              'classifier__kernel':['linear', 'poly', 'rbf'],
              'classifier__random_state': [SEED]}

randomforest_params = {'classifier__max_features': range(4, 10),
                  'classifier__n_estimators': [50, 100],
                  'classifier__max_depth': range(3, 6),
                  'classifier__min_samples_leaf': range(5, 10),
                  'classifier__random_state': [SEED]}

'''

xgb_params = {'classifier__objective':['binary:logistic'],
              'classifier__eval_metric':['auc'],
                  'classifier__learning_rate': [0.05, 1],
                  'classifier__max_depth': range(3, 10),
                  'classifier__lambda':[0.5, 1.5],
              'classifier__use_label_encoder':[False],
                  'classifier__seed': [SEED]}

#params = [svc_params, randomforest_params, 
#param = [xgb_params]
params = {}

In [111]:
def train_model_cross_val(Xtrain, Ytrain, Xtest, Ytest, vec, models): # model cross-validation and evaluation
    print('***10-fold cross-validation***')
    for i, model in enumerate(models):
        pipeline = Pipeline([
            ('features', FeatureUnion([vec],)),
            ('classifier', LinearSVC())
        ])
        
        
        #cross validate and find the best model
        print('***CV IN PROGRESS***')
        best_model = HalvingGridSearchCV(pipeline, params, scoring = 'roc_auc', cv = 3, verbose = 3) #Halving
       
        #in sample predictions
        print('***IN SAMPLE***')
        best_model.fit(Xtrain, Ytrain)
        Ypredict_in = best_model.predict(Xtrain)
        print(model)
        evaluate(Ytrain, Ypredict_in)
        
        #out of sample
        print('***OUT OF SAMPLE***')     
        Ypredict_out = best_model.predict(Xtest)
        print(model)
        evaluate(Ytest, Ypredict_out)
        
        return best_model

In [90]:
def evaluate(Ytest, Ypredict): #evaluate the model (accuracy, precision, recall, f-score, confusion matrix)
        print('Accuracy:', accuracy_score(Ytest, Ypredict))
        print('\nClassification report:\n', classification_report(Ytest, Ypredict))
        print('\nCR:', precision_recall_fscore_support(Ytest, Ypredict, average='macro'))
        print('\nConfusion matrix:\n', confusion_matrix(Ytest, Ypredict), '\n\n_______________________\n\n')

In [112]:
def run_pipeline(part, vec): #run tests
    
    print('Trained on *' + part + '* part of the cases')
    
    v = extract_parts(path+'train/'+article+'/violation/*.txt', 'violation', part)
    nv = extract_parts(path+'train/'+article+'/non-violation/*.txt', 'non-violation', part)
    trainset =v+nv
    shuffle(trainset)

    Xtrain = [i[0] for i in trainset]
    Ytrain = [i[1] for i in trainset]
    
    #test set
    test_nv = extract_parts(path + '/test_violations/'+article+'/*.txt', 'non-violation', part)
    test_v = extract_parts(path + '/test_violations/'+article+'/*.txt', 'violation', part)
    
    testset = test_nv+test_v
    shuffle(testset)
    Xtest = [i[0] for i in testset]
    Ytest = [i[1] for i in testset]
    
    print(article)
    print('Training on', Ytrain.count('violation'), 'violations', '+', Ytrain.count('non-violation'), 'non-violations', '=', Ytrain.count('violation') + Ytrain.count('non-violation'), 'cases')
    print('Testing on', Ytest.count('violation'), 'violations','+', Ytest.count('non-violation'), 'non-violations', '=', Ytest.count('violation') + Ytest.count('non-violation'), 'cases')
       
    best_model = train_model_cross_val(Xtrain, Ytrain, Xtest, Ytest, vec, models) 
    
    return v, nv, Xtrain, Ytrain, Xtest, Ytest, trainset, testset, best_model

In [113]:
def run_pipeline(part, vec): #run tests
    
    print('Trained on *' + part + '* part of the cases')
    
    v = extract_parts(path+'train/'+article+'/violation/*.txt', 'violation', part)
    nv = extract_parts(path+'train/'+article+'/non-violation/*.txt', 'non-violation', part)
    test_nv = extract_parts(path + '/test_violations/'+article+'/*.txt', 'non-violation', part)
    test_v = extract_parts(path + '/test_violations/'+article+'/*.txt', 'violation', part)
    data =v+nv+test_nv+test_v
    shuffle(data)

    X = [i[0] for i in data]
    y = [i[1] for i in data]
    
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=SEED)
    
    print(article)
    print('Training on', Ytrain.count('violation'), 'violations', '+', Ytrain.count('non-violation'), 'non-violations', '=', Ytrain.count('violation') + Ytrain.count('non-violation'), 'cases')
    print('Testing on', Ytest.count('violation'), 'violations','+', Ytest.count('non-violation'), 'non-violations', '=', Ytest.count('violation') + Ytest.count('non-violation'), 'cases')
       
    best_model = train_model_cross_val(Xtrain, Ytrain, Xtest, Ytest, vec, models) 
    
    return v, nv, Xtrain, Ytrain, Xtest, Ytest, trainset, testset, best_model

In [None]:
#'''
warnings.filterwarnings('ignore')
if __name__ == "__main__":
    #Path to the data
    path = 'C:\\Users\\35387\\Dropbox\\Documents\\Forecasting\\Final_Project\\data\\'
    
    articles = ['Article2'] #'Article3', 'Article4' 'Article5', 'Article6', 'Article8', 'Article10', 'Article11', 'Article13', 'Article14']
    for part in ['facts+circumstances+procedure']: # 'facts' 'facts+circumstances+procedure']:
        for article in articles: #the parameters were determined using grid-search
                vec = ('wordvec', TfidfVectorizer(analyzer = 'word', ngram_range = (3,4), binary = False, lowercase = True, min_df = 2, norm = 'l2', stop_words = None, use_idf = True))
                v, nv, Xtrain, Ytrain, Xtest, Ytest, trainset, testset, best_model = run_pipeline(part, vec) 
                
#'''

Trained on *facts+circumstances+procedure* part of the cases
Article2
Training on 308 violations + 324 non-violations = 632 cases
Testing on 143 violations + 128 non-violations = 271 cases
***10-fold cross-validation***
***CV IN PROGRESS***
***IN SAMPLE***
n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 632
max_resources_: 632
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 632
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END ..............., score=(train=0.922, test=0.208) total time=   9.0s
[CV 2/3] END ..............., score=(train=0.925, test=0.193) total time=   8.7s
[CV 3/3] END ..............., score=(train=0.926, test=0.178) total time=  11.2s


In [12]:
print(len(v))
print(len(nv))
print(len(Xtrain))
print(len(Ytrain))
print(len(Xtest))
print(len(Ytest)) 
print(len(trainset))
print(len(testset))

4
4
8
8
6
6
8
6
