In [1]:
# load libraries
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import glob,re, os, sys, random
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from nltk.corpus import stopwords
from random import shuffle
import os
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from mlxtend.preprocessing import DenseTransformer
import xgboost as xgb
import pickle

  from pandas import MultiIndex, Int64Index


In [2]:
def extract_text(starts, ends, cases, violation):
    """Extract one section of text from every case file in ``cases``.

    Each file is scanned for the first line that begins with a date of the
    form ``<1-2 digits> <word> <4 digits>``; the four-digit number is used as
    the year. Files without such a line (or whose year is 0) are skipped.
    Reading then continues on the line after the date line: lines are kept
    from the first line matching ``starts`` (inclusive) and the scan stops at
    the first line matching ``ends`` (exclusive).

    An ``ends`` match stops the scan even if ``starts`` has not been seen yet;
    the extracted text is then the empty string.

    Args:
        starts: regular expression marking the first line to keep.
        ends: regular expression marking the line at which to stop.
        cases: iterable of file paths; files are read as UTF-8.
        violation: label stored unchanged as the second tuple element.

    Returns:
        A list of ``(text, violation, year)`` tuples, one per file that had a
        date line. Every kept line is followed by one extra newline character.
    """
    # Raw string: '\s' and '\w' are invalid escape sequences in a plain literal.
    date_re = re.compile(r'^([0-9]{1,2}\s\w+\s([0-9]{4}))')
    start_re = re.compile(starts)
    end_re = re.compile(ends)

    D = []
    for case in cases:
        with open(case, 'r', encoding='utf-8') as f:  # or mbcs CP1252 ISO-8859-1
            year = 0
            for line in f:
                dat = date_re.search(line)
                if dat is not None:
                    year = int(dat.group(2))
                    break
            if year <= 0:
                # No usable date line: the file contributes nothing.
                continue

            # The file iterator resumes on the line after the date line.
            pieces = []
            collecting = False
            for line in f:
                if end_re.search(line) is not None:
                    break
                if not collecting and start_re.search(line) is not None:
                    collecting = True
                if collecting:
                    pieces.append(line)
                    pieces.append('\n')
            D.append((''.join(pieces), violation, year))
    return D

def extract_parts(train_path, violation, part):
    """Extract the requested section from every file matching ``train_path``.

    Args:
        train_path: glob pattern selecting the case files.
        violation: label attached to every extracted sample.
        part: name of the section to extract; one of the keys of
            ``section_markers`` below. Any other value yields an empty list.

    Returns:
        The list of ``(text, violation, year)`` tuples produced by
        ``extract_text`` for the start/end markers of ``part``.
    """
    # (start regex, end regex) for each supported part. 'relevant_law' stops at
    # whichever of 'THE LAW' or 'PROCEEDINGS' comes first.
    section_markers = {
        'relevant_law': ('RELEVANT', 'THE LAW|PROCEEDINGS'),
        'facts': ('THE FACTS', 'THE LAW'),
        'circumstances': ('CIRCUMSTANCES', 'RELEVANT'),
        'procedure': ('PROCEDURE', 'THE FACTS'),
        'procedure+facts': ('PROCEDURE', 'THE LAW'),
        'facts+circumstances': ('THE FACTS', 'RELEVANT'),
        'facts+circumstances+procedure': ('PROCEDURE', 'THE LAW'),
    }

    # glob returns matches in arbitrary order; sort so the result is stable.
    cases = sorted(glob.glob(train_path))

    if part not in section_markers:
        # Unknown part names have always produced an empty result.
        return []

    starts, ends = section_markers[part]
    return extract_text(starts, ends, cases, violation)

def run_pipeline(part, article):
    """Collect and shuffle all samples of one article for the given case part.

    Reads the module-level ``path`` as the data root. Three groups of files
    are extracted with ``extract_parts`` and pooled: the article's training
    violations, its training non-violations, and the files under
    ``test_violations`` (labelled 'violation').

    Args:
        part: section name forwarded to ``extract_parts``.
        article: article folder name, e.g. 'Article3'.

    Returns:
        Three parallel lists ``(features, target, years)`` in shuffled order.
    """
    print('Trained on *' + part + '* part of the cases')

    train_root = path + '/train/' + article
    violations = extract_parts(train_root + '/violation/*.txt', 'violation', part)
    non_violations = extract_parts(train_root + '/non-violation/*.txt', 'non-violation', part)
    # Only the violation side of the held-out folder is used here.
    extra_violations = extract_parts(path + '/test_violations/' + article + '/*.txt', 'violation', part)

    samples = violations + non_violations + extra_violations
    shuffle(samples)

    features = []
    target = []
    years = []
    for sample in samples:
        features.append(sample[0])
        target.append(sample[1])
        years.append(sample[2])

    return features, target, years

In [3]:
# Root folder of the case files (machine-specific absolute path).
path = r'G:\Geteilte Ablagen\Now_Forecasting_Final_Project\data'

# Section names understood by extract_parts, addressed by index further down.
part_list = [
    'facts',
    'circumstances',
    'procedure',
    'procedure+facts',
    'facts+circumstances',
    'facts+circumstances+procedure',
]

# Article sub-folders that are pooled into one data set.
articles = [
    'Article2',
    'Article3',
    'Article5',
    'Article6',
    'Article8',
    'Article10',
    'Article11',
    'Article13',
    'Article14',
]
def return_train(type_part):
    """Build one DataFrame with the cleaned text of every configured article.

    Calls ``run_pipeline(type_part, article)`` for each entry of the
    module-level ``articles`` list, pools the results and normalises the text.

    Args:
        type_part: section name forwarded to ``run_pipeline``.

    Returns:
        A DataFrame with the columns ``text`` (cleaned section text), ``year``,
        ``outcome`` (the label strings from ``run_pipeline``) and ``article``.
    """
    X_list = []
    y_list = []
    year_list = []
    article_list = []

    for article in articles:
        texts, outcomes, case_years = run_pipeline(type_part, article)

        # extend keeps the lists flat across articles
        X_list.extend(texts)
        y_list.extend(outcomes)
        year_list.extend(case_years)
        article_list.extend([article] * len(case_years))

    # Patterns are compiled once; raw strings avoid the invalid escape
    # sequences ('\w', '\.') of the plain literals.
    nbsp_re = re.compile("\xa0")
    # NOTE(review): the first alternative also removes the word character that
    # follows a newline, i.e. the first character of a line - confirm intended.
    newline_re = re.compile(r"\n\w|\n")
    spaces_re = re.compile(r' +')
    dots_re = re.compile(r'\.+')

    def clean(text):
        # Same substitution order as before: nbsp, newlines, spaces, dots.
        text = nbsp_re.sub(" ", text)
        text = newline_re.sub("", text)
        text = spaces_re.sub(' ', text)
        return dots_re.sub(".", text)

    cleaned_texts = [clean(item) for item in X_list]

    feature_df = pd.DataFrame({'text': cleaned_texts, 'year': year_list, 'outcome': y_list,
                              'article': article_list})
    return feature_df

In [29]:
# Build the frame for part_list[5] ('facts+circumstances+procedure'), which
# extract_parts maps to the text from 'PROCEDURE' up to 'THE LAW'.
# Original intent: the full text without the relevant-law part.
final_df = return_train(part_list[5])

Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases
Trained on *facts+circumstances+procedure* part of the cases


In [81]:
# part_list[0] == 'facts': text from 'THE FACTS' up to 'THE LAW'.
fact_df = return_train(part_list[0])

Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases
Trained on *facts* part of the cases


In [84]:
# part_list[1] == 'circumstances': text from 'CIRCUMSTANCES' up to 'RELEVANT'.
circumstances_df = return_train(part_list[1])

Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases
Trained on *circumstances* part of the cases


In [86]:
# part_list[2] == 'procedure': text from 'PROCEDURE' up to 'THE FACTS'.
procedure_df = return_train(part_list[2])

Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases
Trained on *procedure* part of the cases


In [32]:
# Make the data folder the working directory; the relative file names used for
# the CSV files are resolved against it.
os.chdir(r'G:\Geteilte Ablagen\Now_Forecasting_Final_Project\data')
# Cache the full-text frame on disk without the row index.
final_df.to_csv('complete_df.csv', index = False)

In [110]:
# Save the per-part frames as well, so they can be reloaded later instead of
# being kept in memory (memory errors were hit otherwise).
fact_df.to_csv('fact_df.csv', index = False)
procedure_df.to_csv('procedure_df.csv', index = False)
circumstances_df.to_csv('circumstances_df.csv', index = False)

In [32]:
# Checkpoint: reload the cached full-text frame from the data folder instead of
# re-extracting it from the raw case files.
os.chdir(r'G:\Geteilte Ablagen\Now_Forecasting_Final_Project\data')
final_df = pd.read_csv('complete_df.csv')

In [4]:
# prepare data for classification 
def data_prepper(df):
    """Split ``df`` into features and binary labels, then into train and test.

    The 'outcome' column becomes the label (1 for 'violation', 0 for anything
    else); all remaining columns are the features. 20% of the rows are held
    out for testing, with a fixed seed of 42.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    label_column = 'outcome'

    def encode(outcome):
        # Binary encoding of the outcome string.
        if outcome == 'violation':
            return 1
        return 0

    features = df.drop(label_column, axis = 1)
    labels = df[label_column].apply(encode)

    parts = train_test_split(features, labels, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = parts
    return features_train, features_test, labels_train, labels_test

In [97]:
# Random 80/20 split of the full-text frame (see data_prepper).
X_train, X_test, y_train, y_test = data_prepper(final_df)

In [5]:
# Under-sampler applied to the training split before fitting; seeded with 42 so
# the selected subset is repeatable.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

In [None]:
# Resample the training split only; the test split is left untouched.
X_res, y_res = rus.fit_resample(X_train, y_train)

In [6]:
## prepare text data
# Classifier: XGBoost with 1000 boosting rounds, all other settings default.
model = xgb.XGBClassifier(n_estimators = 1000)

# Named feature extractor: TF-IDF over word 3- and 4-grams that occur in at
# least two documents.
vec = (
    'wordvec',
    TfidfVectorizer(
        analyzer = 'word',
        ngram_range = (3,4),
        binary = False,
        lowercase = True,
        min_df = 2,
        norm = 'l2',
        stop_words = None,
        use_idf = True,
    ),
)

# The TF-IDF matrix goes straight to the classifier; the DenseTransformer
# ("densifier") step stays disabled.
pipeline = Pipeline([
    ('features', FeatureUnion([vec],)),
    ('classifier', model),
])

In [53]:
# Fit vectorizer and classifier on the 'text' column of the resampled training data.
pipeline.fit(X_res['text'], y_res)



Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('wordvec',
                                                 TfidfVectorizer(min_df=2,
                                                                 ngram_range=(3,
                                                                              4)))])),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=8, num_parallel_tree=1,

In [54]:
# Predict labels for the held-out test texts.
y_hat = pipeline.predict(X_test['text'])

In [100]:
# Per-class report and overall accuracy for the full-text model.
classrep_final = classification_report(y_test, y_hat)
acc_final = accuracy_score(y_test, y_hat)

In [108]:
# Bundle both metrics so they can be stored together.
rs_final = [classrep_final, acc_final]

In [109]:
# Persist the results with pickle, because keeping everything in memory ran
# into memory errors. The context manager closes the file handle, which the
# bare open(...) call never did.
with open("result_final.p", "wb") as result_file:
    pickle.dump(rs_final, result_file)

In [8]:
# Checkpoint: reload the per-part frames from the CSV files written earlier
# (relative to the current working directory) to avoid memory errors.
fact_df = pd.read_csv('fact_df.csv')
circumstances_df = pd.read_csv('circumstances_df.csv')
procedure_df = pd.read_csv('procedure_df.csv')

In [26]:
# Evaluate the same pipeline on each of the other case parts. Results are
# appended in the order facts, circumstances, procedure.
acc_list = []
classrep_list = []
for part_df in [fact_df, circumstances_df, procedure_df]:
    # dropna() returns a new frame, so the loaded frames are not mutated.
    part_clean = part_df.dropna()
    X_train, X_test, y_train, y_test = data_prepper(part_clean)
    X_res, y_res = rus.fit_resample(X_train, y_train)
    # The shared pipeline object is refitted from scratch for every part.
    pipeline.fit(X_res['text'], y_res)
    y_hat = pipeline.predict(X_test['text'])
    classrep_list.append(classification_report(y_test, y_hat))
    acc_list.append(accuracy_score(y_test, y_hat))













In [31]:
# Index 0 is the first frame of the evaluation loop, i.e. the 'facts' part.
print(classrep_list[0])

              precision    recall  f1-score   support

           0       0.30      0.75      0.43       318
           1       0.95      0.72      0.82      1981

    accuracy                           0.73      2299
   macro avg       0.62      0.73      0.62      2299
weighted avg       0.86      0.73      0.77      2299



In [50]:
# Store the per-part reports and accuracies together; the context manager
# closes the file handle once the pickle has been written.
rs_other = [classrep_list, acc_list]
with open("result_other.p", "wb") as result_file:
    pickle.dump(rs_other, result_file)

## Forecasting

Now we will use historical cases up to a certain point in time to predict the outcomes of later cases.

In [58]:
# Rows with missing values were not removed for the random-split experiment
# (no error occurred there); drop them here. Rebinding the name instead of
# using inplace=True keeps the previously loaded object unmodified.
final_df = final_df.dropna()

In [59]:
# Temporal split: cases up to and including 2011 are used for training,
# later cases for testing.
train_df = final_df.loc[final_df['year'] <= 2011]
test_df = final_df.loc[final_df['year'] > 2011]
# results in 75/25 split

In [60]:
def data_prepper(df):
    """Separate ``df`` into a feature frame and a binary label series.

    This definition replaces the earlier ``data_prepper``: it performs no
    train/test split and returns two values instead of four.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the 'outcome' column and ``y``
        is 1 where 'outcome' equals 'violation' and 0 otherwise.
    """
    label_column = 'outcome'

    def encode(outcome):
        # Binary encoding of the outcome string.
        if outcome == 'violation':
            return 1
        return 0

    features = df.drop(label_column, axis = 1)
    labels = df[label_column].apply(encode)
    return features, labels

In [61]:
# Features and labels for the temporal split (two-value data_prepper).
X_train, y_train = data_prepper(train_df)
X_test, y_test = data_prepper(test_df)

In [62]:
# Resample the training data, then refit the pipeline on its 'text' column.
X_res, y_res = rus.fit_resample(X_train, y_train)
pipeline.fit(X_res['text'], y_res)





Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('wordvec',
                                                 TfidfVectorizer(min_df=2,
                                                                 ngram_range=(3,
                                                                              4)))])),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=8, num_parallel_tree=1,

In [63]:
# Predict the cases after the cut-off year.
y_hat = pipeline.predict(X_test['text'])

In [64]:
# Per-class report and overall accuracy for the temporal split.
t_classrep_final = classification_report(y_test, y_hat)
t_acc_final = accuracy_score(y_test, y_hat)

In [65]:
# classification_report returns a pre-formatted string, so print it as-is.
print(t_classrep_final)

              precision    recall  f1-score   support

           0       0.28      0.67      0.39       569
           1       0.88      0.60      0.71      2429

    accuracy                           0.61      2998
   macro avg       0.58      0.63      0.55      2998
weighted avg       0.77      0.61      0.65      2998



In [66]:
# Store the temporal-split metrics; the context manager closes the file
# handle once the pickle has been written.
ts_final = [t_classrep_final, t_acc_final]
with open("time_result_final.p", "wb") as result_file:
    pickle.dump(ts_final, result_file)

## Hyperparameter optimization time

In [None]:
# Not run: a single fit already takes 20+ minutes, so a hyperparameter search is not feasible with the computing time currently available.