In [2]:
import spacy
import fasttext
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier

from gensim.models import KeyedVectors



In [13]:
def embed_sentences(embed, sentences):
    embeddings = embed(sentences)

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        sentence_embedings = session.run(embeddings)

    return sentence_embedings

def get_best_estimator(algorithm, param_grid, x_train, y_train):
    grid_search = GridSearchCV(estimator=algorithm, param_grid=param_grid, n_jobs=-1, verbose=2)
    grid_search.fit(x_train, y_train)
    print(grid_search.best_params_)
    
    return grid_search.best_estimator_, grid_search

def get_classification_report(model, x_test, y_test):
    predictions = model.predict(x_test)
    
    print(classification_report(y_test, predictions))
    
def get_vectors(vectorizer, text):
    documents = vectorizer.pipe(text)
    
    return np.asarray([doc.vector for doc in documents])

def process_text(text):
    text = re.sub('(\.|,|\?|!|-|;|\*|"|:|—|\(|\)|%|#|\$|&|_|\/|@)', '', text)
    text = re.sub('\d', '', text)
    
    return text

def process_text_ft(text):
    text = re.sub('(\.|,|\?|!|-|;|\*|"|:|—|\(|\)|%|#|\$|&|_|\/|@)', '', text)
    text = re.sub('\d', '', text)
    
    return text.lower()

In [14]:
df = pd.read_csv('../data/obligation_extraction_df.csv')

df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)

In [9]:
from gensim.models import FastText

ft = FastText.load('../models/legal_model.cc')

In [10]:
x_train_fasttext = np.asarray([ft.wv[sent] for sent in df_train.sentence])
x_test_fasttext = np.asarray([ft.wv[sent] for sent in df_test.sentence])

In [11]:
# Models trained on the fasttext embeddings

svc_fasttext = SVC()
xgb_fasttext = XGBClassifier()

In [15]:
# param_grid_svc = {
#     'C': [0.001, 0.01, 0.1, 1, 10],
#     'gamma' : [0.001, 0.01, 0.1, 1],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [1, 2, 3, 4, 5]
# }

param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_fasttext, svc_glove_grid = get_best_estimator(
    svc_fasttext, 
    param_grid_svc, 
    list(x_train_fasttext), 
    df_train.is_obligation.values
)

get_classification_report(svc_fasttext, list(x_test_fasttext), df_test.is_obligation.values)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'C': 0.1, 'degree': 3, 'gamma': 1, 'kernel': 'poly'}
              precision    recall  f1-score   support

       False       0.79      0.74      0.76      1411
        True       0.78      0.82      0.80      1578

    accuracy                           0.78      2989
   macro avg       0.78      0.78      0.78      2989
weighted avg       0.78      0.78      0.78      2989



In [16]:

param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_fasttext, param_grid_xgb = get_best_estimator(
    xgb_fasttext, 
    param_grid_xgb, 
    x_train_fasttext, 
    df_train.is_obligation.values
)

get_classification_report(
    xgb_fasttext, 
    x_test_fasttext, 
    df_test.is_obligation.values
)

Fitting 5 folds for each of 30 candidates, totalling 150 fits




Parameters: { "clf__max_depth", "clf__n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'clf__max_depth': 2, 'clf__n_estimators': 50}
              precision    recall  f1-score   support

       False       0.77      0.72      0.74      1411
        True       0.76      0.81      0.79      1578

    accuracy                           0.77      2989
   macro avg       0.77      0.76      0.76      2989
weighted avg       0.77      0.77      0.77      2989



In [18]:
# FastText preprocessing

sentences_train = df_train.sentence.apply(lambda x: process_text_ft(x))
sentences_test = df_test.sentence.apply(lambda x: process_text_ft(x))

x_train_fasttext = np.asarray([ft.wv[sent] for sent in df_train.sentence])
x_test_fasttext = np.asarray([ft.wv[sent] for sent in df_test.sentence])

del sentences_train, sentences_test

In [19]:
# param_grid_svc = {
#     'C': [0.001, 0.01, 0.1, 1, 10],
#     'gamma' : [0.001, 0.01, 0.1, 1],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [1, 2, 3, 4, 5]
# }

param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_fasttext, svc_glove_grid = get_best_estimator(
    svc_fasttext, 
    param_grid_svc, 
    list(x_train_fasttext), 
    df_train.is_obligation.values
)

get_classification_report(svc_fasttext, list(x_test_fasttext), df_test.is_obligation.values)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'C': 0.1, 'degree': 3, 'gamma': 1, 'kernel': 'poly'}
              precision    recall  f1-score   support

       False       0.79      0.74      0.76      1411
        True       0.78      0.82      0.80      1578

    accuracy                           0.78      2989
   macro avg       0.78      0.78      0.78      2989
weighted avg       0.78      0.78      0.78      2989



In [None]:

param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_fasttext, param_grid_xgb = get_best_estimator(
    xgb_fasttext, 
    param_grid_xgb, 
    x_train_fasttext, 
    df_train.is_obligation.values
)

get_classification_report(
    xgb_fasttext, 
    x_test_fasttext, 
    df_test.is_obligation.values
)