In [4]:
import spacy
import fasttext
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier

from gensim.models import KeyedVectors

ModuleNotFoundError: No module named 'gensim'

In [None]:
def embed_sentences(embed, sentences):
    embeddings = embed(sentences)

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        sentence_embedings = session.run(embeddings)

    return sentence_embedings

def get_best_estimator(algorithm, param_grid, x_train, y_train):
    grid_search = GridSearchCV(estimator=algorithm, param_grid=param_grid, n_jobs=-1, verbose=2)
    grid_search.fit(x_train, y_train)
    print(grid_search.best_params_)
    
    return grid_search.best_estimator_, grid_search

def get_classification_report(model, x_test, y_test):
    predictions = model.predict(x_test)
    
    print(classification_report(y_test, predictions))
    
def get_vectors(vectorizer, text):
    documents = vectorizer.pipe(text)
    
    return np.asarray([doc.vector for doc in documents])

def process_text(text):
    text = re.sub('(\.|,|\?|!|-|;|\*|"|:|—|\(|\)|%|#|\$|&|_|\/|@)', '', text)
    text = re.sub('\d', '', text)
    
    return text

def process_text_ft(text):
    text = re.sub('(\.|,|\?|!|-|;|\*|"|:|—|\(|\)|%|#|\$|&|_|\/|@)', '', text)
    text = re.sub('\d', '', text)
    
    return text.lower()

In [5]:
df = pd.read_csv('../data/obligation_extraction_df.csv')

In [138]:
!python3 -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
[K     |████████████████████████████████| 826.9MB 2.4MB/s eta 0:00:01   |▏                               | 3.1MB 1.5MB/s eta 0:09:27     |█                               | 25.9MB 2.4MB/s eta 0:05:37     |█████▌                          | 141.8MB 3.1MB/s eta 0:03:41     |██████████▌                     | 270.4MB 579kB/s eta 0:16:01     |███████████                     | 282.7MB 2.6MB/s eta 0:03:33     |████████████████▎               | 421.0MB 1.8MB/s eta 0:03:46     |█████████████████▋              | 454.2MB 2.3MB/s eta 0:02:44     |█████████████████████████▍      | 656.1MB 2.9MB/s eta 0:01:00
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.1.0-cp36-none-any.whl size=82825507

In [6]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=42)



In [5]:
nlp = spacy.load('en_core_web_lg')

In [62]:
# Models using TfIdf vectorization

nb_tfidf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', preprocessor=process_text)),
    ('clf', MultinomialNB())
])

svm_tfidf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', preprocessor=process_text)),
    ('clf', SVC())
])

xgb_tfidf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', preprocessor=process_text)),
    ('clf', XGBClassifier())
])

In [21]:
nb_tfidf_, nb_tfidf_grid = get_best_estimator(nb_tfidf, {}, df_train.sentence, df_train.is_obligation)

get_classification_report(nb_tfidf_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s finished


{}
              precision    recall  f1-score   support

       False       0.80      0.60      0.69      2113
        True       0.71      0.86      0.78      2371

   micro avg       0.74      0.74      0.74      4484
   macro avg       0.75      0.73      0.73      4484
weighted avg       0.75      0.74      0.74      4484



In [65]:
# with preprocessing

nb_tfidf_, nb_tfidf_grid = get_best_estimator(nb_tfidf, {}, df_train.sentence, df_train.is_obligation)

get_classification_report(nb_tfidf_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.6s finished


{}
              precision    recall  f1-score   support

       False       0.80      0.61      0.69      2113
        True       0.71      0.87      0.78      2371

   micro avg       0.75      0.75      0.75      4484
   macro avg       0.76      0.74      0.74      4484
weighted avg       0.76      0.75      0.74      4484



In [23]:
param_grid_svm = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10], 
    'clf__gamma' : [0.001, 0.01, 0.1, 1],
    # 'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__kernel': ['rbf'],
    'clf__degree': [1, 2, 3, 4, 5]
}

svm_tfidf_, svm_tfidf_grid = get_best_estimator(
    svm_tfidf, 
    param_grid_svm, 
    df_train.sentence, 
    df_train.is_obligation
)

get_classification_report(svm_tfidf_, df_test.sentence, df_test.is_obligation)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 29.2min finished


{'clf__C': 1, 'clf__degree': 1, 'clf__gamma': 1, 'clf__kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.81      0.79      0.80      2113
        True       0.82      0.84      0.83      2371

   micro avg       0.82      0.82      0.82      4484
   macro avg       0.82      0.82      0.82      4484
weighted avg       0.82      0.82      0.82      4484



In [66]:
# with preprocessing

param_grid_svm = {
    'clf__C': [0.01, 0.1, 1, 10], 
    'clf__gamma' : [0.01, 0.1, 1],
    'clf__kernel': ['rbf'],
    'clf__degree': [1, 2, 3, 4]
}

svm_tfidf_, svm_tfidf_grid = get_best_estimator(
    svm_tfidf, 
    param_grid_svm, 
    df_train.sentence, 
    df_train.is_obligation
)

get_classification_report(svm_tfidf_, df_test.sentence, df_test.is_obligation)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 13.1min finished


{'clf__C': 1, 'clf__degree': 1, 'clf__gamma': 1, 'clf__kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.81      0.79      0.80      2113
        True       0.82      0.84      0.83      2371

   micro avg       0.81      0.81      0.81      4484
   macro avg       0.81      0.81      0.81      4484
weighted avg       0.81      0.81      0.81      4484



In [24]:
param_grid = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_tfidf_, xgb_tfidf_grid = get_best_estimator(
    xgb_tfidf, 
    param_grid, 
    df_train.sentence, 
    df_train.is_obligation
)

get_classification_report(xgb_tfidf_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.8min finished


{'clf__max_depth': 8, 'clf__n_estimators': 200}
              precision    recall  f1-score   support

       False       0.81      0.81      0.81      2113
        True       0.83      0.83      0.83      2371

   micro avg       0.82      0.82      0.82      4484
   macro avg       0.82      0.82      0.82      4484
weighted avg       0.82      0.82      0.82      4484



In [63]:
# with preprocessing
param_grid = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_tfidf_, xgb_tfidf_grid = get_best_estimator(
    xgb_tfidf, 
    param_grid, 
    df_train.sentence, 
    df_train.is_obligation
)

get_classification_report(xgb_tfidf_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.8min finished


{'clf__max_depth': 10, 'clf__n_estimators': 200}
              precision    recall  f1-score   support

       False       0.80      0.81      0.80      2113
        True       0.83      0.82      0.82      2371

   micro avg       0.81      0.81      0.81      4484
   macro avg       0.81      0.81      0.81      4484
weighted avg       0.81      0.81      0.81      4484



In [14]:
x_train_glove = get_vectors(nlp, df_train.sentence.values)
x_test_glove= get_vectors(nlp, df_test.sentence.values)

In [17]:
# Models trained on glove embeddings

nb_glove = MultinomialNB()
svc_glove = SVC()
xgb_glove = XGBClassifier()

In [19]:
# param_grid_svc = {
#     'C': [0.001, 0.01, 0.1, 1, 10],
#     'gamma' : [0.001, 0.01, 0.1, 1],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [1, 2, 3, 4, 5]
# }

param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_glove, svc_glove_grid = get_best_estimator(
    svc_glove, 
    param_grid_svc, 
    x_train_glove, 
    df_train.is_obligation.values
)

get_classification_report(svc_glove, x_test_glove, df_test.is_obligation.values)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 34.4min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 70.6min finished


{'C': 1, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.80      0.79      0.79      2113
        True       0.81      0.82      0.82      2371

   micro avg       0.80      0.80      0.80      4484
   macro avg       0.80      0.80      0.80      4484
weighted avg       0.80      0.80      0.80      4484



In [27]:
param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_glove, param_grid_xgb = get_best_estimator(
    xgb_glove, 
    param_grid_xgb, 
    np.array(x_train_glove), 
    df_train.is_obligation.values
)

get_classification_report(xgb_glove, np.asarray(x_test_glove), df_test.is_obligation.values)

              precision    recall  f1-score   support

       False       0.77      0.74      0.76      2113
        True       0.78      0.80      0.79      2371

   micro avg       0.78      0.78      0.78      4484
   macro avg       0.78      0.77      0.77      4484
weighted avg       0.78      0.78      0.78      4484



In [8]:
# Models using BoW vectorization

nb_bow = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', MultinomialNB())
])

svm_bow = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', SVC())
])

xgb_bow = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', XGBClassifier())
])

In [13]:
nb_bow_, nb_bow_grid = get_best_estimator(nb_bow, {}, df_train.sentence, df_train.is_obligation)

get_classification_report(nb_bow_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s finished


{}
              precision    recall  f1-score   support

       False       0.77      0.61      0.68      2113
        True       0.71      0.84      0.77      2371

   micro avg       0.73      0.73      0.73      4484
   macro avg       0.74      0.73      0.73      4484
weighted avg       0.74      0.73      0.73      4484



In [15]:
param_grid_svm = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10], 
    'clf__gamma' : [0.001, 0.01, 0.1, 1],
    # 'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__kernel': ['rbf'],
    'clf__degree': [1, 2, 3, 4, 5]
}

svm_bow_, svm_bow_grid = get_best_estimator(
    svm_bow, 
    param_grid_svm, 
    df_train.sentence, 
    df_train.is_obligation
)

get_classification_report(svm_bow_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 28.9min finished


{'clf__C': 10, 'clf__degree': 1, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.79      0.80      0.80      2113
        True       0.82      0.81      0.81      2371

   micro avg       0.81      0.81      0.81      4484
   macro avg       0.81      0.81      0.81      4484
weighted avg       0.81      0.81      0.81      4484



In [16]:
param_grid = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_bow_, xgb_bow_grid = get_best_estimator(
    xgb_bow, 
    param_grid, 
    df_train.sentence, 
    df_train.is_obligation
)

get_classification_report(xgb_bow_, df_test.sentence, df_test.is_obligation)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.6min finished


{'clf__max_depth': 6, 'clf__n_estimators': 100}
              precision    recall  f1-score   support

       False       0.80      0.82      0.81      2113
        True       0.84      0.82      0.83      2371

   micro avg       0.82      0.82      0.82      4484
   macro avg       0.82      0.82      0.82      4484
weighted avg       0.82      0.82      0.82      4484



In [19]:
joblib.dump(xgb_bow_, '../models/xgb_bow.joblib')
joblib.dump(svm_bow_, '../models/svm_bow.joblib')
joblib.dump(nb_bow_, '../models/nb_bow.joblib')

['../models/nb_bow.joblib']

In [9]:
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')  # English

'cc.en.300.bin'

In [10]:
ft = fasttext.load_model('cc.en.300.bin')



In [10]:
x_train_fasttext = np.asarray([ft.get_sentence_vector(sent) for sent in df_train.sentence])
x_test_fasttext = np.asarray([ft.get_sentence_vector(sent) for sent in df_test.sentence])

In [12]:
# Models trained on the fasttext embeddings

svc_fasttext = SVC()
xgb_fasttext = XGBClassifier()

In [51]:
# param_grid_svc = {
#     'C': [0.001, 0.01, 0.1, 1, 10],
#     'gamma' : [0.001, 0.01, 0.1, 1],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [1, 2, 3, 4, 5]
# }

param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_fasttext, svc_glove_grid = get_best_estimator(
    svc_fasttext, 
    param_grid_svc, 
    list(x_train_fasttext), 
    df_train.is_obligation.values
)

get_classification_report(svc_fasttext, list(x_test_fasttext), df_test.is_obligation.values)

              precision    recall  f1-score   support

       False       0.78      0.70      0.74      2113
        True       0.76      0.83      0.79      2371

   micro avg       0.77      0.77      0.77      4484
   macro avg       0.77      0.77      0.77      4484
weighted avg       0.77      0.77      0.77      4484



In [135]:

param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_fasttext, param_grid_xgb = get_best_estimator(
    xgb_fasttext, 
    param_grid_xgb, 
    x_train_fasttext, 
    df_train.is_obligation.values
)

get_classification_report(
    xgb_fasttext, 
    x_test_fasttext, 
    df_test.is_obligation.values
)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 11.6min finished


Parameters: { clf__max_depth, clf__n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'clf__max_depth': 2, 'clf__n_estimators': 50}
              precision    recall  f1-score   support

       False       0.78      0.74      0.76      2113
        True       0.78      0.81      0.80      2371

   micro avg       0.78      0.78      0.78      4484
   macro avg       0.78      0.78      0.78      4484
weighted avg       0.78      0.78      0.78      4484



In [11]:
# FastText preprocessing

sentences_train = df_train.sentence.apply(lambda x: process_text_ft(x))
sentences_test = df_test.sentence.apply(lambda x: process_text_ft(x))

x_train_fasttext = np.asarray([ft.get_sentence_vector(sent) for sent in sentences_train])
x_test_fasttext = np.asarray([ft.get_sentence_vector(sent) for sent in sentences_test])

del sentences_train, sentences_test

In [13]:
# param_grid_svc = {
#     'C': [0.001, 0.01, 0.1, 1, 10],
#     'gamma' : [0.001, 0.01, 0.1, 1],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [1, 2, 3, 4, 5]
# }

param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_fasttext, svc_glove_grid = get_best_estimator(
    svc_fasttext, 
    param_grid_svc, 
    list(x_train_fasttext), 
    df_train.is_obligation.values
)

get_classification_report(svc_fasttext, list(x_test_fasttext), df_test.is_obligation.values)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 663.8min finished


{'C': 1, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.79      0.71      0.75      2113
        True       0.76      0.83      0.80      2371

   micro avg       0.77      0.77      0.77      4484
   macro avg       0.78      0.77      0.77      4484
weighted avg       0.78      0.77      0.77      4484



In [15]:
x_train_glove.shape

(10461, 300)

In [17]:
x_train_fasttext.shape

(10461, 300)

In [32]:
combined_vectors_train = np.concatenate([x_train_glove, x_train_fasttext], axis=1)
combined_vectors_test = np.concatenate([x_test_glove, x_test_fasttext], axis=1)

In [33]:
combined_vectors_train

array([[-0.07845327,  0.205226  , -0.06929687, ...,  0.0495472 ,
        -0.02016848, -0.00382634],
       [-0.04374111,  0.17809425, -0.09262659, ...,  0.06744081,
        -0.00782636,  0.00454679],
       [ 0.02362153,  0.0674613 , -0.21361956, ...,  0.05092333,
        -0.02880907, -0.00625204],
       ...,
       [ 0.05727962,  0.01763594, -0.22384964, ...,  0.06275982,
        -0.00919158, -0.00963021],
       [-0.10408751,  0.13569058, -0.1299177 , ...,  0.05006945,
        -0.02868438, -0.01424044],
       [-0.0756157 ,  0.14540789, -0.09514034, ...,  0.06228682,
        -0.01930556,  0.00314015]], dtype=float32)

In [35]:
# Meta embeddings

svc_meta = SVC()
xgb_meta = XGBClassifier()

In [37]:
param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_meta_, svc_glove_grid = get_best_estimator(
    svc_meta, 
    param_grid_svc, 
    combined_vectors_train, 
    df_train.is_obligation.values
)

get_classification_report(svc_meta_, combined_vectors_test, df_test.is_obligation.values)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 51.1min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 78.9min finished


{'C': 1, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.80      0.79      0.79      2113
        True       0.81      0.82      0.82      2371

   micro avg       0.81      0.81      0.81      4484
   macro avg       0.81      0.81      0.81      4484
weighted avg       0.81      0.81      0.81      4484



In [39]:
param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_combined_, param_grid_xgb = get_best_estimator(
    xgb_meta, 
    param_grid_xgb, 
    combined_vectors_train, 
    df_train.is_obligation.values
)

get_classification_report(
    xgb_combined_, 
    combined_vectors_test, 
    df_test.is_obligation.values
)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 21.7min finished


Parameters: { clf__max_depth, clf__n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'clf__max_depth': 2, 'clf__n_estimators': 50}
              precision    recall  f1-score   support

       False       0.77      0.75      0.76      2113
        True       0.78      0.80      0.79      2371

   micro avg       0.78      0.78      0.78      4484
   macro avg       0.78      0.78      0.78      4484
weighted avg       0.78      0.78      0.78      4484



In [47]:
meta_embeddings_train = x_train_glove + x_train_fasttext / 2
meta_embeddings_test = x_test_glove + x_test_fasttext / 2

In [48]:
param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_meta_, svc_glove_grid = get_best_estimator(
    svc_meta, 
    param_grid_svc, 
    meta_embeddings_train, 
    df_train.is_obligation.values
)

get_classification_report(svc_meta_, meta_embeddings_test, df_test.is_obligation.values)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 42.5min finished


{'C': 1, 'degree': 1, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

       False       0.80      0.78      0.79      2113
        True       0.81      0.82      0.82      2371

   micro avg       0.80      0.80      0.80      4484
   macro avg       0.80      0.80      0.80      4484
weighted avg       0.80      0.80      0.80      4484



In [49]:
param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_meta_, param_grid_xgb = get_best_estimator(
    xgb_meta, 
    param_grid_xgb, 
    meta_embeddings_train, 
    df_train.is_obligation.values
)

get_classification_report(
    xgb_meta_, 
    meta_embeddings_test, 
    df_test.is_obligation.values
)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 11.3min finished


Parameters: { clf__max_depth, clf__n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'clf__max_depth': 2, 'clf__n_estimators': 50}
              precision    recall  f1-score   support

       False       0.77      0.75      0.76      2113
        True       0.78      0.81      0.79      2371

   micro avg       0.78      0.78      0.78      4484
   macro avg       0.78      0.78      0.78      4484
weighted avg       0.78      0.78      0.78      4484



In [50]:
import tensorflow_hub as hub
import tensorflow as tf

embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")

def embed_sentences(sentences):
    embeddings = embed(sentences)

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        sentence_embedings = session.run(embeddings)

    return sentence_embedings

In [52]:
transformer_x_train = embed_sentences(df_train.sentence.values)
transformer_x_test = embed_sentences(df_test.sentence.values)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [53]:
param_grid_svc = {
    'C': [0.01, 0.1, 1],
    'gamma' : [0.01, 0.1, 1],
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3]
}

svc_use_, svc_glove_grid = get_best_estimator(
    svc_meta, 
    param_grid_svc, 
    transformer_x_train, 
    df_train.is_obligation.values
)

get_classification_report(svc_use_, transformer_x_test, df_test.is_obligation.values)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 45.9min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 70.4min finished


{'C': 1, 'degree': 3, 'gamma': 1, 'kernel': 'poly'}
              precision    recall  f1-score   support

       False       0.81      0.77      0.79      2113
        True       0.80      0.84      0.82      2371

   micro avg       0.81      0.81      0.81      4484
   macro avg       0.81      0.81      0.81      4484
weighted avg       0.81      0.81      0.81      4484



In [54]:
param_grid_xgb = {
    'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 20],
    'clf__n_estimators': [50, 100, 200],
}

xgb_meta_, param_grid_xgb = get_best_estimator(
    xgb_meta, 
    param_grid_xgb, 
    transformer_x_train, 
    df_train.is_obligation.values
)

get_classification_report(
    xgb_meta_, 
    transformer_x_test, 
    df_test.is_obligation.values
)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 19.0min finished


Parameters: { clf__max_depth, clf__n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


{'clf__max_depth': 2, 'clf__n_estimators': 50}
              precision    recall  f1-score   support

       False       0.77      0.75      0.76      2113
        True       0.78      0.80      0.79      2371

   micro avg       0.78      0.78      0.78      4484
   macro avg       0.78      0.78      0.78      4484
weighted avg       0.78      0.78      0.78      4484

