In [1]:
import pandas
import spacy

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
from spacy.tokens import DocBin

In [3]:
# !python3 -m spacy download en_core_web_lg

In [4]:
def process_text(text):
    text = re.sub('(\.|,|\?|!|-|;|\*|"|:|—|\(|\)|%|#|\$|&|_|\/|@)', '', text)
    text = re.sub('\d', '', text)
    
    # TODO add lowercase
    return text

def make_docs(data):
    """
    this will take a list of texts and labels 
    and transform them in spacy documents
    
    data: list(tuple(text, label))
    
    returns: List(spacy.Doc.doc)
    """
    
    docs = []
    # nlp.pipe([texts]) is way faster than running 
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple, 
    # the first one is treated as text
    # the second one will get returned as it is.
    
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        
        doc.cats["obligation"] = label
        doc.cats["not_obligation"] = not label
        
        
        # put them into a nice list
        docs.append(doc)
    
    return docs

In [6]:
nlp = spacy.load('en_core_web_lg')
df = pd.read_csv('../data/obligation_extraction_df.csv')

df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)
df_train, df_valid = train_test_split(df, train_size=0.8, random_state=42)

In [7]:
def transform_label(is_obligation):
    return {'cats': {'OBLIGATION': is_obligation, 'NOT_OBLIGATION': not is_obligation}}

In [8]:
train_data = [
    (pair.sentence, pair.is_obligation) for pair in df_train.itertuples()
]

test_data = [
    (pair.sentence, pair.is_obligation) for pair in df_valid.itertuples()
]

In [9]:
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("../data/train.spacy")

test_docs = make_docs(test_data)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("../data/valid.spacy")

  0%|          | 0/11956 [00:00<?, ?it/s]

  0%|          | 0/2989 [00:00<?, ?it/s]

In [20]:
!python3 -m spacy train ../models/config.cfg --paths.train ../data/train.spacy --paths.dev ../data/valid.spacy -o ../models/spacy_ensamble

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[38;5;2m✔ Created output directory: ../models/spacy_ensamble[0m
[38;5;4mℹ Saving to output directory: ../models/spacy_ensamble[0m
[38;5;4mℹ Using CPU[0m
[1m
[2021-10-03 17:37:29,967] [INFO] Set up nlp object from config
[2021-10-03 17:37:29,979] [INFO] Pipeline: ['tok2vec', 'textcat']
[2021-10-03 17:37:29,984] [INFO] Created vocabulary
[2021-10-03 17:37:32,165] [INFO] Adde

In [21]:
nlp = spacy.load('../models/spacy_ensamble/model-best')

In [41]:
docs = nlp.pipe(df_test.sentence.values)

In [42]:
preds = []

for doc in docs:
    scores = doc.cats
    
    preds.append(doc.cats['obligation'] > doc.cats['not_obligation'])

In [43]:
print(classification_report(df_test.is_obligation.values, preds))

              precision    recall  f1-score   support

       False       0.84      0.81      0.83      1411
        True       0.84      0.86      0.85      1578

    accuracy                           0.84      2989
   macro avg       0.84      0.84      0.84      2989
weighted avg       0.84      0.84      0.84      2989

