## Import Libraries

In [1]:
import os
import pickle

import sklearn_crfsuite
from sklearn_crfsuite.metrics import flatten
from sklearn.metrics import classification_report
from datasets import load_from_disk

from reference_parsing.config import (LABEL2ID)
from reference_parsing.utils.data_preparation import prepare_crf_data
from reference_parsing.embeddings.HandFeatureEmbedding import  HandFeatureEmbedding
from reference_parsing.embeddings.BytePairReferenceEmbedding import BytePairReferenceEmbedding

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [2]:
# Load the previously prepared dataset from disk; the cells below read its
# "train" and "test" splits.
prepared_dataset = load_from_disk(dataset_path="./datasets/prepared_dataset")

In [3]:
# Create both embedding objects once so every experiment below shares them:
# `bp_emb` is used in all runs, `hand_emb` only in the "Hand-Features" runs.
bp_emb, hand_emb = BytePairReferenceEmbedding(), HandFeatureEmbedding()

In [None]:
# Label bookkeeping shared by the evaluation cells.
label2id = LABEL2ID
# Reverse lookup: id -> label name.
id2label = {idx: name for name, idx in label2id.items()}
# Label names in the mapping's own order; passed as `labels=` to
# classification_report so every report lists rows in the same order.
label_order = list(label2id)

## CRF 1-mil

In [4]:
# Subsets for the "1-mil" experiments: the first 1,000,000 training examples
# and the first 100,000 test examples of the prepared dataset.
train_1mil = prepared_dataset["train"].select(range(1_000_000))
test_1mil = prepared_dataset["test"].select(range(100_000))

### No Hand-Features

In [None]:
# Build the CRF inputs/targets from the byte-pair embedding only.
# NOTE(review): `prefix1` is presumably the feature-name prefix for the first
# embedding — confirm in prepare_crf_data.
X_train, y_train = prepare_crf_data(
    bp_emb,
    train_1mil,
    prefix1="bp",
)
X_test, y_test = prepare_crf_data(
    bp_emb,
    test_1mil,
    prefix1="bp",
)

In [6]:
# CRF trained with L-BFGS; c1 / c2 are the L1 / L2 regularisation coefficients.
crf_1mil = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    max_iterations=100,
    all_possible_transitions=True,
    c1=0.1,
    c2=0.1,
)

# NOTE(review): these attributes are set after construction instead of being
# passed to the constructor — presumably a compatibility workaround so that the
# estimator's repr / get_params works with newer scikit-learn. Confirm before
# removing.
crf_1mil.model_filename = "dummy.crfsuite"
crf_1mil.keep_tempfiles = False

# Train on the byte-pair-only features; as the last expression of the cell the
# fitted estimator is also displayed.
crf_1mil.fit(X_train, y_train)

In [None]:
# Per-label report on the training subset (shows how well the model fits the
# data it was trained on).
y_train_pred = crf_1mil.predict(X_train)
# classification_report expects flat label lists, so collapse the per-sequence
# lists of gold and predicted labels.
y_train_flat, y_train_pred_flat = flatten(y_train), flatten(y_train_pred)
print(
    classification_report(
        y_true=y_train_flat,
        y_pred=y_train_pred_flat,
        labels=label_order,
        zero_division=0,
    )
)

                   precision    recall  f1-score      support

         B-AUTHOR       0.88      0.93      0.90       831574
         I-AUTHOR       0.92      0.90      0.91      2667921
           B-YEAR       0.91      0.87      0.89       799351
           I-YEAR       0.92      0.90      0.91         4578
          B-TITLE       0.82      0.89      0.85       878952
          I-TITLE       0.84      0.87      0.85      6742518
B-CONTAINER-TITLE       0.81      0.89      0.85       523879
I-CONTAINER-TITLE       0.85      0.88      0.86      5318442
         B-VOLUME       0.93      0.89      0.91        25075
         I-VOLUME       0.92      0.91      0.91          837
          B-ISSUE       0.88      0.85      0.86        71496
          I-ISSUE       0.87      0.84      0.85          495
           B-PAGE       0.97      0.95      0.96       645588
           I-PAGE       0.95      0.94      0.94        97542
           B-ISBN       0.93      0.87      0.90       149375
       

In [None]:
# Per-label report on the held-out test subset.
y_test_pred = crf_1mil.predict(X_test)
# classification_report expects flat label lists, so collapse the per-sequence
# lists of gold and predicted labels.
y_test_flat, y_test_pred_flat = flatten(y_test), flatten(y_test_pred)
print(
    classification_report(
        y_true=y_test_flat,
        y_pred=y_test_pred_flat,
        labels=label_order,
        zero_division=0,
    )
)

                   precision    recall  f1-score      support

         B-AUTHOR       0.85      0.91      0.88        91574
         I-AUTHOR       0.93      0.84      0.88       366742
           B-YEAR       0.83      0.89      0.86        71298
           I-YEAR       0.89      0.83      0.86          672
          B-TITLE       0.85      0.90      0.87       957431
          I-TITLE       0.82      0.88      0.85      8616141
B-CONTAINER-TITLE       0.82      0.91      0.86        74286
I-CONTAINER-TITLE       0.88      0.89      0.88       631754
         B-VOLUME       0.94      0.91      0.92         2374
         I-VOLUME       0.95      0.87      0.91           95
          B-ISSUE       0.89      0.83      0.86         7662
          I-ISSUE       0.87      0.83      0.85          152
           B-PAGE       0.98      0.95      0.96        74628
           I-PAGE       0.95      0.93      0.94        15841
           B-ISBN       0.91      0.86      0.88        24678
       

In [14]:
# Persist the trained byte-pair-only model.
# Create the output directory first: open(..., "wb") raises FileNotFoundError
# when "models/" is missing, which would only surface after the long training
# run has finished.
os.makedirs("models", exist_ok=True)
with open("models/crf_model_1mil.pkl", "wb") as f:
    pickle.dump(crf_1mil, f)

In [None]:
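# Uncomment to reload the saved model instead of retraining
# (only unpickle files you created yourself).
# with open("models/crf_model_1mil.pkl", "rb") as f:
#     crf_1mil = pickle.load(f)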

### Hand-Features

In [None]:
# Rebuild the CRF inputs/targets, this time combining the byte-pair embedding
# with the hand-crafted one. This overwrites X_train / y_train / X_test /
# y_test from the byte-pair-only experiment.
# NOTE(review): `prefix1` / `prefix2` are presumably the feature-name prefixes
# for the two embeddings — confirm in prepare_crf_data.
X_train, y_train = prepare_crf_data(
    bp_emb,
    train_1mil,
    embedding_obj2=hand_emb,
    prefix1="bp",
    prefix2="hand",
)
X_test, y_test = prepare_crf_data(
    bp_emb,
    test_1mil,
    embedding_obj2=hand_emb,
    prefix1="bp",
    prefix2="hand",
)

In [29]:
# Same hyperparameters as the byte-pair-only model so the two runs differ only
# in their input features. c1 / c2 are the L1 / L2 regularisation coefficients.
crf_1mil_hand = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    max_iterations=100,
    all_possible_transitions=True,
    c1=0.1,
    c2=0.1,
)

# NOTE(review): these attributes are set after construction instead of being
# passed to the constructor — presumably a compatibility workaround so that the
# estimator's repr / get_params works with newer scikit-learn. Confirm before
# removing.
crf_1mil_hand.model_filename = "dummy.crfsuite"
crf_1mil_hand.keep_tempfiles = False

# Train on the current X_train / y_train; as the last expression of the cell
# the fitted estimator is also displayed.
crf_1mil_hand.fit(X_train, y_train)

In [None]:
# Per-label report on the training subset for the hand-feature model.
y_train_pred = crf_1mil_hand.predict(X_train)
# classification_report expects flat label lists, so collapse the per-sequence
# lists of gold and predicted labels.
y_train_flat, y_train_pred_flat = flatten(y_train), flatten(y_train_pred)
print(
    classification_report(
        y_true=y_train_flat,
        y_pred=y_train_pred_flat,
        labels=label_order,
        zero_division=0,
    )
)

                   precision    recall  f1-score      support

         B-AUTHOR       0.92      0.95      0.93       831574
         I-AUTHOR       0.93      0.90      0.91      2667921
           B-YEAR       0.95      0.90      0.92       799351
           I-YEAR       0.95      0.95      0.95         4578
          B-TITLE       0.85      0.91      0.88       878952
          I-TITLE       0.89      0.90      0.89      6742518
B-CONTAINER-TITLE       0.85      0.89      0.87       523879
I-CONTAINER-TITLE       0.87      0.87      0.87      5318442
         B-VOLUME       0.94      0.91      0.92        25075
         I-VOLUME       0.94      0.91      0.92          837
          B-ISSUE       0.90      0.88      0.89        71496
          I-ISSUE       0.90      0.87      0.88          495
           B-PAGE       0.97      0.96      0.96       645588
           I-PAGE       0.96      0.96      0.96        97542
           B-ISBN       0.91      0.89      0.90       149375
       

In [None]:
# Per-label report on the held-out test subset for the hand-feature model.
y_test_pred = crf_1mil_hand.predict(X_test)
# classification_report expects flat label lists, so collapse the per-sequence
# lists of gold and predicted labels.
y_test_flat, y_test_pred_flat = flatten(y_test), flatten(y_test_pred)
print(
    classification_report(
        y_true=y_test_flat,
        y_pred=y_test_pred_flat,
        labels=label_order,
        zero_division=0,
    )
)

                   precision    recall  f1-score      support

         B-AUTHOR       0.89      0.94      0.91        91574
         I-AUTHOR       0.94      0.89      0.91       366742
           B-YEAR       0.89      0.90      0.89        71298
           I-YEAR       0.95      0.95      0.95          672
          B-TITLE       0.88      0.90      0.89       957431
          I-TITLE       0.87      0.89      0.88      8616141
B-CONTAINER-TITLE       0.84      0.94      0.89        74286
I-CONTAINER-TITLE       0.83      0.87      0.85       631754
         B-VOLUME       0.93      0.88      0.90         2374
         I-VOLUME       0.92      0.86      0.89           95
          B-ISSUE       0.90      0.87      0.88         7662
          I-ISSUE       0.89      0.91      0.90          152
           B-PAGE       0.97      0.94      0.95        74628
           I-PAGE       0.98      0.95      0.96        15841
           B-ISBN       0.92      0.89      0.90        24678
       

In [None]:
# Persist the trained hand-feature model.
# Create the output directory first: open(..., "wb") raises FileNotFoundError
# when "models/" is missing, which would only surface after the long training
# run has finished.
os.makedirs("models", exist_ok=True)
with open("models/crf_model_1mil_hand.pkl", "wb") as f:
    pickle.dump(crf_1mil_hand, f)

In [None]:
# with open("models/crf_model_1mil_hand.pkl", "rb") as f:
#     crf_1mil_hand = pickle.load(f)

## CRF 5-mil

In [None]:
# Subsets for the "5-mil" experiment: the first 5,000,000 training examples
# and the first 200,000 test examples of the prepared dataset.
train_5mil = prepared_dataset["train"].select(range(5_000_000))
test_5mil = prepared_dataset["test"].select(range(200_000))

In [None]:
# Build the CRF inputs/targets for the 5-mil subsets from the byte-pair and
# hand-crafted embeddings. This overwrites X_train / y_train / X_test / y_test
# from the 1-mil experiments.
# NOTE(review): `prefix1` / `prefix2` are presumably the feature-name prefixes
# for the two embeddings — confirm in prepare_crf_data.
X_train, y_train = prepare_crf_data(
    bp_emb,
    train_5mil,
    embedding_obj2=hand_emb,
    prefix1="bp",
    prefix2="hand",
)
X_test, y_test = prepare_crf_data(
    bp_emb,
    test_5mil,
    embedding_obj2=hand_emb,
    prefix1="bp",
    prefix2="hand",
)

In [35]:
# Same hyperparameters as the 1-mil models so the runs differ only in the
# amount of training data. c1 / c2 are the L1 / L2 regularisation coefficients.
crf_5mil_hand = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    max_iterations=100,
    all_possible_transitions=True,
    c1=0.1,
    c2=0.1,
)

# NOTE(review): these attributes are set after construction instead of being
# passed to the constructor — presumably a compatibility workaround so that the
# estimator's repr / get_params works with newer scikit-learn. Confirm before
# removing.
crf_5mil_hand.model_filename = "dummy.crfsuite"
crf_5mil_hand.keep_tempfiles = False

# Train on the current X_train / y_train; as the last expression of the cell
# the fitted estimator is also displayed.
crf_5mil_hand.fit(X_train, y_train)

In [None]:
# Per-label report on the 5-mil training subset.
y_train_pred = crf_5mil_hand.predict(X_train)
# classification_report expects flat label lists, so collapse the per-sequence
# lists of gold and predicted labels.
y_train_flat, y_train_pred_flat = flatten(y_train), flatten(y_train_pred)
print(
    classification_report(
        y_true=y_train_flat,
        y_pred=y_train_pred_flat,
        labels=label_order,
        zero_division=0,
    )
)

                   precision    recall  f1-score      support

         B-AUTHOR       0.94      0.96      0.95      4257816
         I-AUTHOR       0.94      0.92      0.93     13774259
           B-YEAR       0.95      0.93      0.94      3975744
           I-YEAR       0.96      0.95      0.95        25781
          B-TITLE       0.87      0.89      0.88      4454860
          I-TITLE       0.88      0.89      0.88     35764187
B-CONTAINER-TITLE       0.85      0.87      0.86      2257372
I-CONTAINER-TITLE       0.88      0.89      0.88     30770652
         B-VOLUME       0.95      0.90      0.92       200371
         I-VOLUME       0.93      0.92      0.92         7533
          B-ISSUE       0.91      0.87      0.89       417556
          I-ISSUE       0.89      0.88      0.88         3957
           B-PAGE       0.97      0.97      0.97      3526715
           I-PAGE       0.95      0.94      0.94       487722
           B-ISBN       0.92      0.90      0.91       746137
       

In [None]:
# Per-label report on the held-out test subset for the 5-mil model.
y_test_pred = crf_5mil_hand.predict(X_test)
# classification_report expects flat label lists, so collapse the per-sequence
# lists of gold and predicted labels.
y_test_flat, y_test_pred_flat = flatten(y_test), flatten(y_test_pred)
print(
    classification_report(
        y_true=y_test_flat,
        y_pred=y_test_pred_flat,
        labels=label_order,
        zero_division=0,
    )
)

                   precision    recall  f1-score      support

         B-AUTHOR       0.92      0.95      0.93       190515
         I-AUTHOR       0.94      0.90      0.92      1735612
           B-YEAR       0.92      0.91      0.91       172556
           I-YEAR       0.94      0.95      0.94         1350
          B-TITLE       0.90      0.91      0.90      1815762
          I-TITLE       0.89      0.90      0.89     18249721
B-CONTAINER-TITLE       0.88      0.92      0.90       133714
I-CONTAINER-TITLE       0.86      0.89      0.87      1100358
         B-VOLUME       0.94      0.89      0.91        39419
         I-VOLUME       0.91      0.87      0.89          155
          B-ISSUE       0.91      0.88      0.89        13557
          I-ISSUE       0.90      0.92      0.91          343
           B-PAGE       0.96      0.93      0.94       157135
           I-PAGE       0.98      0.96      0.97        29258
           B-ISBN       0.93      0.90      0.91        45327
       

In [36]:
# Persist the trained 5-mil hand-feature model.
# Create the output directory first: open(..., "wb") raises FileNotFoundError
# when "models/" is missing, which would only surface after the long training
# run has finished.
os.makedirs("models", exist_ok=True)
with open("models/crf_model_5mil_hand.pkl", "wb") as f:
    pickle.dump(crf_5mil_hand, f)

In [None]:
# with open("models/crf_model_5mil_hand.pkl", "rb") as f:
#     crf_5mil_hand = pickle.load(f)