In [None]:
from pathlib import Path

import pandas as pd
import spacy_annotator as spa

from citation_scanner import download, tika, regex

# Grab Sample Data

The first thing we need to do, to test viability, is get some sample data to work with.

In [None]:
# Look up the sample files, then fetch them with loc="pdfs" -- the same
# directory name the text-extraction step reads from.
download_dir = "pdfs"
file_list = download.get_file_list()
download.download_file_list(
    file_list,
    loc=download_dir,
)

# Extract Text
Now that we have some relevant PDFs locally, we use Tika to pull out the text.

In [None]:
# Both Tika steps operate on the directory the PDFs were downloaded into.
pdf_dir = "pdfs"

# Parse every file in the directory, then read the parsed output back in.
tika.parse_dir(pdf_dir)
docs = tika.read_parsed_dir(pdf_dir, strip_lines=True)

# Make Samples

At this point we have the entire text of the PDFs, but for our tagging we want to cut them down to something more manageable.

Here we use a regex to find the text on either side of each occurrence of the `[YYYY]` tags in the document.

In [None]:
# Regexes to sample around; the single entry matches a bracketed
# four-digit number such as "[2015]".
pattern_list = [r"\[\d{4}\]"]

# Only the first pattern and the first document are sampled here.
year_pattern = pattern_list[0]
first_doc = docs[0]

# NOTE(review): window=(200, 200) presumably sets how much surrounding text
# is captured before/after each match -- confirm units in regex.get_match_obj.
match_list = regex.get_match_obj(pattern=year_pattern, doc=first_doc, window=(200, 200))

In [None]:
# The annotator expects a dataframe of text, so keep only element 1 of each
# match (the text element) and put it in a single "text" column.
sample_texts = [match[1] for match in match_list]
samples_df = pd.DataFrame({"text": sample_texts})

# Preview the first few samples.
samples_df.head()

Set up an annotator with which to tag citations.

In [None]:
# A single entity label is used for this tagging task.
citation_labels = ["citation"]
annotator = spa.Annotator(labels=citation_labels)

Here we use the annotator plugin to generate a second column, `annotations`, in the format spaCy expects, i.e. each text paired with the entity spans tagged in it.

In [None]:
# Run the annotator over the "text" column of the samples; the returned
# frame is filtered on its "annotations" column further down.
df_labels = annotator.annotate(
    df=samples_df,
    col_text="text",
)

In [None]:
# Preview the first five labelled rows (five is the pandas default for head()).
df_labels.head(n=5)

In [None]:
# Keep only the rows whose "annotations" entry is not the empty string.
is_annotated = df_labels["annotations"] != ""
has_labels = df_labels[is_annotated]

Finally, we save the annotations out to a `.spacy` file.

In [None]:
# Destination for the serialised annotations.
output_file = "training/citation_labels_1.spacy"

# Make sure the target directory exists first, so the save does not fail on a
# fresh checkout; exist_ok=True keeps this safe to re-run.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Convert the labelled rows and write them to the .spacy file.
spacy_annotations = annotator.to_spacy(has_labels, output_file)