In [1]:
import os
import pandas as pd

## How to use this notebook:
Since this binder has to be hosted publicly for now, you'll need to copy the data to this notebook. You can do that like this:

  1. Clone the [data repo](https://bitbucket.org/bombora-datascience/dragnet-labeled-data) (if you haven't already)
  2. Zip the data repo
  3. Click the Jupyter logo at the top of this notebook to view this directory
  4. Using the "Upload" button at the top right, upload the zipped data repo

Then you can follow the steps in the notebook and label some content :)

The only other notable things you should have to change are the assignment of `my_df` to the `DataFrame` with your name on it and, if you change any of the labeled data, saving your new labels (using the very last cell) and downloading the resulting files.

## Read in Data

In [2]:
# if needed, unzip data
import zipfile as zf

archive_name = "dragnet-labeled-data.zip"
extracted_dir = "dragnet-labeled-data"  # the directory the next cell reads from

if os.path.isfile(archive_name):
    # An uploaded archive is always (re-)extracted, as before. The context
    # manager closes the archive even if extraction fails part-way.
    with zf.ZipFile(archive_name, 'r') as archive:
        archive.extractall('.')
elif not os.path.isdir(extracted_dir):
    # Neither the archive nor previously extracted data is available: fail here
    # with an actionable message instead of further down in the data loader.
    raise FileNotFoundError(
        "Neither '{}' nor an extracted '{}' directory was found; upload the "
        "zipped data repo next to this notebook first.".format(archive_name, extracted_dir)
    )
# Otherwise the data is already extracted and there is nothing to do.

In [3]:
from dragnet import Extractor
from utils import read_dragnet_data

# Expect this cell to run for a couple of minutes; the first run is the slowest
# because the data has to be extracted before it can be read.
dragnet_extractor = Extractor(to_extract='content')
data_dir = 'dragnet-labeled-data'

# Load the labeled data for the same target the extractor is configured with.
df = read_dragnet_data(
    data_dir,
    to_extract=dragnet_extractor.to_extract,
)

Reading in Data: 100%|██████████| 476/476 [00:56<00:00,  8.42it/s]


## Compare Extractors

In [4]:
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import KFold
from utils import extraction_comparison, JustextWrapper
justext_extractor = JustextWrapper()

# Single source of truth for the fold count, so the progress-bar total cannot
# drift from the splitter. The next cell unpacks exactly five frames.
n_splits = 5

# KFold is deliberately left unshuffled: folds are contiguous slices of `df`,
# so every run yields the same split. No `random_state` is passed because it
# has no effect without shuffle=True, and recent scikit-learn releases raise a
# ValueError for that combination.
folds = KFold(n_splits=n_splits)

# you can safely ignore the `WARNING:root:extraction failed: too few blocks (1)` warnings below
dfs = []
for train_index, test_index in tqdm(folds.split(df), total=n_splits):
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    # Re-fit dragnet on this fold's training rows, then score both extractors
    # on the held-out rows.
    dragnet_extractor.fit(train_df['doc'], train_df['labels'], train_df['weights'])
    scores_df = extraction_comparison(dragnet_extractor, justext_extractor, test_df)
    # Lowest base_f1 first; ties are ordered by highest comp_f1.
    # NOTE(review): presumably "base" is dragnet and "comp" is justext, going by
    # the argument order above - confirm in utils.extraction_comparison.
    sorted_df = scores_df.sort_values(by=['base_f1', 'comp_f1'], ascending=[True, False])
    dfs.append(sorted_df)














In [5]:
# Hand out one scored fold per labeler, in the order the folds were produced.
(
    patricks_df,
    lindsays_df,
    nicos_df,
    melanies_df,
    jeffs_df,
) = dfs

# Point `my_df` at the DataFrame with your name on it.
my_df = patricks_df

## Visualize Comparison

In [6]:
from utils import content_extract_comparison_widget

# Build the comparison widget for the fold selected as `my_df`. The call is the
# cell's last expression, so the notebook displays whatever it returns.
# NOTE(review): presumably corrections made in the widget are written back to
# `my_df` (the save cell reads `labeled_content` from it) - confirm in utils.
content_extract_comparison_widget(my_df)

In [7]:
# Imported here as well as in the optional unzip cell, so this cell still works
# when that cell was skipped because the data was already extracted.
import zipfile as zf

from IPython.display import FileLink

# save your content to a new 'Corrected' file
new_labels_dir = os.path.join(data_dir, 'new_Corrected')
# exist_ok makes re-running the cell safe without a separate isdir() check.
os.makedirs(new_labels_dir, exist_ok=True)

for row in my_df.itertuples():
    filename = "{}.html.corrected.txt".format(row.fileroot)
    # Explicit UTF-8 so non-ASCII content does not depend on the kernel's locale
    # (an ASCII/C locale would raise UnicodeEncodeError here).
    # NOTE(review): assumes the data repo's text files are UTF-8 - confirm.
    with open(os.path.join(new_labels_dir, filename), 'w', encoding='utf-8') as label_file:
        # print() appends a trailing newline after the labeled content.
        print(row.labeled_content, file=label_file)

# zip new labeled files
output_fname = "dragnet-labeled-data-output.zip"
with zf.ZipFile(output_fname, 'w') as archive:
    # Every file found under new_labels_dir is added, stored under its path
    # relative to the notebook directory (data_dir/new_Corrected/...).
    for root, _subdirs, filenames in os.walk(new_labels_dir):
        for name in filenames:
            archive.write(os.path.join(root, name))

# link to file: click the link to download
FileLink(output_fname)