In [1]:
import os
import pandas as pd

## How to use this notebook:
Since this binder has to be hosted publicly for now, you'll need to copy the data to this notebook. You can do that like this:

  1. Clone the [data repo](https://bitbucket.org/bombora-datascience/dragnet-labeled-data) (if you haven't already)
  2. Zip the data repo
  3. Click the Jupyter logo at the top of this notebook to view this directory
  4. Using the "Upload" button at the top right, upload the zipped data repo

Then you can follow the steps in the notebook and label some content :)

The only other things you should have to change are the assignment of `my_df` (point it at the `DataFrame` with your name on it) and, if you change any of the labeled data, saving your new labeled data (using the very last cell) and downloading those files.

## Read in Data

In [2]:
# If needed, unzip the uploaded data repo into the current directory.
# The step is skipped when no archive is present (e.g. the data was already
# extracted or copied in some other way) instead of failing the cell.
import zipfile as zf

archive_path = "dragnet-labeled-data.zip"
if os.path.exists(archive_path):
    # The context manager closes the archive even if extraction raises.
    with zf.ZipFile(archive_path, 'r') as files:
        files.extractall('.')

In [3]:
# `imp` is deprecated and was removed in Python 3.12; `importlib.reload` is the
# supported replacement with the same call signature.
from importlib import reload
import utils
# Re-import the local helper module so edits to it are picked up without a kernel restart.
reload(utils)
from dragnet import Extractor
from utils import read_dragnet_data

# this will probably take a couple of minutes, especially the first time (when it has to extract the data)
data_dir = 'dragnet-labeled-data'
dragnet_extractor = Extractor(to_extract='content')
# Load the labeled data using the same `to_extract` setting the extractor was built with.
df = read_dragnet_data(data_dir, to_extract=dragnet_extractor.to_extract)

Reading in Data: 100%|██████████| 476/476 [00:57<00:00,  8.31it/s]


## Initialize Extractors/Results

In [4]:
from sklearn.model_selection import KFold
from utils import extraction_comparison, JustextWrapper
justext_extractor = JustextWrapper()

# Collect one sorted comparison frame per fold.
dfs = []
# `random_state` is intentionally not passed: without `shuffle=True` it has no
# effect on the splits, and scikit-learn >= 0.24 raises a ValueError when it is
# set alongside the default `shuffle=False`. The folds are the same consecutive
# index ranges either way.
for train_index, test_index in KFold(n_splits=5).split(df):
    # NOTE(review): the index roles are swapped relative to what KFold yields,
    # so the extractor is fit on the single held-out fold (~1/5 of the rows) and
    # compared on the remaining ~4/5. The recorded output (380/381 rows per
    # fold) matches this, so it is kept as-is -- confirm that it is intentional.
    train_df = df.iloc[test_index]
    test_df = df.iloc[train_index]
    dragnet_extractor.fit(train_df['doc'], train_df['labels'], train_df['weights'])
    scores_df = extraction_comparison(dragnet_extractor, justext_extractor, test_df)
    # Lowest `base_f1` first; ties are broken by highest `comp_f1`.
    # (presumably base = dragnet, comp = justext -- verify against utils)
    sorted_df = scores_df.sort_values(by=['base_f1', 'comp_f1'], ascending=[True, False])
    dfs.append(sorted_df)

Extracting Dragnet Content: 100%|██████████| 380/380 [00:11<00:00, 33.69it/s]
Extracting Justext Content: 100%|██████████| 380/380 [00:21<00:00, 17.35it/s]
Parsing Expected Content: 100%|██████████| 380/380 [00:08<00:00, 42.79it/s]
Extracting Dragnet Content: 100%|██████████| 381/381 [00:10<00:00, 34.80it/s]
Extracting Justext Content: 100%|██████████| 381/381 [00:20<00:00, 18.56it/s]
Parsing Expected Content: 100%|██████████| 381/381 [00:08<00:00, 43.85it/s]
Extracting Dragnet Content: 100%|██████████| 381/381 [00:10<00:00, 37.02it/s]
Extracting Justext Content: 100%|██████████| 381/381 [00:19<00:00, 19.79it/s]
Parsing Expected Content: 100%|██████████| 381/381 [00:08<00:00, 46.70it/s]
Extracting Dragnet Content: 100%|██████████| 381/381 [00:10<00:00, 37.51it/s]
Extracting Justext Content: 100%|██████████| 381/381 [00:17<00:00, 22.25it/s]
Parsing Expected Content: 100%|██████████| 381/381 [00:06<00:00, 54.71it/s]
Extracting Dragnet Content: 100%|██████████| 381/381 [00:08<00:00, 43.81

In [6]:
# Unpack the five per-fold result frames, one per reviewer, in fold order.
(
    patricks_df,
    lindsays_df,
    nicos_df,
    melanies_df,
    jeffs_df,
) = dfs

# Point `my_df` at the frame with your name on it.
my_df = patricks_df

## Compare Results

In [7]:
# Helper from the local `utils` module that builds the comparison widget.
from utils import content_extract_comparison_widget

# Left as the cell's last bare expression so the notebook displays whatever it returns.
content_extract_comparison_widget(my_df)

In [8]:
# save your content to a new 'Corrected' file
new_labels_dir = os.path.join(data_dir, 'new_Corrected')
# Create the output directory up front; open() below does not create missing
# parent directories and would otherwise raise FileNotFoundError.
os.makedirs(new_labels_dir, exist_ok=True)

for row in my_df.itertuples():
    filename = "{}.html.corrected.txt".format(row.fileroot)
    # Write UTF-8 explicitly so non-ASCII content does not depend on the
    # platform's locale encoding.
    # NOTE(review): assumes the reader of these files expects UTF-8 -- confirm.
    with open(os.path.join(new_labels_dir, filename), 'w', encoding='utf-8') as f:
        # print() appends a trailing newline after the labeled content.
        print(row.labeled_content, file=f)