In [1]:
import os
import pandas as pd

## How to use this notebook:
Since this binder has to be hosted publicly for now, you'll need to copy the data to this notebook. You can do that like this:

  1. Clone the [data repo](https://bitbucket.org/bombora-datascience/dragnet-labeled-data) (if you haven't already)
  2. Zip the data repo
  3. Click the Jupyter logo at the top of this notebook to view this directory
  4. Using the "Upload" button at the top right, upload the zipped data repo

Then you can follow the steps in the notebook and label some content :)

The only other things you should need to change are: assign `my_df` to the `DataFrame` with your name on it, and (if you change any of the labeled data) save your new labeled data using the very last cell, then download those files.

## Read in Data

In [None]:
# If needed, unzip the uploaded data archive into the current directory.
import zipfile as zf

# The `with` block closes the archive even if extraction raises partway through.
with zf.ZipFile("dragnet-labeled-data.zip", 'r') as files:
    files.extractall('.')

In [2]:
from dragnet import Extractor
from utils import read_dragnet_data

# This will probably take a couple of minutes, especially the first time (when it has to extract the data).
# Directory holding the labeled data, relative to the notebook's working directory.
data_dir = 'dragnet-labeled-data'
# Dragnet extractor configured to extract 'content'; it is fit on the training split in a later cell.
dragnet_extractor = Extractor(to_extract='content')
# Pass the extractor's own `to_extract` attribute (its repr shows it as the tuple ('content',))
# rather than repeating the literal, so both calls use the same setting.
# NOTE(review): how read_dragnet_data chooses the train/test split is defined in utils — confirm there.
train_df, test_df = read_dragnet_data(data_dir, to_extract=dragnet_extractor.to_extract)

Reading in Data: 100%|██████████| 476/476 [00:54<00:00,  8.74it/s]


## Initialize Extractors

In [3]:
from utils import JustextWrapper

# You can safely ignore the "WARNING:root:extraction failed: too few blocks (1)" warnings here
# Comparison extractor, built with its default arguments.
justext_extractor = JustextWrapper()
# Fit Dragnet on the training split. This is the cell's last expression, so the fitted
# extractor's repr is displayed as the cell output.
# NOTE(review): that repr shows the underlying ExtraTreesClassifier with random_state=None,
# so fitted models (and downstream scores) may differ between runs — consider seeding.
dragnet_extractor.fit(train_df['doc'], train_df['labels'], train_df['weights'])



Extractor(blockifier=<class 'dragnet.blocks.TagCountNoCSSReadabilityBlockifier'>,
     features=FeatureUnion(n_jobs=1,
       transformer_list=[('kohlschuetter', KohlschuetterFeatures()), ('weninger', WeningerFeatures(sigma=1.0)), ('readability', ReadabilityFeatures())],
       transformer_weights=None),
     max_block_weight=200,
     model=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
     prob_threshold=0.5, to_extract=('content',))

## Compare Results

In [4]:
from utils import extraction_comparison

# Score both extractors against the test split.
scores_df = extraction_comparison(dragnet_extractor, justext_extractor, test_df)

# Order rows by ascending 'base_f1', breaking ties by descending 'comp_f1'.
# NOTE(review): presumably 'base_f1' is the Dragnet score and 'comp_f1' the Justext score,
# which would put the documents where Dragnet does worst first — confirm in utils.
sorted_df = scores_df.sort_values(by=['base_f1', 'comp_f1'], ascending=[True, False])

# Give each labeler a disjoint batch of 25 rows. After sort_values the index labels are
# no longer in order, so use `.iloc` to make it explicit that the slices are positional.
# `.copy()` gives each person an independent frame.
patricks_df = sorted_df.iloc[0:25].copy()
lindsays_df = sorted_df.iloc[25:50].copy()
nicos_df = sorted_df.iloc[50:75].copy()
melanies_df = sorted_df.iloc[75:100].copy()
# go ahead and set `my_df` based on your name here :)
my_df = patricks_df

Extracting Dragnet Content: 100%|██████████| 119/119 [00:02<00:00, 42.57it/s]
Extracting Justext Content: 100%|██████████| 119/119 [00:05<00:00, 22.66it/s]
Parsing Expected Content: 100%|██████████| 119/119 [00:02<00:00, 54.81it/s]


In [5]:
from utils import content_extract_comparison_widget

# Launch the comparison widget on your batch of documents.
# NOTE(review): presumably edits made in the widget are stored back on `my_df` (the save
# cell below reads `labeled_content` from it) — confirm in utils.
content_extract_comparison_widget(my_df)

In [83]:
# Save your content to a new 'Corrected' directory, one text file per row of `my_df`.
new_labels_dir = os.path.join(data_dir, 'new_Corrected')
# Create the output directory if it is missing; otherwise open() below raises
# FileNotFoundError on the first file.
os.makedirs(new_labels_dir, exist_ok=True)

for row in my_df.itertuples():
    filename = "{}.html.corrected.txt".format(row.fileroot)
    # Write UTF-8 explicitly so non-ASCII page content does not depend on the
    # platform's default encoding.
    with open(os.path.join(new_labels_dir, filename), 'w', encoding='utf-8') as f:
        # print() appends a trailing newline after the labeled content.
        print(row.labeled_content, file=f)