# Label

Run the trained models against fresh data to see how they perform.

In [51]:
import os
import re
import pathlib
from urllib.parse import urlparse

In [52]:
import dill
import tldextract
import numpy as np
import pandas as pd

## Get models

In [53]:
# Directory holding the input CSVs, relative to the current working directory
input_path = pathlib.Path("input")

In [54]:
# Directory holding the pickled models, relative to the current working directory
output_path = pathlib.Path("output")

In [55]:
# Load the model that predicts from both the URL path and the text.
# NOTE: dill/pickle files can execute arbitrary code on load; only load files you trust.
# The context manager closes the file handle as soon as the model is deserialized.
with open(output_path / "path-and-text-model.pickle", "rb") as model_file:
    path_and_text_model = dill.load(model_file)

In [56]:
# Load the model that predicts from the URL path alone.
# NOTE: dill/pickle files can execute arbitrary code on load; only load files you trust.
# The context manager closes the file handle as soon as the model is deserialized.
with open(output_path / "path-only-model.pickle", "rb") as model_file:
    path_only_model = dill.load(model_file)

## Implement

In [57]:
# Registered domains (as reported by tldextract) that label() always rejects
DOMAIN_BLACKLIST = (
    "google", "twitter", "facebook", "doubleclick",
    "instagram", "pinterest", "legacy",
)

In [58]:
# Subdomains that label() always rejects (compared against the full subdomain string)
SUBDOMAIN_BLACKLIST = ("careers", "mail", "account")

In [59]:
# Exact URL paths that label() always rejects: an empty path or the bare root
PATH_BLACKLIST = ("", "/")

In [60]:
# File extensions (script, stylesheet and image assets) that label() always rejects
EXT_BLACKLIST = (".js", ".css", ".jpg", ".gif", ".png")

In [61]:
# Path prefixes that let label() overturn a negative model prediction.
# Must stay a tuple: it is passed directly to str.startswith().
PATHPART_WHITELIST = (
    "/story", "/stories",
    "/article",
    "/feature", "/featured",
    "/blog",
    "/interactive", "/graphic", "/video",
    "/post",
)

In [62]:
def tidy_text(t):
    """
    Normalize a text snippet before it is handed to a model.

    Trims the ends, deletes newlines and tabs, lowercases, removes anything
    that looks like a markup tag and collapses whitespace runs to one space.

    Args:
        t (str): The raw text.

    Returns:
        str: The cleaned, lowercased, single-spaced text.
    """
    # Newlines and tabs are deleted outright rather than replaced by a space,
    # so words separated only by them end up joined together.
    cleaned = t.strip().replace("\n", "").replace("\t", "").lower()
    # Drop tag-like spans such as "<b>" or "</a>"
    without_tags = re.sub('<[^<]+?>', '', cleaned)
    # Squeeze every remaining run of whitespace down to a single space
    return ' '.join(without_tags.split())

In [63]:
def label(url, text=None):
    """
    Predict whether a link points to a story.

    Links are first screened against the domain, subdomain, path and file
    extension blacklists. Whatever survives is scored by a model: the
    path-and-text model when text is supplied, the path-only model otherwise.
    A negative prediction is overturned when the path starts with a
    whitelisted prefix, is longer than 10 characters and contains a hyphen.

    Args:
        url (str): The link's URL. None, NaN, empty and whitespace-only
            values are labeled False.
        text (str, optional): Text accompanying the link (presumably its
            anchor text; confirm against the input data). None, NaN and
            empty values select the path-only model.

    Returns:
        bool: True if the link is predicted to be a story, otherwise False.
    """
    # Drop any nulls
    if not url or pd.isnull(url) or not url.strip():
        return False

    # Pull out the data we need
    urlparts = urlparse(url)
    path = urlparts.path
    tld = tldextract.extract(url)

    # Drop anything we're certain isn't a story
    if tld.domain in DOMAIN_BLACKLIST:
        return False
    if tld.subdomain in SUBDOMAIN_BLACKLIST:
        return False
    if path in PATH_BLACKLIST:
        return False
    # Compare the extension case-insensitively so ".JPG" is caught like ".jpg"
    if os.path.splitext(path)[1].lower() in EXT_BLACKLIST:
        return False

    # A missing CSV cell arrives as NaN, which is truthy and has no string
    # methods; treat it the same as no text at all.
    if text is not None and pd.isnull(text):
        text = None

    # Pick which model we're using, based on the input
    if text:
        text = tidy_text(text)
        model = path_and_text_model
    else:
        model = path_only_model

    # Run a prediction; bool() keeps the return type consistent with the
    # plain False returned by the early exits above
    data = [dict(path=path, text=text)]
    prediction = bool(model.predict(data)[0] == 1)

    # If it's False but it has one of our whitelisted slugs, overturn the decision
    if not prediction:
        if path.startswith(PATHPART_WHITELIST) and len(path) > 10 and "-" in path:
            return True

    # Return the result
    return prediction

## Test

Try some one-off predictions

In [64]:
# URL plus text, so the path-and-text model makes the call
label(
    url="http://www.latimes.com/whatever.html",
    text="This is not a headline",
)

False

In [65]:
# Dated, hyphenated slug plus text, so the path-and-text model makes the call
label(
    url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/",
    text="This is a headline",
)

True

In [66]:
# URL only, so the path-only model makes the call
label(url="http://www.latimes.com/whatever.html")

False

In [67]:
# Dated, hyphenated slug with no text, so the path-only model makes the call
label(url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/")

True

Inspect the inaccurate predictions in our labeled dataset

In [68]:
# Load the hand-labeled links; is_story is parsed as a boolean column
labeled_df = pd.read_csv(
    input_path.joinpath("labeled.csv"),
    dtype=dict(is_story=bool),
)

In [69]:
# Score every labeled link, row by row, from its URL and text
labeled_df['prediction'] = labeled_df.apply(
    lambda row: label(row['url'], row['text']),
    axis="columns",
)

In [70]:
# A row is accurate when the prediction matches the hand-assigned label
labeled_df['is_accurate'] = labeled_df['prediction'].eq(labeled_df['is_story'])

In [71]:
# Share of accurate vs. inaccurate predictions
labeled_df['is_accurate'].value_counts(normalize=True)

True     0.9594
False    0.0406
Name: is_accurate, dtype: float64

In [72]:
# Show only the rows where the prediction disagrees with the label
review_columns = ['text', 'url', 'is_story', 'prediction', 'is_accurate']
labeled_df.loc[~labeled_df['is_accurate'], review_columns]

Unnamed: 0,text,url,is_story,prediction,is_accurate
1,8 \n\t\t\t\t\t\t\t\t \t\t\n\t\t\t\t\t\t\t\t\t...,https://jewishcurrents.org/bad-education,True,False,False
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,True,False,False
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,True,False,False
7,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,False,True,False
21,Follow the storms as they come in.,https://www.11alive.com/radar,False,True,False
...,...,...,...,...,...
2431,switch to the \n International edition,https://www.theguardian.com/preference/edition...,False,True,False
2438,"Conserve and enhance grains with ease, says le...",/sponsored/arid-40927322.html,False,True,False
2508,Explore nearly two decades of Cook County cour...,https://charges.thecircuit.cc/en/,False,True,False
2538,Read More ...,https://www.okayplayer.com/music/flying-lotus-...,True,False,False


Run the labeler against an unlabeled sample of links

In [73]:
# Load the sample of links to score
sample_df = pd.read_csv(input_path.joinpath("sample.csv"))

In [74]:
# Score every sampled link, row by row, from its URL and text
sample_df['prediction'] = sample_df.apply(
    lambda row: label(row['url'], row['text']),
    axis="columns",
)

In [75]:
# NOTE(review): this writes the predictions back over the same input/sample.csv
# that was read above, so the input file is modified in place. Confirm that is
# intended rather than writing to output_path.
sample_df.to_csv(input_path.joinpath("sample.csv"), index=False)