# Label

Run the trained models against fresh data to see how well they perform.

In [1]:
import os
import re
import pathlib
from urllib.parse import urlparse

In [2]:
import dill
import tldextract
import numpy as np
import pandas as pd

## Get models

In [3]:
# Directory (relative to the notebook's working directory) holding the CSV inputs.
input_path = pathlib.Path("input")

In [4]:
# Directory (relative to the notebook's working directory) holding the pickled models.
output_path = pathlib.Path("output")

In [5]:
# Load the model that predicts from both the URL path and the link text.
# The context manager closes the file handle as soon as unpickling finishes
# instead of leaving it open until the garbage collector gets to it.
# NOTE(review): dill/pickle can execute arbitrary code while loading; only
# load model files that were produced by a trusted source.
with open(output_path / "path-and-text-model.pickle", "rb") as model_file:
    path_and_text_model = dill.load(model_file)

In [6]:
# Load the model that predicts from the URL path alone.
# The context manager closes the file handle as soon as unpickling finishes
# instead of leaving it open until the garbage collector gets to it.
# NOTE(review): dill/pickle can execute arbitrary code while loading; only
# load model files that were produced by a trusted source.
with open(output_path / "path-only-model.pickle", "rb") as model_file:
    path_only_model = dill.load(model_file)

## Implement

In [7]:
# Registered domains (the `domain` part returned by tldextract) that label()
# rejects without consulting a model.
DOMAIN_BLACKLIST = (
    "google", "twitter", "facebook", "doubleclick",
    "instagram", "pinterest", "legacy",
)

In [8]:
# Subdomains (the `subdomain` part returned by tldextract) that label()
# rejects without consulting a model. The comparison is an exact match.
SUBDOMAIN_BLACKLIST = ("careers", "mail", "account")

In [9]:
# URL paths that label() rejects outright: an empty path or the bare root.
PATH_BLACKLIST = ("", "/")

In [10]:
# File extensions (leading dot included, as returned by os.path.splitext)
# that label() rejects without consulting a model.
EXT_BLACKLIST = (".js", ".css", ".jpg", ".gif", ".png")

In [11]:
# Path prefixes that let label() overturn a negative model prediction.
# This must stay a tuple: it is passed directly to str.startswith().
PATHPART_WHITELIST = (
    "/story", "/stories",
    "/article",
    "/feature", "/featured",
    "/blog",
    "/interactive",
    "/graphic",
    "/video",
    "/post",
)

In [12]:
def tidy_text(t):
    """Normalize a piece of link text before it is handed to a model.

    Steps, in order: trim the ends, delete newline and tab characters,
    lower-case, strip anything that looks like an HTML tag, and collapse
    runs of whitespace into single spaces.

    NOTE: newlines and tabs are deleted rather than replaced by a space, so
    words separated only by those characters end up fused together.

    Args:
        t: The raw text (a str).

    Returns:
        The cleaned-up string.
    """
    flattened = t.strip().replace("\n", "").replace("\t", "").lower()
    without_tags = re.sub('<[^<]+?>', '', flattened)
    words = without_tags.split()
    return ' '.join(words)

In [13]:
def label(url, text=None):
    """Predict whether a link points to a story.

    Hard-coded blacklists are checked first. Everything that survives is
    scored by one of the two loaded models, and a negative prediction can
    still be overturned by the path whitelist.

    Args:
        url: The link's URL. Anything that is not a non-empty string
            (None, "", or the float NaN pandas uses for a missing cell)
            is rejected.
        text: Optional text that accompanies the link. When it is a
            non-empty string the path-and-text model is used; otherwise
            the path-only model is used.

    Returns:
        bool: True when the link is judged to be a story, False otherwise.
    """
    # Drop any nulls. pandas represents a missing CSV cell as float NaN, which
    # is truthy, so a plain truthiness test would let it through to urlparse.
    if not isinstance(url, str) or not url:
        return False

    # Pull out the data we need
    urlparts = urlparse(url)
    path = urlparts.path
    tld = tldextract.extract(url)

    # Drop anything we're certain isn't a story
    if tld.domain in DOMAIN_BLACKLIST:
        return False
    elif tld.subdomain in SUBDOMAIN_BLACKLIST:
        return False
    # Kill anything in one of our blacklists
    elif path in PATH_BLACKLIST:
        return False
    # Lower-case the extension so that e.g. ".JPG" is caught as well
    elif os.path.splitext(path)[1].lower() in EXT_BLACKLIST:
        return False

    # Pick which model we're using, based on the input. Non-string text
    # (None or NaN) counts as "no text" and is normalized to None so the
    # path-only model never receives a NaN.
    if isinstance(text, str) and text:
        text = tidy_text(text)
        model = path_and_text_model
    else:
        text = None
        model = path_only_model

    # Run a prediction; bool() turns the model's scalar into a plain Python
    # bool so every return path yields the same type.
    data = [dict(path=path, text=text)]
    prediction = bool(model.predict(data)[0] == 1)

    # If it's False but it has one of our whitelisted slugs, overturn the decision
    if not prediction:
        if path.startswith(PATHPART_WHITELIST) and len(path) > 10 and "-" in path:
            return True

    # Return the result
    return prediction

## Test

Try some one-off predictions

In [14]:
# One-off check with link text supplied, so the path-and-text model is used.
label(
    url="http://www.latimes.com/whatever.html",
    text="This is not a headline",
)

False

In [15]:
# One-off check with link text supplied, so the path-and-text model is used.
label(
    url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/",
    text="This is a headline",
)

True

In [16]:
# One-off check without link text, so the path-only model is used.
label(url="http://www.latimes.com/whatever.html")

False

In [17]:
# One-off check without link text, so the path-only model is used.
label(url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/")

True

Inspect the inaccurate predictions in our labeled dataset

In [18]:
# Load the labeled links; `is_story` is read as a boolean column.
labeled_df = pd.read_csv(
    input_path / "labeled.csv",
    dtype={"is_story": bool},
)

In [19]:
# Score every labeled link from its URL and text, one row at a time.
labeled_df['prediction'] = [
    label(url, text)
    for url, text in zip(labeled_df['url'], labeled_df['text'])
]

In [20]:
# A row is accurate when the prediction matches the `is_story` label.
labeled_df['is_accurate'] = labeled_df['prediction'].eq(labeled_df['is_story'])

In [21]:
# Share of accurate vs. inaccurate predictions.
labeled_df['is_accurate'].value_counts(normalize=True)

True     0.956036
False    0.043964
Name: is_accurate, dtype: float64

In [22]:
# Show only the misclassified rows, limited to the columns worth eyeballing.
labeled_df.loc[
    ~labeled_df['is_accurate'],
    ['text', 'url', 'is_story', 'prediction', 'is_accurate'],
]

Unnamed: 0,text,url,is_story,prediction,is_accurate
1,8\n\n\t\t\t\t\t\t\t\t\t \t\tBad Education,https://jewishcurrents.org/bad-education,True,False,False
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,True,False,False
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,True,False,False
27,!function(){if (!document.hidden) {let e=docum...,https://www.12newsnow.com/article/news/nation-...,False,True,False
59,Changing Child Care,https://19thnews.org/collections/changing-chil...,False,True,False
...,...,...,...,...,...
2751,Spelling BeeHow many words can you make with 7...,https://www.nytimes.com/puzzles/spelling-bee,False,True,False
2752,© 2022 The New York Times Company,https://help.nytimes.com/hc/en-us/articles/115...,False,True,False
2753,VertexConnect the dots to reveal the hidden pi...,https://www.nytimes.com/puzzles/vertex,False,True,False
2755,New York Times GamesSubscribe for full access ...,https://www.nytimes.com/subscription/games,False,True,False


Test an unlabeled sample of links

In [23]:
# Load the sample of links to be scored.
sample_df = pd.read_csv(input_path.joinpath("sample.csv"))

In [None]:
# Score every sampled link from its URL and text, one row at a time.
sample_df['prediction'] = [
    label(url, text)
    for url, text in zip(sample_df['url'], sample_df['text'])
]

In [None]:
# Write the sample, now carrying a `prediction` column, back to disk.
# NOTE(review): this overwrites the input file it was read from; confirm that
# is intended rather than writing to the output directory.
sample_df.to_csv(input_path.joinpath("sample.csv"), index=False)