# Label

Run the trained models against fresh data to see how well they perform.

In [26]:
import os
import re
import pathlib
from urllib.parse import urlparse

In [27]:
import dill
import tldextract
import numpy as np
import pandas as pd

## Get models

In [28]:
# Directory holding the CSV inputs, relative to the notebook's working directory
input_path = pathlib.Path("input")

In [29]:
# Directory holding the pickled models, relative to the notebook's working directory
output_path = pathlib.Path("output")

In [30]:
# Load the model that scores a link from both its URL path and its text.
# The context manager closes the file handle as soon as the model is loaded.
# NOTE(review): unpickling executes arbitrary code; only load pickles that this
# project produced itself.
with open(output_path / "path-and-text-model.pickle", "rb") as model_file:
    path_and_text_model = dill.load(model_file)

In [31]:
# Load the fallback model that scores a link from its URL path alone.
# The context manager closes the file handle as soon as the model is loaded.
# NOTE(review): unpickling executes arbitrary code; only load pickles that this
# project produced itself.
with open(output_path / "path-only-model.pickle", "rb") as model_file:
    path_only_model = dill.load(model_file)

## Implement

In [32]:
# Registered domain names (tldextract's `domain` field) whose links label()
# rejects outright, before any model is consulted.
DOMAIN_BLACKLIST = (
    "google", "twitter", "facebook", "doubleclick",
    "instagram", "pinterest", "legacy",
)

In [33]:
# Subdomains (tldextract's `subdomain` field, matched exactly) whose links
# label() rejects outright.
SUBDOMAIN_BLACKLIST = ("careers", "mail", "account")

In [34]:
# URL paths that label() rejects outright: an empty path and the bare root.
PATH_BLACKLIST = ("", "/")

In [35]:
# File extensions (leading dot included, as os.path.splitext returns them)
# that mark a link as a static asset rather than a page.
EXT_BLACKLIST = (".js", ".css", ".jpg", ".gif", ".png")

In [36]:
# Path prefixes that let label() overturn a negative model prediction.
# This must stay a tuple: it is passed directly to str.startswith().
PATHPART_WHITELIST = (
    "/story", "/stories",
    "/article",
    "/feature", "/featured",
    "/blog",
    "/interactive", "/graphic", "/video",
    "/post",
)

In [37]:
def tidy_text(t):
    """Normalize a snippet of link text before it is handed to a model.

    The text is trimmed, stripped of newline and tab characters (they are
    removed outright, not replaced by spaces), lowercased, cleared of anything
    that looks like an HTML tag, and finally has every run of whitespace
    collapsed to a single space.

    Args:
        t: The raw text as a string.

    Returns:
        The cleaned-up string.
    """
    flattened = t.strip().replace("\n", "").replace("\t", "").lower()
    without_tags = re.sub('<[^<]+?>', '', flattened)
    words = without_tags.split()
    return ' '.join(words)

In [38]:
def label(url, text=None):
    """Decide whether a link points to a story.

    Links that match one of the blacklists are rejected immediately. Everything
    else is scored by a trained model: the path-and-text model when usable text
    is supplied, the path-only model otherwise. A negative prediction is
    overturned when the path starts with a whitelisted prefix and looks like a
    slug.

    Args:
        url: The link to classify. Anything that is not a non-blank string
            (None, NaN, pd.NA, "") is labeled False.
        text: Optional text associated with the link. Values that are not
            strings (None, NaN) are treated as missing.

    Returns:
        bool: True if the link is judged to be a story, False otherwise.
    """
    # Drop nulls and blanks. The isinstance check also rejects pandas missing
    # values such as NaN and pd.NA, which are not strings.
    if not isinstance(url, str) or not url.strip():
        return False

    # Pull out the data we need
    urlparts = urlparse(url)
    path = urlparts.path
    tld = tldextract.extract(url)

    # Drop anything we're certain isn't a story
    if tld.domain in DOMAIN_BLACKLIST:
        return False
    if tld.subdomain in SUBDOMAIN_BLACKLIST:
        return False
    if path in PATH_BLACKLIST:
        return False
    # Compare the extension case-insensitively so ".JPG" is caught like ".jpg"
    if os.path.splitext(path)[1].lower() in EXT_BLACKLIST:
        return False

    # Pick which model we're using, based on the input. A missing CSV cell
    # arrives as NaN, which is truthy, so test for a real string instead of
    # relying on truthiness alone.
    if isinstance(text, str) and text:
        text = tidy_text(text)
        model = path_and_text_model
    else:
        if not isinstance(text, str):
            text = None
        model = path_only_model

    # Run a prediction; bool() turns the numpy result into a plain Python bool
    data = [dict(path=path, text=text)]
    prediction = bool(model.predict(data)[0] == 1)

    # If it's False but the path starts with a whitelisted prefix, is longer
    # than 10 characters and contains a hyphen, overturn the decision
    if not prediction:
        if path.startswith(PATHPART_WHITELIST) and len(path) > 10 and "-" in path:
            return True

    # Return the result
    return prediction

## Test

Try some one-off predictions

In [39]:
# Text is supplied, so this goes through the path-and-text model
label(url="http://www.latimes.com/whatever.html", text="This is not a headline")

False

In [40]:
# Text is supplied, so this goes through the path-and-text model
label(
    url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/",
    text="This is a headline",
)

True

In [41]:
# No text is supplied, so this falls back to the path-only model
label(url="http://www.latimes.com/whatever.html")

False

In [42]:
# No text is supplied, so this falls back to the path-only model
label(url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/")

True

Inspect the inaccurate predictions in our labeled dataset

In [43]:
# Read the labeled dataset, parsing the `is_story` column as booleans
labeled_df = pd.read_csv(input_path.joinpath("labeled.csv"), dtype=dict(is_story=bool))

In [44]:
# Score every labeled row with label(), one row at a time
labeled_df['prediction'] = labeled_df.apply(
    lambda row: label(url=row['url'], text=row['text']),
    axis=1,
)

In [45]:
# A row is accurate when the prediction matches the recorded `is_story` value
labeled_df['is_accurate'] = labeled_df['prediction'].eq(labeled_df['is_story'])

In [46]:
# Share of accurate vs. inaccurate predictions
labeled_df['is_accurate'].value_counts(normalize=True)

True     0.959811
False    0.040189
Name: is_accurate, dtype: float64

In [47]:
# Show only the rows where the prediction disagrees with the label
labeled_df.loc[
    ~labeled_df['is_accurate'],
    ['text', 'url', 'is_story', 'prediction', 'is_accurate'],
]

Unnamed: 0,text,url,is_story,prediction,is_accurate
1,8 \n\t\t\t\t\t\t\t\t \t\t\n\t\t\t\t\t\t\t\t\t...,https://jewishcurrents.org/bad-education,True,False,False
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,True,False,False
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,True,False,False
7,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,False,True,False
27,!function(){if (!document.hidden) {let e=docum...,https://www.12newsnow.com/article/news/nation-...,False,True,False
...,...,...,...,...,...
2395,Main Events 07/29/2022,/entertainment/main-events-07-29-2022/article_...,False,True,False
2397,Carmen Lomana,https://www.larazon.es/gente/famosos/20220730/...,False,True,False
2426,Sign up for our email,/info/2018/sep/17/guardian-us-morning-briefing...,False,True,False
2438,"Conserve and enhance grains with ease, says le...",/sponsored/arid-40927322.html,False,True,False


Test an unlabeled sample of links

In [48]:
# Read the sample of links to be scored
sample_df = pd.read_csv(input_path.joinpath("sample.csv"))

In [49]:
# Score every sampled row with label(), one row at a time
sample_df['prediction'] = sample_df.apply(
    lambda row: label(url=row['url'], text=row['text']),
    axis=1,
)

In [50]:
# Write the sample back out with the new `prediction` column.
# NOTE(review): this overwrites the same input/sample.csv that was read above,
# so the original input file is replaced on every run — confirm that is intended.
sample_df.to_csv(input_path.joinpath("sample.csv"), index=False)