# Label

Run the trained models against fresh data to see how well they perform.

In [147]:
import pathlib
from urllib.parse import urlparse

In [148]:
import dill
import tldextract
import numpy as np
import pandas as pd

## Get models

In [149]:
# Directory holding the CSV files this notebook reads, relative to the working directory
input_path = pathlib.Path().joinpath("input")

In [150]:
# Directory holding the pickled models, relative to the working directory
output_path = pathlib.Path().joinpath("output")

In [151]:
# NOTE: unpickling with dill can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle as soon as the model is deserialized.
with open(output_path / "path-and-text-model.pickle", "rb") as model_file:
    path_and_text_model = dill.load(model_file)

In [152]:
# NOTE: unpickling with dill can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle as soon as the model is deserialized.
with open(output_path / "path-only-model.pickle", "rb") as model_file:
    path_only_model = dill.load(model_file)

## Implement

In [153]:
# Registered domain names whose links are always labeled as "not a story",
# without consulting a model.
DOMAIN_BLACKLIST = (
    "google", "twitter", "facebook",
    "doubleclick", "instagram", "pinterest",
    "legacy",
)

In [154]:
# Subdomains whose links are always labeled as "not a story",
# without consulting a model. Compared against the full subdomain string.
SUBDOMAIN_BLACKLIST = ("careers", "mail", "account")

In [155]:
def tidy_text(t):
    """
    Normalize a link's text before it is handed to a model.

    Leading and trailing whitespace is trimmed, every newline and tab
    character is deleted outright (no space is inserted in its place),
    the text is lowercased, and any remaining runs of whitespace are
    collapsed to a single space.
    """
    without_breaks = t.strip().replace("\n", "").replace("\t", "")
    words = without_breaks.lower().split()
    return ' '.join(words)

In [156]:
def label(url, text=None):
    """
    Predict whether a link points to a story.

    Args:
        url: The link's URL. A null value (as judged by ``pd.isnull``)
            is labeled False without consulting a model.
        text: Optional text of the link. When it is present and non-empty
            it is normalized with ``tidy_text`` and the path-and-text model
            is used; otherwise the path-only model is used. Null values
            such as the NaN pandas produces for empty CSV cells are
            treated the same as ``None``.

    Returns:
        bool: True when the chosen model predicts class 1, otherwise False.
    """
    if pd.isnull(url):
        return False

    # Pull out the data we need
    path = urlparse(url).path
    tld = tldextract.extract(url)

    # Drop anything we're certain isn't a story
    if tld.domain in DOMAIN_BLACKLIST or tld.subdomain in SUBDOMAIN_BLACKLIST:
        return False

    # NaN is a truthy float, so without this check a missing CSV cell would
    # be routed to tidy_text and crash on the missing str methods.
    if pd.isnull(text):
        text = None

    # Pick which model we're using, based on the input
    if text:
        text = tidy_text(text)
        model = path_and_text_model
    else:
        model = path_only_model

    # Run a prediction
    data = [dict(path=path, text=text)]
    prediction = model.predict(data)

    # Return a plain bool so every exit path yields the same type
    return bool(prediction[0] == 1)

## Test

Try some one-off predictions

In [157]:
# Both URL and link text supplied
label(
    url="http://www.latimes.com/whatever.html",
    text="This is not a headline",
)

False

In [158]:
# Both URL and link text supplied
label(
    url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/",
    text="This is a headline",
)

True

In [159]:
# URL only, so the path-only model is used
label(url="http://www.latimes.com/whatever.html")

False

In [160]:
# URL only, so the path-only model is used
label(url="http://www.latimes.com/2019/04/unhcr-corruption-refugee-resettlement/")

True

Inspect the inaccurate predictions in our labeled dataset

In [161]:
# Read the labeled links, parsing the is_story column as booleans
labeled_df = pd.read_csv(
    input_path.joinpath("labeled.csv"),
    dtype={"is_story": bool},
)

In [162]:
# Label every row from its URL and link text
labeled_df['prediction'] = labeled_df.apply(
    lambda row: label(row['url'], row['text']),
    axis="columns",
)

In [163]:
# A prediction is accurate when it matches the is_story label
labeled_df['is_accurate'] = labeled_df['prediction'].eq(labeled_df['is_story'])

In [164]:
# Tally accurate vs. inaccurate predictions
labeled_df['is_accurate'].value_counts()

True     1931
False      97
Name: is_accurate, dtype: int64

In [165]:
# Show only the rows where the prediction disagrees with the label
labeled_df.loc[
    ~labeled_df['is_accurate'],
    ['text', 'url', 'is_story', 'prediction', 'is_accurate'],
]

Unnamed: 0,text,url,is_story,prediction,is_accurate
1,8\n\n\t\t\t\t\t\t\t\t\t \t\tBad Education,https://jewishcurrents.org/bad-education,True,False,False
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,True,False,False
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,True,False,False
7,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,False,True,False
27,!function(){if (!document.hidden) {let e=docum...,https://www.12newsnow.com/article/news/nation-...,False,True,False
...,...,...,...,...,...
1943,Fall Camp Report,/news/fall-camp-report/article_5526d5ec-0fdc-1...,False,True,False
1950,Consumer Reports: Best dehumidifiers,https://www.nbcnews.com/select/shopping/consum...,False,True,False
1955,Contact,https://www.nbcnews.com/news/us-news/nbc-news-...,False,True,False
1984,Central Iowa Arts Articles,https://littlevillagemag.com/category/central-...,False,True,False


Test an unlabeled sample of links

In [166]:
# Read the sample of links
sample_df = pd.read_csv(input_path.joinpath("sample.csv"))

In [167]:
# Label every sampled row from its URL and link text
sample_df['prediction'] = sample_df.apply(
    lambda row: label(row['url'], row['text']),
    axis="columns",
)

In [168]:
# NOTE: this writes back to the same file the sample was read from,
# replacing it with a copy that carries the prediction column.
sample_df.to_csv(
    input_path.joinpath("sample.csv"),
    index=False,
)