# Train

Creating and testing machine learning models

## Import

Bring in standard Python tools

In [1]:
import pathlib
from urllib.parse import urlparse

Third-party data libraries

In [2]:
import dill
import tldextract
import numpy as np
import pandas as pd

Machine learning gear

In [3]:
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import CountVectorizer

## Prepare data

Read in our supervised dataset.

In [4]:
# Folder the labeled data is read from, relative to the working directory
input_path = pathlib.Path("input")

In [5]:
# Folder the trained models are written to, relative to the working directory
output_path = pathlib.Path("output")

In [6]:
# Load the hand-labeled examples, pinning the dtypes of the label and headline columns
labeled_df = pd.read_csv(input_path / "labeled.csv", dtype=dict(is_story=int, text=str))

In [7]:
# Print the column dtypes and non-null counts of the loaded frame
labeled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2775 entries, 0 to 2774
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      2775 non-null   object
 1   url       2775 non-null   object
 2   handle    2775 non-null   object
 3   is_story  2775 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 86.8+ KB


In [8]:
# Preview the first rows of the labeled data
labeled_df.head()

Unnamed: 0,text,url,handle,is_story
0,A Tradition of Violence,https://knock-la.com/tradition-of-violence-las...,knockdotla,1
1,8\n\n\t\t\t\t\t\t\t\t\t \t\tBad Education,https://jewishcurrents.org/bad-education,jewishcurrents,1
2,69\n\n\t\t\t\t\t\tView Slide Show,https://www.nationalreview.com/photos/russia-u...,nro,1
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,lagrangenews,1
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,jewishcurrents,1


Extract the paths from the urls

In [9]:
# Keep only the path component of each URL
labeled_df["path"] = labeled_df["url"].map(lambda url: urlparse(url).path)

Extract the domain

In [10]:
# Registered domain name of each URL, as split out by tldextract
labeled_df["domain"] = labeled_df["url"].map(lambda url: tldextract.extract(url).domain)

Extract the subdomain

In [11]:
# Subdomain of each URL, as split out by tldextract (empty string when there is none)
labeled_df["subdomain"] = labeled_df["url"].map(lambda url: tldextract.extract(url).subdomain)

In [12]:
# Preview the frame again to check the new path, domain and subdomain columns
labeled_df.head()

Unnamed: 0,text,url,handle,is_story,path,domain,subdomain
0,A Tradition of Violence,https://knock-la.com/tradition-of-violence-las...,knockdotla,1,/tradition-of-violence-lasd-gang-history/,knock-la,
1,8\n\n\t\t\t\t\t\t\t\t\t \t\tBad Education,https://jewishcurrents.org/bad-education,jewishcurrents,1,/bad-education,jewishcurrents,
2,69\n\n\t\t\t\t\t\tView Slide Show,https://www.nationalreview.com/photos/russia-u...,nro,1,/photos/russia-ukraine-war-week-22/,nationalreview,www
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,lagrangenews,1,/20-under-40/,lagrangenews,www
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,jewishcurrents,1,/belas-pilgrim,jewishcurrents,


Remove rows without a headline

In [13]:
# Start the training set from every row whose headline is not an empty string
training_df = labeled_df.loc[labeled_df["text"] != ""]

Remove blacklisted domains

In [14]:
# Domains whose rows are excluded from the training set.
# Compared against the `domain` column extracted from each URL.
DOMAIN_BLACKLIST = (
    "google",
    "twitter",
    "facebook",
    "doubleclick",
    "eventbrite",
    "youtube",
    "vimeo",
    "instagram",
    "ceros",
)

In [15]:
# Drop every row whose domain appears in the blacklist
training_df = training_df.loc[~training_df["domain"].isin(DOMAIN_BLACKLIST)]

Same for subdomain

In [16]:
# Subdomains whose rows are excluded from the training set.
# Compared against the `subdomain` column extracted from each URL.
SUBDOMAIN_BLACKLIST = (
    "careers",
    "mail",
    "account",
    "events",
)

In [17]:
# Drop every row whose subdomain appears in the blacklist
training_df = training_df.loc[~training_df["subdomain"].isin(SUBDOMAIN_BLACKLIST)]

Cut any duplicates

In [18]:
# Reassign rather than mutate in place: `training_df` was produced by boolean
# filtering, and in-place edits on such a frame can raise SettingWithCopyWarning.
# Reassignment also keeps the cell safe to re-run.
training_df = training_df.drop_duplicates()

## Train model

Create pipeline transformers that preprocess our text columns and convert the strings into vectors that are ready for analysis.

In [19]:
class DataFrameColumnExtracter(TransformerMixin, BaseEstimator):
    """A pipeline step that pulls a single named field out of the input.

    Inheriting from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) gives the step ``get_params`` / ``set_params``. That lets it be
    cloned, tuned and inspected like any other estimator when it sits inside a
    ``Pipeline`` or ``FeatureUnion``.

    Parameters
    ----------
    column : hashable
        Key used to index the input, e.g. a DataFrame column name.
    """

    def __init__(self, column):
        # Stored under the same name as the constructor argument so that
        # get_params/clone can rebuild an identical instance.
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this step has no state to learn."""
        return self

    def transform(self, X, y=None):
        """Return the configured field from ``X``.

        A list is treated as a sequence of records and yields a list with
        ``record[self.column]`` for each one. Anything else is indexed
        directly with ``X[self.column]``.
        """
        if isinstance(X, list):
            return [record[self.column] for record in X]
        return X[self.column]

In [20]:
def densify(x):
    """Condense a sparse CountVectorizer result into an array that GaussianNB can work with.

    Objects exposing ``toarray()`` (such as SciPy sparse matrices) are converted
    with it. Input without that method is assumed to be dense already and is
    returned unchanged instead of raising ``AttributeError``.
    """
    if hasattr(x, "toarray"):
        return x.toarray()
    return x

A transformer that can work with our text column.

In [21]:
text_transformer = make_pipeline(
    # Pull the `text` values out of the incoming records
    DataFrameColumnExtracter("text"),
    # Count character n-grams of 1 to 8 characters, skipping those found in
    # fewer than 10% or more than 90% of the documents
    CountVectorizer(analyzer="char", ngram_range=(1, 8), min_df=0.1, max_df=0.9),
    # Convert the sparse counts into a dense array for the classifier
    FunctionTransformer(func=densify, accept_sparse=True),
)

A transformer that can work with our path column.

In [22]:
path_transformer = make_pipeline(
    # Pull the `path` values out of the incoming records
    DataFrameColumnExtracter("path"),
    # Count character n-grams of 1 to 8 characters, skipping those found in
    # fewer than 10% or more than 90% of the documents
    CountVectorizer(analyzer="char", ngram_range=(1, 8), min_df=0.1, max_df=0.9),
    # Convert the sparse counts into a dense array for the classifier
    FunctionTransformer(func=densify, accept_sparse=True),
)

Assemble the features and labels, then split them into training and test sets

In [23]:
# Features: headline text and URL path, with missing values replaced by empty strings
X = training_df.loc[:, ["text", "path"]].fillna(value="")
# Target: the story / not-story label
y = training_df.loc[:, "is_story"]

In [24]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible
train, test, train_labels, test_labels = train_test_split(
    X, y, random_state=45, test_size=0.33
)

Create a model that uses both fields.

In [25]:
def get_path_and_text_model():
    """Create a model that uses both the `path` and `text` fields.

    Fits a GaussianNB pipeline over the combined text and path features using
    the notebook-level ``train`` / ``train_labels`` split, prints a
    classification report for ``test`` / ``test_labels`` and returns the
    fitted pipeline.
    """
    # Fit unfitted copies of the shared transformers. Pipeline steps are
    # fitted in place, so using the module-level objects directly would let a
    # later fit of another model silently change this model's vocabulary.
    features = make_union(clone(text_transformer), clone(path_transformer))

    # Create a pipeline that pulls in both fields
    pipe = make_pipeline(features, GaussianNB())

    # Train it
    model = pipe.fit(train, train_labels)

    # Test it
    preds = model.predict(test)

    # Print the test results
    print(metrics.classification_report(test_labels, preds))

    # Return it
    return model

In [26]:
# Fit the combined model; the classification report is printed as a side effect
path_and_text_model = get_path_and_text_model()

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       537
           1       0.93      0.96      0.94       361

    accuracy                           0.95       898
   macro avg       0.95      0.95      0.95       898
weighted avg       0.95      0.95      0.95       898



Create a model that uses only the path field.

In [27]:
def get_path_only_model():
    """Create a model that uses only the `path` field.

    Fits a GaussianNB pipeline over the path features using the notebook-level
    ``train`` / ``train_labels`` split, prints a classification report for
    ``test`` / ``test_labels`` and returns the fitted pipeline.
    """
    # Fit an unfitted copy of the shared transformer. Pipeline steps are
    # fitted in place, so using the module-level object directly would refit
    # the same instance that other models built from it rely on.
    pipe = make_pipeline(clone(path_transformer), GaussianNB())

    # Train it
    model = pipe.fit(train, train_labels)

    # Test it
    preds = model.predict(test)

    # Print the test results
    print(metrics.classification_report(test_labels, preds))

    # Return it
    return model

In [28]:
# Fit the path-only model; the classification report is printed as a side effect
path_only_model = get_path_only_model()

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       537
           1       0.93      0.94      0.94       361

    accuracy                           0.95       898
   macro avg       0.95      0.95      0.95       898
weighted avg       0.95      0.95      0.95       898



## Review

In [29]:
# Two hand-written records: an article-style URL with a headline, and a homepage
example = [
    {
        "path": "/2019/04/unhcr-corruption-refugee-resettlement/",
        "text": "This is a headline",
    },
    {"path": "/", "text": "Homepage"},
]

In [30]:
# Classify the example records with the combined path-and-text model
path_and_text_model.predict(example)

array([1, 0])

In [31]:
# Classify the same example records with the path-only model
path_only_model.predict(example)

array([1, 0])

## Export

Save the models as pickles

In [None]:
# Create the destination folder if needed so the export works on a fresh checkout
output_path.mkdir(parents=True, exist_ok=True)

# Serialize the fitted combined model with dill
with open(output_path / "path-and-text-model.pickle", 'wb') as fh:
    dill.dump(path_and_text_model, fh)

In [None]:
# Create the destination folder if needed so the export works on a fresh checkout
output_path.mkdir(parents=True, exist_ok=True)

# Serialize the fitted path-only model with dill
with open(output_path / "path-only-model.pickle", 'wb') as fh:
    dill.dump(path_only_model, fh)