# Train

Creating and testing machine learning models

## Import

Bring in standard Python tools

In [34]:
import pathlib
from urllib.parse import urlparse

Third-party data libraries

In [35]:
import dill
import tldextract
import numpy as np
import pandas as pd

Machine learning gear

In [36]:
from sklearn import metrics
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import CountVectorizer

## Prepare data

Read in our supervised dataset.

In [37]:
# Folder, relative to the working directory, holding the labeled source data
input_path = pathlib.Path("input")

In [38]:
# Folder, relative to the working directory, where trained models are written
output_path = pathlib.Path("output")

In [39]:
# Load the hand-labeled links, pinning the dtypes of the label and headline columns
labeled_df = pd.read_csv(
    input_path.joinpath("labeled.csv"),
    dtype=dict(is_story=int, text=str),
)

In [40]:
# Show column dtypes and non-null counts for the loaded frame
labeled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2734 entries, 0 to 2733
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      2734 non-null   object
 1   url       2734 non-null   object
 2   handle    2734 non-null   object
 3   is_story  2734 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 85.6+ KB


In [41]:
# Preview the first rows of the labeled data
labeled_df.head()

Unnamed: 0,text,url,handle,is_story
0,A Tradition of Violence,https://knock-la.com/tradition-of-violence-las...,knockdotla,1
1,8 \n\t\t\t\t\t\t\t\t \t\t\n\t\t\t\t\t\t\t\t\t...,https://jewishcurrents.org/bad-education,jewishcurrents,1
2,69\n\t\t\t\t\t\t\t\n\t\t\t\t\t\tView Slide Show,https://www.nationalreview.com/photos/russia-u...,nro,1
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,lagrangenews,1
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,jewishcurrents,1


Extract the paths from the urls

In [42]:
# Keep only the path component of each URL (scheme, host and query are dropped)
labeled_df["path"] = labeled_df["url"].apply(lambda url: urlparse(url).path)

Extract the domain

In [43]:
# Registered domain name of each URL, as split out by tldextract
labeled_df["domain"] = labeled_df["url"].apply(
    lambda url: tldextract.extract(url).domain
)

Extract the subdomain

In [44]:
# Subdomain portion of each URL (empty string when the URL has none)
labeled_df["subdomain"] = labeled_df["url"].apply(
    lambda url: tldextract.extract(url).subdomain
)

In [45]:
# Preview the frame again, now with the derived path, domain and subdomain columns
labeled_df.head()

Unnamed: 0,text,url,handle,is_story,path,domain,subdomain
0,A Tradition of Violence,https://knock-la.com/tradition-of-violence-las...,knockdotla,1,/tradition-of-violence-lasd-gang-history/,knock-la,
1,8 \n\t\t\t\t\t\t\t\t \t\t\n\t\t\t\t\t\t\t\t\t...,https://jewishcurrents.org/bad-education,jewishcurrents,1,/bad-education,jewishcurrents,
2,69\n\t\t\t\t\t\t\t\n\t\t\t\t\t\tView Slide Show,https://www.nationalreview.com/photos/russia-u...,nro,1,/photos/russia-ukraine-war-week-22/,nationalreview,www
3,20 Under 40,https://www.lagrangenews.com/20-under-40/,lagrangenews,1,/20-under-40/,lagrangenews,www
4,Bela’s Pilgrim,https://jewishcurrents.org/belas-pilgrim,jewishcurrents,1,/belas-pilgrim,jewishcurrents,


Remove rows without a headline

In [46]:
# A blank CSV field is loaded by pandas as NaN rather than "", so a plain
# comparison against "" would let missing headlines through. Exclude both.
training_df = labeled_df[labeled_df.text.notna() & (labeled_df.text != "")]

Remove blacklisted domains

In [47]:
# Domains whose links are excluded from the training set
DOMAIN_BLACKLIST = (
    "google", "twitter", "facebook", "doubleclick",
    "instagram", "pinterest", "legacy",
)

In [48]:
# Keep only rows whose domain is not on the blacklist
training_df = training_df.loc[~training_df["domain"].isin(DOMAIN_BLACKLIST)]

Same for subdomain

In [49]:
# Subdomains whose links are excluded from the training set
SUBDOMAIN_BLACKLIST = ("careers", "mail", "account")

In [50]:
# Keep only rows whose subdomain is not on the blacklist
training_df = training_df.loc[~training_df["subdomain"].isin(SUBDOMAIN_BLACKLIST)]

Cut any duplicates

In [51]:
# Reassign rather than mutate in place: `training_df` is the result of a
# filtered selection, where in-place edits can raise SettingWithCopyWarning.
training_df = training_df.drop_duplicates()

## Train model

Create pipeline transformers that preprocess our text columns and convert the strings into vectors that are ready for analysis.

In [52]:
class DataFrameColumnExtracter(TransformerMixin):
    """Pipeline step that selects a single named field from its input.

    The input may be a list of records that each support ``record[column]``,
    or any other object that supports ``X[column]`` directly.
    """

    def __init__(self, column):
        # Key used to index into the input during `transform`
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this instance unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the values stored under `self.column`.

        A list is read record by record; anything else is indexed directly.
        """
        if not isinstance(X, list):
            return X[self.column]
        return [record[self.column] for record in X]

In [53]:
def densify(x):
    """Expand a sparse CountVectorizer result into the dense array GaussianNB needs."""
    dense = x.toarray()
    return dense

A transformer that can work with our text column.

In [54]:
# Headline features: character n-grams (1 to 8 chars) counted over the `text`
# column, keeping n-grams found in 10%-90% of documents, then made dense.
text_transformer = make_pipeline(
    DataFrameColumnExtracter("text"),
    CountVectorizer(analyzer="char", ngram_range=(1, 8), min_df=0.1, max_df=0.9),
    FunctionTransformer(densify, accept_sparse=True),
)

A transformer that can work with our path column.

In [55]:
# URL-path features: character n-grams (1 to 8 chars) counted over the `path`
# column, keeping n-grams found in 10%-90% of documents, then made dense.
path_transformer = make_pipeline(
    DataFrameColumnExtracter("path"),
    CountVectorizer(analyzer="char", ngram_range=(1, 8), min_df=0.1, max_df=0.9),
    FunctionTransformer(densify, accept_sparse=True),
)

Set our training set

In [56]:
# Features are the whole frame (the transformers pick out the columns they
# need); the target is the `is_story` label.
X, y = training_df, training_df["is_story"]

In [57]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable
train, test, train_labels, test_labels = train_test_split(
    X, y, test_size=0.33, random_state=45
)

Create a model that uses both fields.

In [58]:
def get_path_and_text_model():
    """Fit and evaluate a classifier that uses both the `path` and `text` fields.

    Trains on the notebook-level `train` / `train_labels` split, prints a
    classification report for the `test` / `test_labels` split, and returns
    the fitted pipeline.
    """
    # Place the features from both column transformers side by side
    combined_features = make_union(text_transformer, path_transformer)
    classifier = make_pipeline(combined_features, GaussianNB())

    # Fitting happens in place on the pipeline object
    classifier.fit(train, train_labels)

    # Score the held-out rows and print the report
    predictions = classifier.predict(test)
    report = metrics.classification_report(test_labels, predictions)
    print(report)

    return classifier

In [59]:
# Train the combined model; the call also prints its classification report
path_and_text_model = get_path_and_text_model()

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       534
           1       0.92      0.97      0.94       350

    accuracy                           0.95       884
   macro avg       0.95      0.96      0.95       884
weighted avg       0.95      0.95      0.95       884



Create a model that uses only the path field.

In [60]:
def get_path_only_model():
    """Fit and evaluate a classifier that uses only the `path` field.

    Trains on the notebook-level `train` / `train_labels` split, prints a
    classification report for the `test` / `test_labels` split, and returns
    the fitted pipeline.
    """
    # The path transformer feeds straight into the classifier
    classifier = make_pipeline(path_transformer, GaussianNB())

    # Fitting happens in place on the pipeline object
    classifier.fit(train, train_labels)

    # Score the held-out rows and print the report
    predictions = classifier.predict(test)
    report = metrics.classification_report(test_labels, predictions)
    print(report)

    return classifier

In [61]:
# Train the path-only model; the call also prints its classification report
path_only_model = get_path_only_model()

              precision    recall  f1-score   support

           0       0.97      0.94      0.96       534
           1       0.92      0.95      0.93       350

    accuracy                           0.95       884
   macro avg       0.94      0.95      0.94       884
weighted avg       0.95      0.95      0.95       884



## Review

In [62]:
# Two hand-written records: an article-like path and a bare homepage
example = [
    {
        "path": "/2019/04/unhcr-corruption-refugee-resettlement/",
        "text": "This is a headline",
    },
    {"path": "/", "text": "Homepage"},
]

In [63]:
# Classify the two sample records with the combined path-and-text model
path_and_text_model.predict(example)

array([1, 0])

In [64]:
# Classify the same sample records with the path-only model
path_only_model.predict(example)

array([1, 0])

## Export

Save the models as pickles

In [65]:
# Create the destination folder if needed so the dump does not fail with
# FileNotFoundError after training has already completed.
output_path.mkdir(parents=True, exist_ok=True)
with open(output_path / "path-and-text-model.pickle", 'wb') as fh:
    # dill is used to serialize the fitted pipeline to the open binary file
    dill.dump(path_and_text_model, fh)

In [66]:
# Create the destination folder if needed so the dump does not fail with
# FileNotFoundError after training has already completed.
output_path.mkdir(parents=True, exist_ok=True)
with open(output_path / "path-only-model.pickle", 'wb') as fh:
    # dill is used to serialize the fitted pipeline to the open binary file
    dill.dump(path_only_model, fh)