# Train

Creating and testing machine learning models

## Import

Bring in standard Python tools

In [1]:
import pathlib
from urllib.parse import urlparse

Third-party data libraries

In [2]:
import skops.io
import tldextract
import numpy as np
import pandas as pd

Machine learning gear

In [3]:
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import CountVectorizer

## Prepare data

Read in our supervised dataset.

In [4]:
input_path = pathlib.Path("") / "input"

In [5]:
output_path = pathlib.Path("") / "output"

In [6]:
labeled_df = pd.read_csv(
    input_path / "labeled.csv",
    dtype={"is_story": int, "text": str}
)

In [7]:
labeled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      2891 non-null   object
 1   url       2891 non-null   object
 2   handle    2891 non-null   object
 3   is_story  2891 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 90.5+ KB


In [8]:
labeled_df.head()

Unnamed: 0,text,url,handle,is_story
0,Trouble in Kenya's Flower Fields,https://100r.org/2017/12/trouble-in/,100reporters,1
1,Asylum for Sale Refugees Say Some U.N. Workers...,https://100r.org/2019/04/unhcr-corruption-refu...,100reporters,1
2,Documentaries as AdvertisingCorporate Interest...,https://100r.org/2019/12/documentaries-as-adve...,100reporters,1
3,Pandemic Drives Wave of Property Grabs in Zambia,https://100r.org/2020/12/propertygrabs/,100reporters,1
4,Did Industry Funding Influence an FDA Investig...,https://100r.org/2022/07/did-industry-funding-...,100reporters,1


Extract the paths from the urls

In [9]:
labeled_df['path'] = labeled_df.url.apply(lambda x: urlparse(x).path)

Extract the domain

In [10]:
labeled_df['domain'] = labeled_df.url.apply(lambda x: tldextract.extract(x).domain)

Extract the subdomain

In [11]:
labeled_df['subdomain'] = labeled_df.url.apply(lambda x: tldextract.extract(x).subdomain)

In [12]:
labeled_df.head()

Unnamed: 0,text,url,handle,is_story,path,domain,subdomain
0,Trouble in Kenya's Flower Fields,https://100r.org/2017/12/trouble-in/,100reporters,1,/2017/12/trouble-in/,100r,
1,Asylum for Sale Refugees Say Some U.N. Workers...,https://100r.org/2019/04/unhcr-corruption-refu...,100reporters,1,/2019/04/unhcr-corruption-refugee-resettlement/,100r,
2,Documentaries as AdvertisingCorporate Interest...,https://100r.org/2019/12/documentaries-as-adve...,100reporters,1,/2019/12/documentaries-as-advertising/,100r,
3,Pandemic Drives Wave of Property Grabs in Zambia,https://100r.org/2020/12/propertygrabs/,100reporters,1,/2020/12/propertygrabs/,100r,
4,Did Industry Funding Influence an FDA Investig...,https://100r.org/2022/07/did-industry-funding-...,100reporters,1,/2022/07/did-industry-funding-influence-an-fda...,100r,


Remove rows without a headline

In [13]:
training_df = labeled_df[~(labeled_df.text == "")]

Remove blacklisted domains

In [14]:
DOMAIN_BLACKLIST = (
    "google",
    "twitter",
    "facebook",
    "doubleclick",
    "eventbrite",
    "youtube",
    "vimeo",
    "instagram",
    "ceros",
)

In [15]:
training_df = training_df[~(training_df.domain.isin(DOMAIN_BLACKLIST))]

Same for subdomain

In [16]:
SUBDOMAIN_BLACKLIST = (
    "careers",
    "mail",
    "account",
    "events",
)

In [17]:
training_df = training_df[~(training_df.subdomain.isin(SUBDOMAIN_BLACKLIST))]

Cut any duplicates

In [18]:
training_df.drop_duplicates(inplace=True)

In [19]:
training_df[["text", "path", "is_story"]].to_csv("output/prepared-data.csv", index=False)

## Train model

In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(min_df=0.1, max_df=0.9, ngram_range=(1, 8), analyzer="char"), 'text'),
        ('path', CountVectorizer(min_df=0.1, max_df=0.9, ngram_range=(1, 8), analyzer="char"), 'path')
    ],
    remainder='drop'  # Drop any columns not specified
)

In [21]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC())
])

Set our training set

In [22]:
X = training_df[['text', 'path']].fillna("")
y = training_df['is_story']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42
)

In [24]:
# Fit the pipeline on the training data
model = pipeline.fit(X_train, y_train)

# Test it
preds = model.predict(y_test)

# Print the test results
print(metrics.classification_report(y_train, preds))

ValueError: X does not contain any features, but ColumnTransformer is expecting 2 features

Create a model that uses both fields.

In [42]:
def get_path_and_text_model():
    """Create a model that uses both the `path` and `text` fields."""
    # Create a pipeline that pulls in both fields
    pipe = make_pipeline(
        make_union(text_transformer, path_transformer),
        GaussianNB()
    )
    
    # Train it
    model = pipe.fit(X_train.todense(), y_train)
    
    # Test it
    preds = model.predict(test)
    
    # Print the test results
    print(metrics.classification_report(test_labels, preds))

    # Return it
    return model

In [43]:
path_and_text_model = get_path_and_text_model()

AttributeError: 'DataFrame' object has no attribute 'todense'

Create a model that uses only the path field.

In [27]:
def get_path_only_model():
    """Create a model that uses only the `path` field."""
    # Create a pipeline that pulls just that field
    pipe = make_pipeline(path_transformer, GaussianNB())
    
    # Train it
    model = pipe.fit(train, train_labels)
    
    # Test it
    preds = model.predict(test)
    
    # Print the test results
    print(metrics.classification_report(test_labels, preds))

    # Return it
    return model

In [28]:
path_only_model = get_path_only_model()

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       539
           1       0.92      0.92      0.92       398

    accuracy                           0.93       937
   macro avg       0.93      0.93      0.93       937
weighted avg       0.93      0.93      0.93       937



## Review

In [29]:
example = [
    dict(path="/2019/04/unhcr-corruption-refugee-resettlement/", text="This is a headline"),
    dict(path="/", text="Homepage")
]

In [30]:
path_and_text_model.predict(example)

array([1, 0])

In [31]:
path_only_model.predict(example)

array([1, 0])

## Export

Save the models as pickles

In [33]:
skops.io.dump(path_and_text_model, output_path / "path-and-text-model.skops")

In [34]:
skops.io.dump(path_only_model, output_path / "path-only-model.pickle")