# Train

Creating and testing machine learning models

## Import

Import modules from the Python standard library

In [31]:
import pickle
import pathlib
from urllib.parse import urlparse

Third-party data libraries

In [None]:
import tldextract
import numpy as np
import pandas as pd

Import the scikit-learn machine learning tools

In [32]:
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## Prepare data

Read in our supervised dataset.

In [2]:
# Directory holding the labeled input data, relative to the working directory
input_path = pathlib.Path("input")

In [3]:
# Directory where the trained model is written, relative to the working directory
output_path = pathlib.Path("output")

In [4]:
# Load the hand-labeled links; the is_story label is read as an integer
labeled_df = pd.read_csv(
    input_path / "labeled.csv",
    dtype={"is_story": int},
)

In [5]:
# Show the column dtypes and non-null counts of the loaded frame
labeled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1288 entries, 0 to 1287
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1288 non-null   object
 1   url       1288 non-null   object
 2   handle    1288 non-null   object
 3   is_story  1288 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 40.4+ KB


In [6]:
# Preview the first rows of the labeled data
labeled_df.head()

Unnamed: 0,text,url,handle,is_story
0,About Us,https://100r.org/about/,100reporters,0
1,Asylum for Sale Refugees Say Some U.N. Workers...,https://100r.org/2019/04/unhcr-corruption-refu...,100reporters,1
2,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,100reporters,0
3,Diana Jean Schemo,https://100r.org/author/diana/,100reporters,0
4,Did Industry Funding Influence an FDA Investig...,https://100r.org/2022/07/did-industry-funding-...,100reporters,1


Tidy the headline field

In [7]:
def tidy_text(t):
    """Return a headline with leading and trailing whitespace removed.

    Missing values (``None`` / ``NaN`` — how ``pd.read_csv`` loads an empty
    field by default) are returned as an empty string instead of raising
    ``AttributeError``, so they can be dropped by the later "rows without a
    headline" filter.

    Parameters
    ----------
    t : str or missing value
        The raw headline text.

    Returns
    -------
    str
        The stripped headline, or ``""`` when the value is missing.
    """
    if pd.isna(t):
        return ""
    # str() is a no-op for strings and keeps any stray non-string value usable
    return str(t).strip()

In [8]:
# Replace each headline with its whitespace-trimmed version
labeled_df["text"] = labeled_df["text"].apply(tidy_text)

Extract the paths from the urls

In [9]:
# Keep only the path component of each URL, e.g. "/about/" for https://100r.org/about/
labeled_df["path"] = labeled_df["url"].apply(lambda url: urlparse(url).path)

Extract the domain

In [10]:
# Domain name without suffix or subdomain, e.g. "100r" for https://100r.org/about/
labeled_df["domain"] = labeled_df["url"].apply(lambda url: tldextract.extract(url).domain)

Extract the subdomain

In [11]:
# Subdomain portion of each URL as reported by tldextract
labeled_df["subdomain"] = labeled_df["url"].apply(lambda url: tldextract.extract(url).subdomain)

In [12]:
# Preview the frame again, now including the derived path, domain and subdomain columns
labeled_df.head()

Unnamed: 0,text,url,handle,is_story,path,domain,subdomain
0,About Us,https://100r.org/about/,100reporters,0,/about/,100r,
1,Asylum for Sale Refugees Say Some U.N. Workers...,https://100r.org/2019/04/unhcr-corruption-refu...,100reporters,1,/2019/04/unhcr-corruption-refugee-resettlement/,100r,
2,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,100reporters,0,/about/the-reporters/central-eastern-europe-st...,100r,
3,Diana Jean Schemo,https://100r.org/author/diana/,100reporters,0,/author/diana/,100r,
4,Did Industry Funding Influence an FDA Investig...,https://100r.org/2022/07/did-industry-funding-...,100reporters,1,/2022/07/did-industry-funding-influence-an-fda...,100r,


Remove rows without a headline

In [13]:
# Keep only rows whose tidied headline is not the empty string
training_df = labeled_df[labeled_df["text"] != ""]

Remove blacklisted domains

In [14]:
# Domains whose links are excluded from the training data
DOMAIN_BLACKLIST = (
    "google", "twitter", "facebook", "doubleclick",
    "instagram", "pinterest", "legacy",
)

In [15]:
# Drop every row whose domain appears in DOMAIN_BLACKLIST
training_df = training_df.loc[~training_df["domain"].isin(DOMAIN_BLACKLIST)]

Same for subdomain

In [16]:
# Subdomains whose links are excluded from the training data
SUBDOMAIN_BLACKLIST = ("careers", "mail", "account")

In [17]:
# Drop every row whose subdomain appears in SUBDOMAIN_BLACKLIST
training_df = training_df.loc[~training_df["subdomain"].isin(SUBDOMAIN_BLACKLIST)]

Cut any duplicates

In [18]:
# Remove fully duplicated rows. Reassign rather than using inplace=True:
# training_df was produced by boolean filtering, and mutating such a derived
# frame in place can raise SettingWithCopyWarning. Reassignment also keeps
# the cell safe to re-run.
training_df = training_df.drop_duplicates()

## Train model

Set our label, which is whether the URL leads to a story or not.

In [19]:
# Target vector: the is_story column as a NumPy array
labels = training_df["is_story"].to_numpy()

Use a character n-gram bag-of-words model (`CountVectorizer`) to convert the string fields into numeric count features

In [20]:
# Character-level bag of n-grams, 1 to 3 characters long. min_df/max_df given as
# floats are document-frequency proportions: n-grams found in fewer than 10% or
# more than 90% of the documents are left out of the vocabulary.
vectorizer = CountVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.1,
    max_df=0.9,
)

In [21]:
# Learn the headline vocabulary and turn each headline into a dense row of n-gram counts
text_feature = vectorizer.fit_transform(training_df["text"]).toarray()

In [22]:
# Fit a *separate* vectorizer (same parameters) on the URL paths. Calling
# fit_transform again on `vectorizer` would replace the vocabulary it learned
# from the headlines, leaving no fitted object able to transform new headlines
# into the columns the classifier was trained on. The resulting path features
# are the same as before; only the fitted state is kept apart.
path_vectorizer = CountVectorizer(**vectorizer.get_params())
path_feature = path_vectorizer.fit_transform(training_df["path"]).toarray()

Combine the headline and path features side by side into a single feature matrix

In [23]:
# Join the two count matrices column-wise: one row per link, headline columns then path columns
features = np.concatenate(
    (text_feature, path_feature),
    axis=1,
)

Split the features and labels into a training set and a held-out test set

In [24]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the count vectorizer was fit on all rows before this split, so the
# min_df/max_df vocabulary selection has already seen the test rows.
train, test, train_labels, test_labels = train_test_split(
    features, labels, random_state=45, test_size=0.33
)

Create the classifier

In [25]:
# Gaussian Naive Bayes classifier with scikit-learn's default settings
gnb = GaussianNB()

Train it.

In [26]:
# Fit on the training split; fit() returns the estimator itself, so `model` is the fitted `gnb`
model = gnb.fit(X=train, y=train_labels)

Make predictions.

In [27]:
# Predict the is_story label for every row of the held-out test split
preds = gnb.predict(X=test)

## Review

Review the results

In [28]:
# Fraction of test rows whose predicted label matches the true label
metrics.accuracy_score(y_true=test_labels, y_pred=preds)

0.9496402877697842

In [29]:
# Per-class precision, recall and F1; the report is a plain string, so print it to keep the layout
report = metrics.classification_report(y_true=test_labels, y_pred=preds)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       247
           1       0.95      0.93      0.94       170

    accuracy                           0.95       417
   macro avg       0.95      0.95      0.95       417
weighted avg       0.95      0.95      0.95       417



## Export

Save the model as a pickle

In [30]:
# Make sure the output directory exists so a fresh checkout does not fail with
# FileNotFoundError when the pickle is written.
output_path.mkdir(parents=True, exist_ok=True)

# Serialize the fitted classifier to output/gnb.pickle.
# NOTE(review): only the classifier is saved here. Any fitted vectorizer needed to
# turn new headlines and paths into feature columns is not exported by this cell.
with open(output_path / "gnb.pickle", "wb") as fh:
    pickle.dump(model, fh)