# Train

Create and test a machine learning model that predicts whether a link leads to a story.

In [1]:
import pickle
import pathlib
import numpy as np
import pandas as pd
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

Read in our supervised dataset.

In [2]:
# Directory the notebook reads its input files from (relative to the working directory).
input_path = pathlib.Path("input")

In [3]:
# Directory the notebook writes its artifacts to (relative to the working directory).
output_path = pathlib.Path("output")

In [4]:
# Load the hand-coded links, forcing the `is_story` label column to integers.
coded_df = pd.read_csv(
    input_path / "coded.csv",
    dtype={"is_story": int},
)

In [5]:
# Summarize the columns, non-null counts and dtypes of the loaded frame.
coded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1288 entries, 0 to 1287
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1288 non-null   object
 1   url       1288 non-null   object
 2   handle    1288 non-null   object
 3   is_story  1288 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 40.4+ KB


In [6]:
# Preview the first rows of the coded dataset.
coded_df.head()

Unnamed: 0,text,url,handle,is_story
0,About Us,https://100r.org/about/,100reporters,0
1,Asylum for Sale Refugees Say Some U.N. Workers...,https://100r.org/2019/04/unhcr-corruption-refu...,100reporters,1
2,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,100reporters,0
3,Diana Jean Schemo,https://100r.org/author/diana/,100reporters,0
4,Did Industry Funding Influence an FDA Investig...,https://100r.org/2022/07/did-industry-funding-...,100reporters,1


Extract the paths from the URLs.

In [7]:
# Store only the path component of each URL in a new `path` column.
coded_df["path"] = coded_df["url"].apply(lambda link: urlparse(link).path)

In [8]:
# Preview the frame again to check the newly added `path` column.
coded_df.head()

Unnamed: 0,text,url,handle,is_story,path
0,About Us,https://100r.org/about/,100reporters,0,/about/
1,Asylum for Sale Refugees Say Some U.N. Workers...,https://100r.org/2019/04/unhcr-corruption-refu...,100reporters,1,/2019/04/unhcr-corruption-refugee-resettlement/
2,Central/Eastern Europe,https://100r.org/about/the-reporters/central-e...,100reporters,0,/about/the-reporters/central-eastern-europe-st...
3,Diana Jean Schemo,https://100r.org/author/diana/,100reporters,0,/author/diana/
4,Did Industry Funding Influence an FDA Investig...,https://100r.org/2022/07/did-industry-funding-...,100reporters,1,/2022/07/did-industry-funding-influence-an-fda...


Set our label, which is whether the URL leads to a story or not.

In [9]:
# Display names for the two `is_story` classes (presumably 0 -> "no", 1 -> "yes").
# NOTE(review): not referenced by any later cell visible in this notebook.
target_names = ["no", "yes"]

In [10]:
# Label vector: the `is_story` column as a NumPy array.
labels = coded_df["is_story"].to_numpy()

Use character n-gram counts to convert the string fields into numeric features.

In [11]:
# Names of the two string columns that are vectorized below.
# NOTE(review): not referenced by any later cell visible in this notebook.
feature_names = [
    "text",
    "path",
]

In [295]:
# Character-level bag of n-grams (1 to 3 characters long). n-grams that occur
# in fewer than 10% or in more than 90% of the documents are dropped.
vectorizer = CountVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.1,
    max_df=0.9,
)

In [296]:
# Learn the n-gram vocabulary from the link text and turn it into a dense count matrix.
# Note: the vocabulary is fit on every row, before the train/test split below.
text_feature = vectorizer.fit_transform(coded_df["text"]).toarray()

In [297]:
# Fit an independent vectorizer (same settings) on the URL paths.
# Re-fitting the shared `vectorizer` here would overwrite the vocabulary it
# learned from the link text, leaving no fitted text vectorizer to transform
# new data with at prediction time. The resulting count matrix is unchanged.
path_vectorizer = CountVectorizer(**vectorizer.get_params())
path_feature = path_vectorizer.fit_transform(coded_df.path).toarray()

Combine the two string features into a single feature matrix.

In [298]:
# Place the text and path count matrices side by side, one row per link.
features = np.hstack([text_feature, path_feature])

Split the data into a training set and a held-out test set.

In [299]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
train, test, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=45
)

Create the classifier

In [300]:
# Gaussian naive Bayes classifier with default settings.
gnb = GaussianNB()

Train it.

In [301]:
# Fit the classifier on the training split.
# NOTE(review): scikit-learn's `fit` conventionally returns the estimator itself,
# so `model` and `gnb` presumably refer to the same object — confirm.
model = gnb.fit(train, train_labels)

Make predictions.

In [302]:
# Predict labels for the held-out test rows.
preds = gnb.predict(test)

Review the results

In [303]:
# Accuracy of the predictions against the true test labels.
accuracy_score(test_labels, preds)

0.9577464788732394

In [304]:
# Per-class precision, recall and F1 on the test split.
# `target_names` is not passed, so the rows are labelled with the raw class values 0 and 1.
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96       260
           1       0.93      0.96      0.95       166

    accuracy                           0.96       426
   macro avg       0.95      0.96      0.96       426
weighted avg       0.96      0.96      0.96       426



In [305]:
# Persist the trained classifier to output/gnb.pickle.
# Create the output directory first so a fresh checkout doesn't fail on open().
# NOTE: only the classifier is saved; the fitted vectorizer state used to build
# the feature matrix is not persisted by this cell.
output_path.mkdir(parents=True, exist_ok=True)
with open(output_path / "gnb.pickle", "wb") as fh:
    pickle.dump(model, fh)