We begin by making appropriate imports as well as loading the data needed for NLTK and Spacy.

In [8]:
import math
import os
import random

from pprint import pprint
from typing import List, Dict

import nltk
import numpy as np
import spacy
# Download the required dataset from NLTK
nltk.download("stopwords", quiet=True)

from nltk.corpus import stopwords
from sklearn import preprocessing, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

# If this fails, please run `python -m spacy download en_core_web_sm`
nlp = spacy.load("en_core_web_sm")

Two functions can then be defined to load the data from the text files to memory

In [9]:
def load_corpus(folder: str) -> List[str]:
    """Load strings from folder of text

    Args:
        folder (str): The path to the folder to load

    Returns:
        List[str]: List of strings retrieved from text files in the folder
    """
    corpus = []
    # Crawl all subfolders
    for root, dirs, files in os.walk(folder, topdown=False):
        for name in files:
            try:
                with open(os.path.join(root, name), "r") as fp:
                    # Some of the files have non-unicode characters in them so this can fail
                    corpus.append(fp.read())
            except UnicodeDecodeError as e:
                ... # Let the error pass silently
                # print(e.__str__(), "for", os.path.join(root, name))
    return corpus

def load_corpuses(folder: str) -> Dict[str, List[str]]:
    """Load corpuses from sub-folders of specified folder

    Args:
        folder (str): The parent folder

    Returns:
        Dict[str, List[str]]: Dictionary of corpuses
    """
    sub_folders = []
    for root, dirs, files in os.walk(folder):
        if dirs:
            for dir_ in dirs:
                sub_folders.append(dir_)

    corpuses = {}
    for sub_folder in sub_folders:
        corpuses[sub_folder] = load_corpus(os.path.join(folder, sub_folder))
    return corpuses

We then build our dataset using these corpuses. The $x$ vector is made from three features:
- Word frequencies
- Frequency of named entity types
- Weighted word frequencies

Once constructed, the data set is shuffled.

In [10]:
x = []
y =[]

corpuses = load_corpuses("bbc")

# From the dictionary, generate 2 lists of x and y data
for corpus in corpuses:
    for story in corpuses[corpus]:
        x.append(story)
        y.append(corpus)

# Shuffle x and y in the same way
c = list(zip(x, y))
random.shuffle(c)
x, y = zip(*c)

Test and training data are then sampled using a 20:80 split respectively. The $Y$ values are then encoded so that they can be used as labels within the SVM.

In [11]:
# Specify the 60:20:20 split
size_dataset_full=len(x)
size_test = int(round(size_dataset_full*0.2,0))
size_dev = int(round(size_dataset_full*0.2,0))

list_test_indices=random.sample(range(size_dataset_full), size_test)

test_x = x[:size_test]
test_y = y[:size_test]
dev_x = x[size_test+1:size_dev + size_test]
dev_y = y[size_test+1:size_dev + size_test]
train_x = x[size_dev + size_test:]
train_y = y[size_dev + size_test:]

# Encode the labels using the labels present in the Y data
le = preprocessing.LabelEncoder()
le.fit(train_y)
train_y = le.transform(train_y)

# Possible for this to fail as a label could be in test that isn't in train
test_y = le.transform(test_y)
dev_y = le.transform(dev_y)

In [12]:
def feature_extraction(stories: List[str]) -> List[List[int]]:
    """Extracts features from a list of strings

    Args:
        stories (List[str]): Strings to extract features from

    Returns:
        List[List[int]]: List of vectors which can be used in a model
    """

    # Fit a counter for all of the named entity types in Spacy
    entity_types = CountVectorizer(stop_words=stopwords.words('english'))
    entity_types.fit(['CARDINAL', 'PERSON', 'GPE', 'MONEY', 'ORG', 'ORDINAL', 'WORK_OF_ART', 'NORP', 'PERCENT', 'DATE', 'LANGUAGE', 'FAC', 'LOC', 'TIME', 'PRODUCT', 'EVENT', 'QUANTITY', 'LAW'])

    # Iterate through all of the training data and process it
    processed_stories = []
    for story in tqdm(stories):
        # Apply Spacy NLP to the story
        analysed = nlp(story)
        processed_stories.append(
            # Word grequency matrix
            list(vectorizer.transform([story]).toarray()[0]) +
            # Named entity type frequency
            list(entity_types.transform([tag.label_ for tag in analysed.ents]).toarray()[0]) +
            # Weighted word frequency
            list(tfid.transform(vectorizer.transform([story])).toarray()[0])
        )
    return processed_stories

We then define two of the feature extraction methods. `CountVectorizer` builds a vocabulary from the previously loaded training data. `TfidfTransformer` is then built using the matrix provided by `CountVectorizer`.

The combination of the vectors resulted in very large $x$ vectors to train on. The best 500 features are selected using the $\chi^2$ method

In [6]:
# Word frequency counter setup
vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
# Weighted word frequency counter setup
tfid = TfidfTransformer()

# Fit the counters to the test data
vectorizer.fit(train_x)
tfid.fit(vectorizer.transform(train_x))

# Perform the pre-processing
train_x = feature_extraction(train_x)

# Select only the best 500 features
get_best=SelectKBest(chi2, k=500).fit(train_x, train_y)
train_x_chi = get_best.transform(train_x)

  0%|          | 0/1779 [00:00<?, ?it/s]

<class 'list'>


The SVM object is constructed. The pipeline includes passing the data through the `StandardScaler` function which "Standardize \[sic\] features by removing the mean and scaling to unit variance".

In [7]:
svm_clf=make_pipeline(StandardScaler(), svm.SVC(cache_size=10000, decision_function_shape='ovo'))

We can then pass the training data to the SVM to train the model.

In [8]:
svm_clf.fit(train_x_chi, train_y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(cache_size=10000, decision_function_shape='ovo'))])

Using this model, we can run the training data through it in order to evaluate the SVM.

In [9]:
Y_text_predictions = svm_clf.predict(get_best.transform(feature_extraction(dev_x)))

  0%|          | 0/445 [00:00<?, ?it/s]

The `classification_report` function allows us to easily generate a report on the success of the SVM by providing known good $Y$ values as well as $Y$ values attained through the SVM.

In [10]:
print(classification_report(dev_y, Y_text_predictions, target_names=le.inverse_transform(svm_clf.classes_)))

               precision    recall  f1-score   support

     business       0.95      0.96      0.96       100
entertainment       0.95      0.95      0.95        66
     politics       0.99      0.92      0.95        87
        sport       1.00      0.94      0.97       112
         tech       0.84      0.96      0.90        80

     accuracy                           0.95       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445



In order to use the SVM, a large amount of preprocessing needs to be done on a string. This has been encapsulated in the following function:

In [11]:
def predict(story: str) -> str:
    """Gives a genre prediction for a news story

    Args:
        story (str): The plaintext of the story

    Returns:
        str: The genre of the story
    """
    return le.inverse_transform(
        svm_clf.predict(
            get_best.transform(
                feature_extraction([story])
            )
        )
    )[0]

We can then try this function with a news story.

In [12]:
predict("""
This guide is a concise summary of the main policies being put forward by each party.

The policy areas featured in the guide were selected using polling data on what the public consider to be the most important issues facing the country.

While some issues such as health and education are the responsibility of the Scottish Parliament, others such as foreign policy and Brexit are decided at the UK parliament at Westminster. You can read more about how devolution works here.

More information on how the issues and parties were selected is in our methodology.

A full list of parties standing at the election will be published after nominations have closed.
""")

  0%|          | 0/1 [00:00<?, ?it/s]

'politics'