# Final Testing on Holdout Set With Trained Multinomial Logistic Regression Model

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Previewing-the-Holdout-Set" data-toc-modified-id="Previewing-the-Holdout-Set-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Previewing the Holdout Set</a></span></li><li><span><a href="#Label-Encoding" data-toc-modified-id="Label-Encoding-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Label Encoding</a></span></li><li><span><a href="#Feature-Selection" data-toc-modified-id="Feature-Selection-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature Selection</a></span></li><li><span><a href="#Testing-Model-Performance" data-toc-modified-id="Testing-Model-Performance-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Testing Model Performance</a></span><ul class="toc-item"><li><span><a href="#Get-The-Performance-Metrics" data-toc-modified-id="Get-The-Performance-Metrics-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Get The Performance Metrics</a></span></li></ul></li></ul></div>

In [1]:
# Import libraries
import joblib
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Import the Holdout Set
holdout_set = pd.read_csv("../clean-datasets/exported-from-db/AllTheNews21_Holdout.csv")

In [3]:
# Import the model to test
pickle_file = "../saved-classification-models/allthenews21/MultinomialLogisticRegression-TFIDF5000-Best-RandomizedSearch3CV.pkl"

In [4]:
# This is required by the content of the pickle file
def process_text(text):

    """
    Preprocess a given text: 
        - Lowercase
        - Tokenize
        - Remove non-needed tokens
        - Lemmatize
        - Clean
    """

    # Convert to lowercase, replace newlines with spaces, strip whitespaces
    text = text.lower().strip()

    # Tokenize
    word_tokens = word_tokenize(text)
    # Convert to a numpy array
    word_tokens = np.array(word_tokens)

    # Keep only alphabetic characters
    is_alpha = list(map(str.isalpha, word_tokens))
    word_tokens = word_tokens[is_alpha]

    # Remove stopwords
    custom_stopwords = ["said", "say", "says"]
    stop_words = set(stopwords.words("english") + custom_stopwords)
    is_not_stopword = list(map(lambda token: token not in stop_words, word_tokens))
    word_tokens = word_tokens[is_not_stopword]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    vectorize_lemmatizer = np.vectorize(lemmatizer.lemmatize)
    word_tokens = vectorize_lemmatizer(word_tokens)

    # Convert into a setence form
    sentence = " ".join(word_tokens)

    # Return final tokenized sentence
    return sentence

In [5]:
# Then, recreate the prediction model
with open(pickle_file, 'rb') as file:
    trained_model_lr = joblib.load(file)

## Previewing the Holdout Set

In [6]:
display(holdout_set.shape)
display(holdout_set.head())

(3149, 4)

Unnamed: 0,article,category,article_length,word_count
0,"This is the first part of the series, Escape f...",health and wellness,20307,3468
1,(CNN)The US Food and Drug Administration has ...,health and wellness,4394,664
2,(CNN)Since the Supreme Court first convened i...,politics,3848,635
3,Washington (CNN)Former Trump campaign adviser ...,politics,5429,866
4,Washington (CNN)President Donald Trump sought ...,politics,8291,1335


In [7]:
holdout_set.groupby(["category"]).count()

Unnamed: 0_level_0,article,article_length,word_count
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arts and entertainment,150,150,150
automobiles,150,150,150
business,150,150,150
climate and environment,150,150,150
energy,150,150,150
finance and economics,150,150,150
food,150,150,150
global healthcare,150,150,150
health and wellness,149,149,149
legal and crimes,150,150,150


## Label Encoding

In [8]:
# First, we need to get the labels encoded
# We will use the mapping that we got from training and map to here

label_encoding_mapping = [
    ('arts and entertainment', 0),
    ('automobiles', 1),
    ('business', 2),
    ('climate and environment', 3),
    ('energy', 4),
    ('finance and economics', 5),
    ('food', 6),
    ('global healthcare', 7),
    ('health and wellness', 8),
    ('legal and crimes', 9),
    ('life', 10),
    ('markets and investments', 11),
    ('personal finance', 12),
    ('politics', 13),
    ('real estate', 14),
    ('science and technology', 15),
    ('sports', 16),
    ('travel and transportation', 17),
    ('us', 18),
    ('wealth', 19),
    ('world', 20)
]

# Define a mapping function
def label_encoder(st):
    for label, code in label_encoding_mapping:
        if st == label:
            return code

# Apply mapping function to the dataset
holdout_set["category_target"] = holdout_set["category"].map(label_encoder)

# Check result
holdout_set.head()

Unnamed: 0,article,category,article_length,word_count,category_target
0,"This is the first part of the series, Escape f...",health and wellness,20307,3468,8
1,(CNN)The US Food and Drug Administration has ...,health and wellness,4394,664,8
2,(CNN)Since the Supreme Court first convened i...,politics,3848,635,13
3,Washington (CNN)Former Trump campaign adviser ...,politics,5429,866,13
4,Washington (CNN)President Donald Trump sought ...,politics,8291,1335,13


## Feature Selection

In [9]:
# All we need are the article and the category target
X_holdout = holdout_set["article"]
y_holdout = holdout_set["category_target"]

In [10]:
print(X_holdout)

0       This is the first part of the series, Escape f...
1        (CNN)The US Food and Drug Administration has ...
2        (CNN)Since the Supreme Court first convened i...
3       Washington (CNN)Former Trump campaign adviser ...
4       Washington (CNN)President Donald Trump sought ...
                              ...                        
3144     (CNN)The children of Junior Seau, the superst...
3145     (CNN)Gigi Hadid took to Instagram to address ...
3146    LONDON, (Reuters) - People who use cannabis ev...
3147    After Dr. Celeste Good of Greenville, N.C., bo...
3148     (CNN)The Supreme Court just handed a huge vic...
Name: article, Length: 3149, dtype: object


In [11]:
print(y_holdout)

0        8
1        8
2       13
3       13
4       13
        ..
3144    18
3145     0
3146     8
3147    14
3148    13
Name: category_target, Length: 3149, dtype: int64


## Testing Model Performance

In [12]:
# Predict on the test data
labels_pred = trained_model_lr.predict(X_holdout)
display(labels_pred)

array([18,  8, 18, ...,  8, 14, 13], dtype=int64)

### Get The Performance Metrics

In [14]:
# Accuracy
accuracy = accuracy_score(y_holdout, labels_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7300730390600191


In [15]:
# Classification Report
report = classification_report(y_holdout, labels_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.75      0.71       150
           1       0.79      0.88      0.83       150
           2       0.41      0.39      0.40       150
           3       0.75      0.84      0.79       150
           4       0.66      0.74      0.70       150
           5       0.51      0.51      0.51       150
           6       0.93      0.87      0.90       150
           7       0.79      0.77      0.78       150
           8       0.72      0.74      0.73       149
           9       0.81      0.72      0.76       150
          10       0.68      0.71      0.69       150
          11       0.69      0.49      0.57       150
          12       0.87      0.84      0.85       150
          13       0.72      0.73      0.72       150
          14       0.88      0.87      0.88       150
          15       0.66      0.65      0.66       150
          16       0.85      0.90      0.87       150
   

In [17]:
pd.options.display.max_rows = 3000
pd.options.display.max_columns = 50

In [18]:
# Print Confusion Matrix
pd.DataFrame(confusion_matrix(y_holdout, labels_pred))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,112,0,0,1,0,0,1,0,2,2,13,0,1,1,1,8,3,1,1,1,2
1,0,132,3,1,2,1,0,1,0,0,1,2,1,2,0,0,1,0,1,1,1
2,0,8,59,2,11,23,2,4,0,0,1,6,0,2,0,8,1,2,3,10,8
3,0,2,2,126,3,0,0,0,0,2,1,0,0,1,0,3,3,0,1,1,5
4,0,0,4,7,111,8,1,4,0,0,0,4,0,1,0,0,0,0,0,0,10
5,0,0,17,6,5,77,0,10,0,0,0,13,1,7,1,2,0,0,0,7,4
6,4,1,1,0,0,0,131,0,3,1,2,1,0,1,3,0,0,2,0,0,0
7,0,2,7,2,4,8,0,115,2,0,0,2,0,0,0,0,5,1,0,0,2
8,2,0,1,1,0,0,0,2,111,1,10,1,4,3,0,3,2,0,5,0,3
9,5,1,1,1,0,0,0,0,6,108,4,0,1,3,0,8,1,2,7,1,1
