<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import joblib

In [2]:
# Path (relative to the notebook's working directory) of the serialized model
# that is loaded further down with joblib.
pickle_file = "./models/MultinomialLogisticRegression-TFIDF5000-Best-RandomizedSearch3CV.pkl"

In [3]:
# For pre-processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources that process_text relies on:
#   - punkt / punkt_tab: tokenizer data for word_tokenize. Recent NLTK releases
#     load "punkt_tab" instead of the pickled "punkt" models, so both are
#     requested to keep the notebook runnable across NLTK versions.
#   - stopwords: the English stopword list.
#   - wordnet: the lexical database behind WordNetLemmatizer.
# nltk.download reports (rather than raises) when a resource is unknown to the
# installed version, so asking for both tokenizer packages is safe.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# Define Preprocessing function, which will be used by the TFIDF Vectorizer
def process_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order:
        - Lowercase and strip surrounding whitespace
        - Tokenize with NLTK's ``word_tokenize``
        - Drop every token that contains a non-alphabetic character
        - Drop English stopwords plus the custom words "said", "say", "says"
        - Lemmatize each remaining token with ``WordNetLemmatizer``

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces. An empty string is
        returned when no token survives filtering (for example empty or
        purely numeric input).
    """
    # Plain Python lists are used on purpose: the function then needs nothing
    # beyond the NLTK names imported above, and an empty token list simply
    # yields "" instead of failing inside np.vectorize.
    tokens = word_tokenize(text.lower().strip())

    # NLTK's English stopwords extended with reporting verbs.
    stop_words = set(stopwords.words("english"))
    stop_words.update(("said", "say", "says"))

    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    # Convert back into sentence form.
    return " ".join(lemmas)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maeva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maeva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maeva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Deserialize the trained model from disk.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only point
# pickle_file at artifacts from a trusted source.
# NOTE(review): presumably the pickled object refers to process_text by name,
# which would require that function to be defined before this cell runs - confirm.
with open(pickle_file, mode="rb") as model_fh:
    model = joblib.load(model_fh)

In [5]:
# Numeric class id -> human-readable category label.
_CATEGORY_BY_ID = {
    0: 'arts and entertainment',
    1: 'automobiles',
    2: 'business',
    3: 'climate and environment',
    4: 'energy',
    5: 'finance and economics',
    6: 'food',
    7: 'global healthcare',
    8: 'health and wellness',
    9: 'legal and crimes',
    10: 'life',
    11: 'markets and investments',
    12: 'personal finance',
    13: 'politics',
    14: 'real estate',
    15: 'science and technology',
    16: 'sports',
    17: 'travel and transportation',
    18: 'us',
    19: 'wealth',
    20: 'world',
}


def predict_category(text):
    """Return the category label predicted for a document.

    Uses the module-level ``model`` loaded earlier in the notebook.

    Parameters
    ----------
    text : str or iterable of str
        Either a single raw document or a batch of documents. When a batch
        is given, only the prediction for its first document is returned.

    Returns
    -------
    str or None
        The label for the predicted class id, or ``None`` when the id is not
        present in the category mapping.
    """
    # A bare string is itself an iterable (of characters); wrap it so the
    # model always receives a batch of whole documents.
    documents = [text] if isinstance(text, str) else text

    predicted_num = model.predict(documents)[0]

    # Direct lookup instead of scanning a list of (label, id) pairs.
    return _CATEGORY_BY_ID.get(predicted_num)

In [6]:
# Sample news article used to exercise predict_category. It is wrapped in a
# one-element list because the function hands its argument straight to
# model.predict and reads the first prediction.
text = ["""
(CNN) — Italian police have seized more than 200 pounds of sand, stones and shells stolen from the beaches of Sardinia last year, dishing out fines to dozens of tourists who took them as souvenirs.
The items were returned to the beaches they were taken from earlier this week, the Guardia di Finanza -- Italy's finance police -- said in a statement.
The Italian island's idyllic white sand is protected, and tourists face hefty fines and even jail time for removing it from local beaches.
Police said 41 people had been fined in connection with the spate of sand and shell thefts; the penalties issued range from 500 to 3,000 euros ($600 to $3,650).
The seizures -- totaling more than 100 kilograms (220 pounds) -- took place ''despite the significant decrease in the number of tourists on the island the past summer season'' due to the Covid-19 pandemic, the statement said.
Police carried out regular checks on departing travelers at the island's Olbia Costa Smeralda Airport, as well as on various e-commerce sites where the sand was being sold, they added.
"""]

In [7]:
# Classify the sample article; the cell's output is the predicted category label.
predict_category(text)

'travel and transportation'