In [1]:
import string
import nltk
import numpy as np
from string import punctuation
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk.corpus import wordnet

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /Users/kahncant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kahncant/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kahncant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kahncant/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kahncant/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/kahncant/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
data_train = fetch_20newsgroups(
    subset="train",
    categories=[
      'rec.motorcycles',
      'rec.sport.hockey',
      'sci.electronics',
      'sci.space'
    ],
    shuffle=True,
    random_state=42,
    remove=("headers", "footers", "quotes"),
)

In [4]:
data_test = fetch_20newsgroups(
    subset="test",
    categories=[
      'rec.motorcycles',
      'rec.sport.hockey',
      'sci.electronics',
      'sci.space'
    ],
    shuffle=True,
    random_state=42,
    remove=("headers", "footers", "quotes"),
)

In [5]:
class NLP:  
    def __init__(self, data):
        self.data = data
        self.stop_words = set(stopwords.words('english'))

        self.lowered = self.lowercase(self.data)
        self.punctuated = self.remove_punctuation(self.lowered)
        self.cleared = self.remove_hidden_characters(self.punctuated)
        self.blacked = self.whitespace_removal(self.cleared)
        self.tokenised = self.tokenise(self.blacked)
        self.cleaned = self.remove_stopwords(self.tokenised)
        self.texted = self.remove_numeric_tokens(self.cleaned)
        self.completed = self.remove_short_tokens(self.cleaned)

        self.stemmed = self.preprocess(self.completed, reduction='s')
        self.lemmatised = self.preprocess(self.completed, reduction='l')

        self.vocabulary_stemmed = self.build_vocabulary(self.stemmed)
        self.vocabulary_lemmatised = self.build_vocabulary(self.lemmatised)
        
    def lowercase(self, texts):
        return [t.lower() for t in texts]
    
    def remove_punctuation(self, texts):
        cleaned_texts = []
        for t in texts:
            t = t.translate(str.maketrans('', '', punctuation))
            cleaned_texts.append(t)
        return cleaned_texts
    
    def remove_hidden_characters(self, texts):
        cleaned_texts = []
        for t in texts:
            cleaned_texts.append(t.replace("\n", " ").replace("\t", " ").replace("\'", ''))
        return cleaned_texts
    
    def whitespace_removal(self, texts):
        return [t.strip() for t in texts]
    
    def tokenise(self, texts):
        return [word_tokenize(t) for t in texts]
    
    def remove_stopwords(self, texts):
        cleaned_texts = []
        for text in texts:
            cleaned_texts.append([w for w in text if not w in self.stop_words])
        return cleaned_texts
    
    def remove_numeric_tokens(self, texts):
        return [[w for w in text if not w.isdigit() and not any(char.isdigit() for char in w)] 
                for text in texts]
    
    def remove_short_tokens(self, texts, min_length=2):
        return [[w for w in text if len(w) >= min_length] for text in texts]
    
    def stem_texts(self, texts):
        stemmer = PorterStemmer()
        return [[stemmer.stem(word) for word in text] for text in texts]
    
    @staticmethod
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  

    def lemmatise_texts(self, texts):
        lemmatiser = WordNetLemmatizer()
        lemmatised_texts = []
        for text in texts:
            pos_tags = pos_tag(text)
            lemmatised = [lemmatiser.lemmatize(word, self.get_wordnet_pos(pos)) 
                        for word, pos in pos_tags]
            lemmatised_texts.append(lemmatised)
        return lemmatised_texts
        
    def preprocess(self, texts, reduction):
        if reduction == 's':
            texts = self.stem_texts(texts)
        elif reduction == 'l':
            texts = self.lemmatise_texts(texts)
        return texts
    
    def build_vocabulary(self, texts):
        vocabulary = {}
        for text in texts:
          for word in text:
            if word not in vocabulary:
              vocabulary[word] = len(vocabulary)
        return vocabulary

In [6]:
def encode(texts, vocabulary):
  dataset = []
  for t in texts:
    feature_vector = [0] * len(vocabulary)
    for w in t:
      if w in vocabulary:
        feature_vector[vocabulary[w]] = 1
    dataset.append(feature_vector)
  return np.array(dataset), vocabulary

In [7]:
train = NLP(data_train.data)
test = NLP(data_test.data)

In [8]:
S_train_features, S_train_vocabulary = encode(train.stemmed, train.vocabulary_stemmed)
S_test_features, _ = encode(test.stemmed, train.vocabulary_stemmed)

L_train_features, L_train_vocabulary = encode(train.lemmatised, train.vocabulary_lemmatised)
L_test_features, _ = encode(test.lemmatised, train.vocabulary_lemmatised)

print(f"Stemmed features shape: {S_train_features.shape}")
print(f"Lemmatised features shape: {L_train_features.shape}")
print(f"Feature dimension difference: {L_train_features.shape[1] - S_train_features.shape[1]:,}")

Stemmed features shape: (2382, 24336)
Lemmatised features shape: (2382, 27455)
Feature dimension difference: 3,119


In [9]:
S_y_test = data_test.target
S_x_train = S_train_features
S_y_train = data_train.target
S_x_test = S_test_features

L_y_test = data_test.target
L_x_train = L_train_features
L_y_train = data_train.target
L_x_test = L_test_features

In [10]:
S_clf = LogisticRegression(max_iter=1000)
L_clf = LogisticRegression(max_iter=1000)

S_clf.fit(S_x_train, S_y_train)
L_clf.fit(L_x_train, L_y_train)

In [11]:

S_y_pred = S_clf.predict(S_x_test)
L_y_pred = L_clf.predict(L_x_test)

print("\n--- Stemmed Model ---")
print(f"Accuracy (Stemmed): {accuracy_score(S_y_test, S_y_pred):.2f}")
print(f"Precision (Stemmed): {precision_score(S_y_test, S_y_pred, average='macro'):.2f}")
print(f"Recall (Stemmed): {recall_score(S_y_test, S_y_pred, average='macro'):.2f}")

print("\n--- Lemmatised Model ---")
print(f"Accuracy (Lemmatised): {accuracy_score(L_y_test, L_y_pred):.2f}")
print(f"Precision (Lemmatised): {precision_score(L_y_test, L_y_pred, average='macro'):.2f}")
print(f"Recall (Lemmatised): {recall_score(L_y_test, L_y_pred, average='macro'):.2f}")


--- Stemmed Model ---
Accuracy (Stemmed): 0.82
Precision (Stemmed): 0.83
Recall (Stemmed): 0.82

--- Lemmatised Model ---
Accuracy (Lemmatised): 0.83
Precision (Lemmatised): 0.83
Recall (Lemmatised): 0.83


In [12]:
cv_scores_s = cross_val_score(LogisticRegression(max_iter=1000, random_state=42), 
                              S_train_features, data_train.target, 
                              cv=5, scoring='accuracy')

cv_scores_l = cross_val_score(LogisticRegression(max_iter=1000, random_state=42), 
                              L_train_features, data_train.target, 
                              cv=5, scoring='accuracy')

print(f"\nStemmed CV Scores: {[f'{s:.3f}' for s in cv_scores_s]}")
print(f"Stemmed CV Accuracy: {cv_scores_s.mean():.4f} (+/- {cv_scores_s.std():.4f})")

print(f"\nLemmatised CV Scores: {[f'{s:.3f}' for s in cv_scores_l]}")
print(f"Lemmatised CV Accuracy: {cv_scores_l.mean():.4f} (+/- {cv_scores_l.std():.4f})")

print(f"\nCV Improvement: {(cv_scores_l.mean() - cv_scores_s.mean())*100:+.2f}%")


Stemmed CV Scores: ['0.826', '0.862', '0.813', '0.828', '0.836']
Stemmed CV Accuracy: 0.8329 (+/- 0.0162)

Lemmatised CV Scores: ['0.824', '0.855', '0.811', '0.821', '0.840']
Lemmatised CV Accuracy: 0.8304 (+/- 0.0156)

CV Improvement: -0.25%
