# ADMAGD - 20 news dataset

## Importing libraries

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.matutils import Sparse2Corpus
from gensim import corpora, models
from collections import defaultdict
import re
from string import punctuation

In [2]:
from model.admagd import ADMAGD

## Data fetching

In [3]:
# Names of the twenty newsgroups, listed one per line for readability.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [4]:
# Training split fetched without a `remove` filter; the "From:" lines of these
# posts are what extract_author() parses to build the author list.
newsgroups = fetch_20newsgroups(subset='train')

In [5]:
# Same training split with headers, footers and quoted replies stripped; this
# copy is the text that gets tokenised and modelled.
# NOTE(review): author2doc indexes documents by their position in `newsgroups`,
# so both fetches must return the documents in the same order — confirm.
newsgroup_body = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

## Author Extraction

In [6]:
def extract_author(text):
    """Return the author named in the first "From: " line of *text*.

    The display name is whatever precedes the first angle bracket, e.g.
    ``"From: Jane Doe <jane@example.com>"`` gives ``"Jane Doe"``. A header
    without angle brackets is returned whole (stripped).

    If nothing precedes the bracket (``"From: <jane@example.com>"``), the
    bracketed address is used instead, so that distinct authors are not all
    merged under an empty name.

    Args:
        text: raw post text, including its header lines.

    Returns:
        The author string, or ``"Unknown"`` when no usable "From: " line
        is found.
    """
    match = re.search(r"From: (.+?)(\n|$)", text)
    if not match:
        return "Unknown"

    header_value = match.group(1)
    # Drop everything from the first angle bracket onwards (the address part).
    name = re.sub(r"[<>].*", "", header_value).strip()
    if name:
        return name

    # No display name before the bracket: fall back to the address itself.
    address = re.search(r"<([^<>]+)>", header_value)
    if address and address.group(1).strip():
        return address.group(1).strip()
    return "Unknown"

In [7]:
# One author string per post, in the same order as newsgroups.data.
authors = list(map(extract_author, newsgroups.data))

In [8]:
# Group document positions by author: {author name: [doc_id, ...]}.
author2doc = {}

for doc_id, author in enumerate(authors):
    # setdefault creates the author's list on first sight, then we append to it.
    author2doc.setdefault(author, []).append(doc_id)

## Pre-processing

#### STOP WORDS creation
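Initial stop words are the union of NLTK's English list (`nltk.corpus.stopwords`) and scikit-learn's `ENGLISH_STOP_WORDS` (from `sklearn.feature_extraction._stop_words`); a few extra words are appended afterwards.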

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS # Total 318 words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# English stop words from NLTK, de-duplicated.
nltk_stop_words = list({*stopwords.words('english')})
# English stop words shipped with scikit-learn.
sk_stop_words = list(ENGLISH_STOP_WORDS)

# Union of both sources with duplicates removed.
STOP_WORDS = list({*nltk_stop_words, *sk_stop_words})

In [11]:
# Extra words to treat as stop words on top of the NLTK / scikit-learn lists.
# Only words not already present are appended, so "use" (listed twice in the
# original) is added once and re-running this cell does not grow the list.
for extra_word in ("use", "think", "thanks", "know", "like", "make", "say", "time", "need", "want", "come"):
    if extra_word not in STOP_WORDS:
        STOP_WORDS.append(extra_word)

#### Lemmatizer
Lemmatization reduces text ambiguity by converting the different inflected forms of a word to a common base form; for example, both "bicycle" and "bicycles" become "bicycle". This reduces the number of distinct words in the text and helps produce cleaner, more accurate features for training: the cleaner the data, the more accurate the resulting machine learning model tends to be. A smaller vocabulary also saves memory and computational cost.

In [12]:
import nltk
nltk.download('punkt')
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger') # need for pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
# Shared WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# Maps the first character of a tag returned by pos_tag to a WordNet POS
# constant; any other leading character falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})


def convertWordIntoLemmatizeWord(words):
    """Return the lemma of every token in *words*, in the same order.

    The tokens are POS-tagged together, and each one is lemmatized using the
    WordNet POS that tag_map gives for the first character of its tag.
    """
    lemmas = []
    for token, pos in pos_tag(words):
        wordnet_pos = tag_map[pos[0]]
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

#### Creating a pre-process function
* Remove numbers. ✅
* Convert word into lowercase word. ✅
* Remove all stop words. ✅
* Remove all punctuations. ✅
* Some white spaces may be added to the list of words, due to the translate function & nature of our documents. Remove them as well. ✅
* Remove just-numeric strings. ✅
* Lemmatize. ✅
* Remove words with only 2 characters or less. [Low frequency] ✅
* Remove words with more than 20 characters. [High frequency] ✅

In [14]:
def preprocess(words):
    """Clean one tokenised document and return the surviving lemmas.

    Steps, in order: strip digits, lowercase, strip ASCII punctuation, drop
    empty and purely numeric tokens, drop stop words, lemmatize, drop stop
    words again, and keep only tokens of 3 to 20 characters.

    Args:
        words: list of string tokens belonging to a single document.

    Returns:
        A new list with the cleaned, lemmatized tokens.
    """
    # STOP_WORDS is a list, so each `in` test against it is a linear scan.
    # Hash it once per call so both filtering passes are O(1) per token.
    stop_words = set(STOP_WORDS)
    table = str.maketrans('', '', punctuation)

    candidates = []
    for word in words:
        # Remove numbers, normalise the case, then remove all punctuation.
        word = re.sub(r"\d+", "", word).lower().translate(table)
        # Stripping can leave an empty string; drop those, any token that is
        # still purely numeric, and stop words.
        if not word or word.isdigit() or word in stop_words:
            continue
        candidates.append(word)

    # Lemmatize the whole list in one call so the POS tagger sees the tokens
    # in sequence.
    lemmas = convertWordIntoLemmatizeWord(candidates)

    # Second stop-word pass: lemmatization can turn a word into a stop word.
    # Finally keep only words with 3 to 20 characters.
    return [
        lemma for lemma in lemmas
        if lemma not in stop_words and 2 < len(lemma) <= 20
    ]

In [15]:
def preprocess_documents(docs):
    """Tokenise and clean every document in *docs*.

    Returns a list with one entry per input document; each entry is the list
    of tokens produced by preprocess() (token lists, not joined strings).
    """
    return [preprocess(word_tokenize(document)) for document in docs]

In [16]:
# Tokenise and clean the header-free posts; one token list per document.
preprocessed_docs = preprocess_documents(newsgroup_body.data)

In [17]:
# Build the gensim token <-> integer id mapping from the token lists.
dictionary = corpora.Dictionary(preprocessed_docs)

In [18]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15
# documents or in more than 50% of the documents.
dictionary.filter_extremes(no_below=15, no_above=0.5)

In [19]:
# Bag-of-words representation of every document; tokens that are no longer
# in `dictionary` are ignored by doc2bow.
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

## Model Training

In [20]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

In [21]:
# Apply the TF-IDF weighting; this is the corpus handed to ADMAGD.
# NOTE(review): the weights are real-valued rather than integer counts —
# confirm that the ADMAGD sampler expects weighted input.
corpus_tfidf = tfidf[bow_corpus]

In [22]:
# Vectorized the data
# vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english', max_features=1000)
# X = vectorizer.fit_transform(preprocessed_docs)

# # Convert matrix to list of tuples (document, word count)
# corpus = [list(zip(row.indices, row.data)) for row in X]

# # id2word mapping
# id2word = {v: k for k, v in vectorizer.vocabulary_.items()}

In [23]:
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
# tfidf_X = tfidf_vectorizer.fit_transform(preprocessed_docs)

# # Create the gensim dictionary manually
# id2word_gensim = {i: token for token, i in tfidf_vectorizer.vocabulary_.items()}
# gensim_dictionary = corpora.Dictionary.from_corpus(Sparse2Corpus(tfidf_X, documents_columns=False), id2word=id2word_gensim)

# # Convert the TF-IDF matrix to a gensim corpus.
# corpus_gensim = Sparse2Corpus(tfidf_X, documents_columns=False)  # documents_columns=False for compatibility with gensim

# # Ensure the dictionary is updated with the current corpus
# gensim_dictionary.id2token = id2word_gensim
# gensim_dictionary.token2id = tfidf_vectorizer.vocabulary_


In [24]:
# Initialize alpha, beta, a, and b if different from the defaults
# These four values are passed to the ADMAGD constructor as alpha_init,
# beta_init, a_init and b_init.
# NOTE(review): their meaning and default values are defined in model.admagd
# and are not visible in this notebook — confirm there.
alpha_init = 0.1
beta_init = 0.1
a_init = 0.1
b_init = 0.1

# Number of topics requested from the model (presumably one per newsgroup).
num_topics = 20

In [25]:
# Build the ADMAGD model from the TF-IDF corpus, the pruned dictionary and the
# author -> document-ids mapping (passed under the `authors` keyword).
model = ADMAGD(
    corpus=corpus_tfidf,
    num_topics=num_topics,
    id2word=dictionary,
    authors=author2doc,
    alpha_init=alpha_init,
    beta_init=beta_init,
    a_init=a_init,
    b_init=b_init,
)

In [26]:
# Run Gibbs sampling for 200 iterations; the captured output shows one
# "iteration: N" progress line per iteration.
model.gibbs_sampling(iterations=200)

iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
iteration: 30
iteration: 31
iteration: 32
iteration: 33
iteration: 34
iteration: 35
iteration: 36
iteration: 37
iteration: 38
iteration: 39
iteration: 40
iteration: 41
iteration: 42
iteration: 43
iteration: 44
iteration: 45
iteration: 46
iteration: 47
iteration: 48
iteration: 49
iteration: 50
iteration: 51
iteration: 52
iteration: 53
iteration: 54
iteration: 55
iteration: 56
iteration: 57
iteration: 58
iteration: 59
iteration: 60
iteration: 61
iteration: 62
iteration: 63
iteration: 64
iteration: 65
iteration: 66
iteration: 67
iteration: 68
iteration: 69
iteration: 70
iteration: 71
iteration: 72
i

## Store Model

### Pickle

In [27]:
import pickle

In [28]:
# Base file name (without extension) used by both the pickle and the joblib
# export.
model_file_name = "tfidf_train_extra_stopwords_200_iteration_admagd_model"

Save the model to a file

In [29]:
import os

# Create the output folder first: without it open() raises FileNotFoundError
# and the trained model is never written. The folder name (including the
# space) is kept exactly as it was, since the saved files already use it.
os.makedirs("trained_ model", exist_ok=True)

# Serialise the trained model with pickle.
with open(f"trained_ model/{model_file_name}.pkl", 'wb') as f:
    pickle.dump(model, f)

### joblib

In [30]:
from joblib import dump

Save the model to a file

In [31]:
import os

# Create the output folder first so this export does not fail on a missing
# directory when the cell is run on its own. The folder name (including the
# space) is kept exactly as it was.
os.makedirs("trained_ model", exist_ok=True)

# Serialise the trained model with joblib; dump() returns the list of file
# names it wrote, which is what the cell displays.
dump(model, f"trained_ model/{model_file_name}.joblib")

['trained_ model/tfidf_train_extra_stopwords_200_iteration_admagd_model.joblib']