# ADMAGD - 20 news dataset

## Importing libraries

In [1]:
import numpy as np
from gensim import corpora
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict
import re
from string import punctuation

In [2]:
from model.admagd import ADMAGD

## Data fetching

In [14]:
categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc' ]

In [15]:
newsgroups = fetch_20newsgroups(subset='all', categories = categories)

## Author Extraction

In [16]:
def extract_author(text):
    match = re.search(r"From: (.+?)(\n|$)", text)
    if match:
        author = match.group(1)
        return re.sub(r"[<>].*", "", author).strip()
    return "Unknown"

In [17]:
authors = [extract_author(doc) for doc in newsgroups.data]

In [18]:
author2doc = {}

for doc_id, author in enumerate(authors):
    if author not in author2doc:
        author2doc[author] = []
    author2doc[author].append(doc_id)

## Pre-processing

#### STOP WORDS creation
Initial stop words from `sklearn.feature_extraction._stop_words`

In [47]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS # Total 318 words

stopWords=list(ENGLISH_STOP_WORDS)

Adding some more common stop words throughout all docs which doesn't play no part in classification

In [48]:
stopWords+=['subject','from', 'date', 'reply-to', 'newsgroups', 'message-id', 'lines', 'path', 'organization', 
            'would', 'writes', 'references', 'article', 'sender', 'nntp-posting-host', 'people', 
            'university', 'think', 'xref', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution', 'first', 
            'anyone', 'really', 'since', 'believe', 'still', 
            "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'"]

stopWords = set(stopWords)

#### Lemmatizer
Lemmatizer minimizes text ambiguity. Example words like bicycle or bicycles are converted to base word bicycle. Basically, it will convert all words having the same meaning but different representation to their base form. It reduces the word density in the given text and helps in preparing the accurate features for training machine. Cleaner the data, the more intelligent and accurate your machine learning model, will be. NLTK Lemmatizer will also saves memory as well as computational cost.

In [49]:
import nltk
nltk.download('punkt')
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger') # need for pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [50]:
# Initialization
lemmatizer = WordNetLemmatizer()

# Creating a POS tag map
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

def convertWordIntoLemmatizeWord(words):
  return [lemmatizer.lemmatize(word, tag_map[tag[0]]) for word, tag in pos_tag(words)]

#### Creating a pre-process function
* Remove numbers. ✅
* Convert word into lowercase word. ✅
* Remove all stop words. ✅
* Remove all punctuations. ✅
* Some white spaces may be added to the list of words, due to the translate function & nature of our documents. Remove them as well. ✅
* Remove just-numeric strings. ✅
* Lemmatize. ✅
* Remove words with only 2 characters or less. [Low frequency] ✅
* Remove words with more than 12 characters. [High frequency] ✅

In [51]:
def preprocess(words):
  #Remove all stop words
  words = [word for word in words if word not in stopWords]
  #First, remove numbers
  words = [re.sub(r"\d+", "", word) for word in words]
  #Normalize the cases of our words
  words = [word.lower() for word in words]
  #Remove all punctuations
  table = str.maketrans('', '', punctuation)
  words = [word.translate(table) for word in words]
  #Some white spaces may be added to the list of words, due to the translate function & nature of our documents. We've to remove them.
  words = [word for word in words if word]
  #Remove just-numeric strings
  words = [word for word in words if not word.isdigit()]
  #Lemmatize
  words = convertWordIntoLemmatizeWord(words)
  #Remove words with less than 3 characters and more than 12 characters
  words = [word for word in words if len(word) > 2 and len(word) <= 12]
  return words

In [52]:
def preprocess_documents(docs):
  preprocessed_docs = []
  for doc in docs:
    words = word_tokenize(doc)
    words = preprocess(words)
    preprocessed_docs.append(" ".join(words))
  return preprocessed_docs

In [53]:
preprocessed_docs = preprocess_documents(newsgroups.data)

## Model Training

In [54]:
# Vectorized the data
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english', max_features=1000)
X = vectorizer.fit_transform(preprocessed_docs)

# Convert matrix to list of tuples (document, word count)
corpus = [list(zip(row.indices, row.data)) for row in X]

# id2word mapping
id2word = {v: k for k, v in vectorizer.vocabulary_.items()}

In [55]:
# Initialize alpha, beta, a, and b if different from the defaults
alpha_init = 0.1
beta_init = 0.1
a_init = 0.1
b_init = 0.1

num_topics = 20

In [56]:
# Instantiate the ADMAGD model
model = ADMAGD(corpus=corpus, num_topics=num_topics, id2word=id2word, authors=author2doc, alpha_init=alpha_init, beta_init=beta_init, a_init=a_init, b_init=b_init)

In [57]:
# Run Gibbs sampling
model.gibbs_sampling(iterations=100)

iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
iteration: 30
iteration: 31
iteration: 32
iteration: 33
iteration: 34
iteration: 35
iteration: 36
iteration: 37
iteration: 38
iteration: 39
iteration: 40
iteration: 41
iteration: 42
iteration: 43
iteration: 44
iteration: 45
iteration: 46
iteration: 47
iteration: 48
iteration: 49
iteration: 50
iteration: 51
iteration: 52
iteration: 53
iteration: 54
iteration: 55
iteration: 56
iteration: 57
iteration: 58
iteration: 59
iteration: 60
iteration: 61
iteration: 62
iteration: 63
iteration: 64
iteration: 65
iteration: 66
iteration: 67
iteration: 68
iteration: 69
iteration: 70
iteration: 71
it

## Store Model

### Pickle

In [5]:
import pickle

Save the model to a file

In [78]:
with open('trained_ model/admagd_model.pkl', 'wb') as f:
    pickle.dump(model, f)

Load the model from a file

In [6]:
with open('trained_ model/admagd_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

### joblib

In [75]:
from joblib import dump
from joblib import load

Save the model to a file

In [72]:

dump(model, 'trained_ model/admagd_model.joblib')


['trained_ model/admagd_model.joblib']

Load the model from a file

In [76]:
loaded_model = load('trained_ model/admagd_model.joblib')

## Extract word for each topic

In [8]:
# After you've run Gibbs sampling
word_topic_matrix = loaded_model.word_topic_matrix
word_topic_sum = word_topic_matrix.sum(axis=1)[:, np.newaxis]
word_topic_dist = word_topic_matrix / word_topic_sum

In [10]:
# Visualize the top N words for each topic
N_TOP_WORDS = 20
for i in range(loaded_model.num_topics):
    top_words_idx = word_topic_dist[i].argsort()[-N_TOP_WORDS:][::-1]
    top_words = [loaded_model.id2word[idx] for idx in top_words_idx]
    print(f"Topic {i + 1}: {', '.join(top_words)} \n")


Topic 1: like, just, use, know, apr, distribution, university, say, dod, good, make, work, thing, need, time, new, usa, want, look, year 

Topic 2: university, israel, say, know, just, apr, like, state, israeli, right, year, use, jew, arab, make, time, want, world, way, jewish 

Topic 3: use, university, know, like, just, need, work, thanks, problem, want, computer, good, distribution, time, help, run, try, replyto, drive, apr 

Topic 4: university, use, like, know, just, work, time, need, new, distribution, thanks, say, good, look, want, make, try, usa, problem, question 

Topic 5: say, just, government, use, like, state, make, know, gun, right, time, university, way, distribution, apr, good, day, thing, law, want 

Topic 6: university, use, know, distribution, thanks, email, like, computer, look, just, apr, new, work, usa, want, science, problem, time, help, need 

Topic 7: university, know, just, like, use, good, say, time, apr, distribution, want, new, look, car, make, year, comput

Visualize the author-topic distribution

In [37]:
# Normalize the author_topic_matrix to get author-topic distribution

# Compute the sum of rows in author_topic_matrix
author_topic_sum = loaded_model.author_topic_matrix.sum(axis=1)[:, np.newaxis]

# Replace zero sums with a small epsilon value
epsilon = 1e-10
author_topic_sum[author_topic_sum == 0] = epsilon

# Perform element-wise division
author_topic_dist = loaded_model.author_topic_matrix / author_topic_sum

# Visualize the top N topics for each author
N_TOP_TOPICS = 2
for i, author in enumerate(loaded_model.authors):
    top_topics_idx = author_topic_dist[i].argsort()[-N_TOP_TOPICS:][::-1]
    print(f"Author {i+1} => {author} : Topic IDs {top_topics_idx} \n")

Author 1 => Mamatha Devineni Ratnam : Topic IDs [10 19] 

Author 2 => mblawson@midway.ecn.uoknor.edu (Matthew B Lawson) : Topic IDs [ 5 19] 

Author 3 => hilmi-er@dsv.su.se (Hilmi Eren) : Topic IDs [ 1 14] 

Author 4 => guyd@austin.ibm.com (Guy Dawson) : Topic IDs [ 2 15] 

Author 5 => Alexander Samuel McDiarmid : Topic IDs [2 5] 

Author 6 => tell@cs.unc.edu (Stephen Tell) : Topic IDs [ 7 19] 

Author 7 => lpa8921@tamuts.tamu.edu (Louis Paul Adams) : Topic IDs [18 19] 

Author 8 => dchhabra@stpl.ists.ca (Deepak Chhabra) : Topic IDs [10 19] 

Author 9 => arromdee@jyusenkyou.cs.jhu.edu (Ken Arromdee) : Topic IDs [1 8] 

Author 10 => sandvik@newton.apple.com (Kent Sandvik) : Topic IDs [19  4] 

Author 11 => steve-b@access.digex.com (Steve Brinich) : Topic IDs [18  4] 

Author 12 => Thyagi@cup.portal.com (Thyagi Morgoth NagaSiva) : Topic IDs [19  1] 

Author 13 => filipe@vxcrna.cern.ch (VINCI) : Topic IDs [11 19] 

Author 14 => kmr4@po.CWRU.edu (Keith M. Ryan) : Topic IDs [ 8 19] 

Author