# ADMAGD - Perplexity

## Importing libraries

In [7]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups

from collections import defaultdict
import re
from string import punctuation

## Fetch Test Data

In [8]:
# Names of the twenty newsgroup categories, one per line for readability.
# NOTE(review): this list is not passed to `fetch_20newsgroups` in the visible
# code — confirm whether it is still needed.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [9]:
# Load the held-out test split with headers, footers and quoted replies
# removed, so only the body text of each post is kept.
newsgroups = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

## Pre-processing

In [6]:
# NLTK English stop-word list; the corpus is downloaded first so the import
# below can be used.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# scikit-learn's built-in English stop-word list.
# NOTE(review): `_stop_words` is a private scikit-learn module — confirm the
# import path is stable for the installed version.
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS # Total 318 words

# Tokenizer, WordNet lemmatizer and POS tagger, together with the NLTK data
# packages that are fetched for them.
import nltk
nltk.download('punkt')
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger') # needed for pos_tag

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
# Individual stop-word sources, kept as lists under their original names.
nltk_stop_words = list(set(stopwords.words('english')))
sk_stop_words = list(ENGLISH_STOP_WORDS)

# Extra high-frequency words that are filtered in addition to the library lists.
EXTRA_STOP_WORDS = ["use", "think", "thanks", "know", "like", "make", "say", "time", "need", "want", "come"]

# Merge all three sources through a single set union so the final list holds
# each word exactly once (the extras used to be appended after de-duplication
# and listed "use" twice). STOP_WORDS stays a list for existing callers.
STOP_WORDS = list(set(nltk_stop_words) | set(sk_stop_words) | set(EXTRA_STOP_WORDS))

# Shared WordNet lemmatizer instance used by convertWordIntoLemmatizeWord.
lemmatizer = WordNetLemmatizer()

# Lookup from the first letter of a POS tag to the WordNet part of speech:
# 'J' -> adjective, 'V' -> verb, 'R' -> adverb; any other letter falls back
# to noun through the default factory.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

def convertWordIntoLemmatizeWord(words):
  """Return the lemma of every token in ``words``.

  The tokens are POS-tagged together with ``pos_tag``; the first letter of
  each tag selects the WordNet part of speech (via ``tag_map``) that is
  handed to the lemmatizer. The result has one lemma per input token, in
  the same order.
  """
  lemmas = []
  for token, pos in pos_tag(words):
    wordnet_pos = tag_map[pos[0]]
    lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
  return lemmas
def preprocess(words):
  """Clean and lemmatize a list of tokens from one document.

  Steps, in order:
    1. strip digits, lower-case, and strip punctuation from every token;
    2. drop tokens that became empty, are purely numeric, or are stop words;
    3. lemmatize the surviving tokens (POS-aware, so it runs on the whole list);
    4. drop lemmas that are stop words;
    5. keep only lemmas of 3 to 20 characters.

  Args:
    words: iterable of string tokens.

  Returns:
    List of cleaned, lemmatized tokens.
  """
  # Build the lookup structures once per call: set membership is O(1),
  # whereas testing against the STOP_WORDS list scans it for every token.
  stop_words = set(STOP_WORDS)
  strip_punctuation = str.maketrans('', '', punctuation)

  cleaned = []
  for word in words:
    # Digits first, then case, then punctuation (all per-token operations).
    word = re.sub(r"\d+", "", word).lower().translate(strip_punctuation)
    # Stripping can leave an empty string; the isdigit check is a safety net
    # after the digit removal above.
    if word and not word.isdigit() and word not in stop_words:
      cleaned.append(word)

  # Lemmatization can turn a token into a stop word, so filter again.
  lemmas = convertWordIntoLemmatizeWord(cleaned)
  return [
    lemma for lemma in lemmas
    if lemma not in stop_words and 2 < len(lemma) <= 20
  ]
def preprocess_documents(docs):
  """Tokenize and clean every document in ``docs``.

  Each document is split with ``word_tokenize`` and passed through
  ``preprocess``. Returns a list containing one list of cleaned tokens per
  input document, in input order.
  """
  return [preprocess(word_tokenize(doc)) for doc in docs]
# Tokenize and clean every fetched newsgroup post; the result is one list of
# tokens per post.
preprocessed_docs = preprocess_documents(newsgroups.data)

In [16]:
from gensim import corpora, models


In [17]:
# Build a gensim Dictionary (token <-> integer id mapping) from the
# preprocessed test documents.
dictionary = corpora.Dictionary(preprocessed_docs)

In [18]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15
# documents or in more than 50% of all documents.
dictionary.filter_extremes(no_below=15, no_above=0.5)

In [22]:
# Convert each tokenized document into bag-of-words form: a list of
# (token_id, count) pairs.
# NOTE(review): the token ids come from a dictionary built on the test
# documents themselves — confirm they line up with the vocabulary ids the
# loaded model was trained with, otherwise word lookups in the model are
# misaligned.
test_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [23]:
test_corpus

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 2),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 2),
  (17, 1),
  (18, 1)],
 [(10, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 3),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1)],
 [(17, 1), (35, 1)],
 [(9, 1),
  (23, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 4),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 3),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 5),
  (54, 1),
  (55, 2),
  (56, 5),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 4),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 5),
  (78, 3),
  (79, 3),
  (80, 2),
  (81, 2),
  (82, 1),
  (83, 2),
  (84, 1),
  (85, 1),
  (86, 4),
  (87, 1)

## Retrieve Model

In [11]:
# Base file name of the trained model to evaluate; the extension is added
# where the file is opened.
model_file_name = "tfidf_train_extra_stopwords_200_iteration_admagd_model"

In [12]:
# Relative path prefix of the saved model (no extension).
# NOTE(review): the directory name "trained_ model" contains a space —
# confirm it matches the folder name on disk.
model_path = f"trained_ model/{model_file_name}"

### Pickle

In [13]:
import pickle

Load the model from a file

In [14]:
# Deserialize the trained model object from its ".pkl" file.
# NOTE: unpickling can execute arbitrary code; only load model files that
# come from a trusted source.
with open(f"{model_path}.pkl", 'rb') as f:
    loaded_model = pickle.load(f)

## Perplexity

In [None]:
# Running totals for the corpus-level perplexity: the summed log-likelihood
# and the total number of word occurrences.
total_log_likelihood = 0
total_words = 0

In [8]:
# Accumulate the count-weighted log-probability of every (word_id, count)
# pair of each test document, then turn the per-word average into perplexity.
# NOTE(review): relies on `total_log_likelihood` / `total_words` having been
# initialised beforehand; re-running this cell without resetting them adds to
# the previous totals.
# NOTE(review): calls `loaded_model.get_word_probability`, whereas the
# functions defined further down in this file compute the word probability
# via `calculate_word_probability` — confirm the model exposes this method.
for document in test_corpus:
    doc_log_likelihood = 0
    for word_id, count in document:
        word_probability = loaded_model.get_word_probability(word_id, document)
        # Weight the log-probability by how often the word occurs in the document.
        doc_log_likelihood += np.log(word_probability) * count

    total_log_likelihood += doc_log_likelihood
    total_words += sum(count for _, count in document)

# Perplexity = exp(-average log-likelihood per word occurrence).
perplexity = np.exp(-total_log_likelihood / total_words)

In [30]:
def calculate_word_probability(word_id, document):
    """
    Calculate the probability of a word given its document context.

    The result is the sum over all model topics of
    ``calculate_topic_probability(document, topic) * phi[topic, word_id]``,
    where ``phi`` is the matrix returned by
    ``loaded_model.calculate_phi_update()``.

    Args:
        word_id: column index of the word in ``phi``.
        document: the document the word occurs in; passed through unchanged
            to ``calculate_topic_probability``.

    Returns:
        The accumulated word probability (0.0 when the model has no topics).
    """
    # The phi matrix does not depend on the topic being summed, so it is
    # computed once instead of once per topic.
    # NOTE(review): assumes calculate_phi_update() only computes and returns
    # the matrix, without mutating model state — confirm.
    phi = loaded_model.calculate_phi_update()

    # Sum the probabilities across all topics
    word_prob = 0.0
    for topic_id in range(loaded_model.num_topics):
        topic_prob = calculate_topic_probability(document, topic_id)
        word_prob += topic_prob * phi[topic_id, word_id]
    return word_prob

In [25]:
def calculate_topic_probability(document, topic_id):
    """
    Return the model's theta entry for the document's author and a topic.

    The author is looked up with ``loaded_model.get_author`` using
    ``document[0][0]``; its position in ``loaded_model.authors`` selects the
    row of the matrix returned by ``loaded_model.calculate_theta_update()``,
    and ``topic_id`` selects the column.

    Raises IndexError if ``document`` is empty.
    """
    # One author per document is assumed.
    # NOTE(review): when called on a bag-of-words document, document[0][0] is
    # the id of its first token — confirm that this is the key get_author
    # expects.
    doc_author = loaded_model.get_author(document[0][0])
    author_row = loaded_model.authors.index(doc_author)
    return loaded_model.calculate_theta_update()[author_row, topic_id]

In [28]:
def calculate_perplexity(test_corpus):
    """
    Compute the perplexity of ``test_corpus``.

    ``test_corpus`` is an iterable of documents, each a sequence of
    ``(word_id, count)`` pairs. Every pair contributes
    ``log(calculate_word_probability(word_id, document)) * count`` to the
    corpus log-likelihood; the result is
    ``exp(-log_likelihood / total_word_count)``.

    Raises ZeroDivisionError when the corpus contains no word occurrences.
    """
    log_likelihood_sum = 0.0
    word_count_sum = 0

    for bow in test_corpus:
        # Count-weighted log-probabilities of this document, added up in
        # order starting from 0.0.
        log_likelihood_sum += sum(
            (
                np.log(calculate_word_probability(token_id, bow)) * freq
                for token_id, freq in bow
            ),
            0.0,
        )
        word_count_sum += sum(freq for _, freq in bow)

    return np.exp(-log_likelihood_sum / word_count_sum)

In [31]:
calculate_perplexity(test_corpus)

8969.996369761924