<a href="https://colab.research.google.com/github/lorrespz/NLP-Text-Analyses/blob/main/Latent_Semantic_Analysis_CountVectorizer_SVD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Latent Semantic Analysis - CountVectorizer - SVD

In [1]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt

--2024-03-14 07:05:56--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 127992 (125K) [text/plain]
Saving to: ‘all_book_titles.txt’


2024-03-14 07:05:56 (8.86 MB/s) - ‘all_book_titles.txt’ saved [127992/127992]



In [2]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
wordnet_lemmatizer = WordNetLemmatizer()

# Check the text file

In [6]:
titles = [line.rstrip() for line in open('all_book_titles.txt')]
titles

['Philosophy of Sex and Love A Reader',
 'Readings in Judaism, Christianity, and Islam',
 'Microprocessors Principles and Applications',
 'Bernhard Edouard Fernow: Story of North American Forestry',
 'Encyclopedia of Buddhism',
 'Motorola Microprocessor Family: 68000, 68008, 68010, 68020, 68030, and 68040, Programming and Interfacing with Applications',
 'American Anthem: Student Edition Modern Era 2007',
 'How to Read Literature Like a Professor A Lively and Entertaining Guide to Reading Between the Lines',
 'Men Are from Mars, Women Are from Venus Secrets of Great Sex, Improving Communication, Lasting Intimacy and Fulfillment, Giving and Receiving Love, Secrets of Passion, Understanding Martian',
 'Religious Traditions of the World A Journey Through Africa, Mesoamerica, North America, Judaism, Christianity, Islam, Hinduism, Buddhism, China, an',
 "World's Wisdom Sacred Texts of the World's Religions",
 "Illustrated World's Religions A Guide to Our Wisdom Traditions",
 'Soul of Sex Cu

In [27]:
len(titles)

2373

In [8]:
# Get a list of stopwords
stops = set(stopwords.words('english'))
print(stops, end =',')

{'which', 'no', 'most', "weren't", 'that', 'm', 'this', 'them', 'mightn', 'aren', 'until', 'doing', 'such', 'him', 'further', 'if', 'to', 'out', 'these', 'ain', 'was', 'she', "doesn't", 'will', 'those', 'some', 'haven', 'whom', 're', "you've", 'same', 'has', 'o', 'because', "aren't", "wouldn't", 'didn', 'be', 'down', 'during', 'between', 'were', 'through', 'shan', 'ma', 'hers', 'don', 'ours', 'shouldn', 'isn', "shan't", 'the', 'me', 'mustn', "shouldn't", 'against', "didn't", 'am', 'hasn', 'own', "mustn't", "you're", 'yourselves', 'hadn', 'just', 'an', "that'll", 'above', 'do', 'our', 'then', 'when', 'very', 'his', 'theirs', 'all', 'as', 'couldn', 'each', 'more', 'what', 'i', 'won', 'who', 'after', "don't", "you'd", "haven't", 'where', 'can', 'wasn', 'its', 'my', 'is', 'with', 'doesn', 'why', 'does', 'weren', 'from', 'themselves', 'yours', 'we', 'should', 'now', 'but', 'few', "hasn't", "you'll", 'wouldn', 'your', 'not', "won't", 'on', 'than', 'they', 'their', 'off', 'being', 'll', 'up',

In [9]:
# Enlarge the list of stopwords with specific words
stops = stops.union({'introduction', 'edition', 'series', 'application',
                     'approach', 'card', 'access', 'package', 'plus', 'etext',
                     'brief', 'vol', 'fundamental', 'guide', 'volume', 'essential',
                     'printed', 'second', 'third', 'fourth'})

# Define customize tokenizer

In [11]:
wordnet_lemmatizer.lemmatize('Cases and Materials')

'Cases and Materials'

In [12]:
def my_custom_tokenizer(s):
  s = s.lower()

  #split string into word token
  tokens = nltk.tokenize.word_tokenize(s)

  #remove short words since they aren't useful
  tokens = [token for token in tokens if len(token)>2]

  #put words into base form (e.g. changed -> change, changing -> change)
  tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]

  #remove stopwords from the set of defined stopwords above
  tokens = [token for token in tokens if token not in stops]

  #remove digits
  tokens = [token for token in tokens if not any(c.isdigit() for c in token)]

  return tokens

# Vectorizer

In [13]:
#binary = True means that the output is only 0 or 1, meaning whether the token appears
vectorizer = CountVectorizer(binary=True, tokenizer = my_custom_tokenizer)

X = vectorizer.fit_transform(titles)



In [15]:
X.shape

(2373, 2131)

In [17]:
X[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [19]:
# Create index to word map
index_word_map = vectorizer.get_feature_names_out()
index_word_map[:15]

array(["'the", '...', 'a-z', 'abbas', 'abnormal', 'abridged', 'absolute',
       'absraction', 'abstraction', 'abundance', 'acc', 'access/phils',
       'accessibility', 'accessible', 'accompanied'], dtype=object)

In [34]:
index_word_map[-15:]

array(['workstation', 'world', 'worldwide', 'worthwhile', 'writer',
       'writing', 'wsj', 'xilinx', 'year', 'youbook', 'young', 'youth',
       'zen', 'zionism', 'zurich'], dtype=object)

In [24]:
vectorizer.vocabulary_.items()

dict_items([('philosophy', 1437), ('sex', 1741), ('love', 1117), ('reader', 1589), ('reading', 1590), ('judaism', 1033), ('christianity', 315), ('islam', 1007), ('microprocessor', 1214), ('principle', 1513), ('bernhard', 195), ('edouard', 600), ('fernow', 725), ('story', 1839), ('north', 1325), ('american', 81), ('forestry', 757), ('encyclopedia', 636), ('buddhism', 242), ('motorola', 1249), ('family', 714), ('programming', 1537), ('interfacing', 984), ('anthem', 101), ('student', 1851), ('modern', 1231), ('era', 658), ('read', 1588), ('literature', 1104), ('like', 1091), ('professor', 1531), ('lively', 1105), ('entertaining', 647), ('line', 1094), ('men', 1195), ('mar', 1140), ('woman', 2106), ('venus', 2047), ('secret', 1707), ('great', 843), ('improving', 937), ('communication', 362), ('lasting', 1065), ('intimacy', 995), ('fulfillment', 785), ('giving', 824), ('receiving', 1601), ('passion', 1397), ('understanding', 2020), ('martian', 1147), ('religious', 1634), ('tradition', 1975)

In [26]:
len(vectorizer.vocabulary_)

2131

# SVD

In [25]:
# transpose X to make rows = terms, cols = documents
# By default the CountVectorizer gives documents x terms, we need terms x documents
X = X.T

In [28]:
svd = TruncatedSVD()
Z = svd.fit_transform(X)

In [31]:
# Notice that the 2373 components have been compressed into only 2 components
X.shape, Z.shape

((2131, 2373), (2131, 2))

# Visualization

In [32]:
import plotly.express as px

In [36]:
#Plot the 2 components of Z
fig = px.scatter(x = Z[:,0], y = Z[:,1], text = index_word_map, size_max = 60)
fig.update_traces(textposition = 'top center')
fig.show()