In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus = [
    "I like deep learning.", 
    "I like NLP.", 
    "I enjoy flying."
]

cvectorizer = CountVectorizer()
X = cvectorizer.fit_transform(corpus)
terms = cvectorizer.get_feature_names_out()

# List of all terms in order of document-term matrix columns. 
print(terms)

# Document term matrix 
print(X.toarray())

# Data type of matrix and shape 
print(type(X))
print(X.shape)

# [[1 0 0 1 1 0]
#  [0 0 0 0 1 1]
#  [0 1 1 0 0 0]]

documentTermMatrix = pd.DataFrame(X.toarray(),
                                  index=["Doc 1", "Doc 2", "Doc 3", ],
                                  columns=terms)

print(documentTermMatrix.to_string())

#        deep  enjoy  flying  learning  like  nlp
# Doc 1     1      0       0         1     1    0
# Doc 2     0      0       0         0     1    1
# Doc 3     0      1       1         0     0    0

['deep' 'enjoy' 'flying' 'learning' 'like' 'nlp']
[[1 0 0 1 1 0]
 [0 0 0 0 1 1]
 [0 1 1 0 0 0]]
<class 'scipy.sparse._csr.csr_matrix'>
(3, 6)
       deep  enjoy  flying  learning  like  nlp
Doc 1     1      0       0         1     1    0
Doc 2     0      0       0         0     1    1
Doc 3     0      1       1         0     0    0


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

corpus = [
    "I like deep learning.", 
    "I like NLP.", 
    "I enjoy flying."
]

vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(corpus)

tfidfMatrix = pd.DataFrame(X.toarray(),
                            index=["Doc 1", "Doc 2", "Doc 3", ],
                            columns=terms)

print(tfidfMatrix)
#            deep     enjoy    flying  learning      like       nlp
# Doc 1  0.622766  0.000000  0.000000  0.622766  0.473630  0.000000
# Doc 2  0.000000  0.000000  0.000000  0.000000  0.605349  0.795961
# Doc 3  0.000000  0.707107  0.707107  0.000000  0.000000  0.000000

           deep     enjoy    flying  learning      like       nlp
Doc 1  0.622766  0.000000  0.000000  0.622766  0.473630  0.000000
Doc 2  0.000000  0.000000  0.000000  0.000000  0.605349  0.795961
Doc 3  0.000000  0.707107  0.707107  0.000000  0.000000  0.000000


In [3]:
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

# Here we run the SVD algorithm
svd = TruncatedSVD(n_components=2)
lsa = svd.fit_transform(X)
print("Some additional information after running SVD: ")
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)
print(svd.algorithm)
print("----------")
print("Here are svd components: ")
print(svd.components_)
print("----------")
print("Here is the resulting compressed matrix: ")
print(lsa)
print("----------")

topic_encoded_df: pd.DataFrame = pd.DataFrame(lsa, columns=['topic_1', 'topic2'])

topic_encoded_df["corpus"] = corpus

print(topic_encoded_df.to_string())

# https://github.com/sirajzade/learningVideos/blob/main/vectorizing.py

Some additional information after running SVD: 
[0.23711277 0.36855639]
0.6056691599980445
[1.13433283 1.        ]
randomized
----------
Here are svd components: 
[[ 3.88212397e-01  4.56337927e-17  5.28752610e-17  3.88212397e-01
   6.72600419e-01  4.96176326e-01]
 [ 6.31861587e-17  7.07106781e-01  7.07106781e-01 -1.78242739e-17
  -5.31641590e-17 -6.92996404e-17]]
----------
Here is the resulting compressed matrix: 
[[ 8.02094437e-01  3.06972049e-18]
 [ 8.02094437e-01 -8.73426236e-17]
 [ 6.96564199e-17  1.00000000e+00]]
----------
        topic_1        topic2                 corpus
0  8.020944e-01  3.069720e-18  I like deep learning.
1  8.020944e-01 -8.734262e-17            I like NLP.
2  6.965642e-17  1.000000e+00        I enjoy flying.


In [13]:
import string, nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
 
nltk.download("brown")

[nltk_data] Downloading package brown to /home/mbahng/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [16]:
# Preprocessing data to lowercase all words and remove single punctuation words
document = brown.sents()

data = []
for sent in document:
  new_sent = []
  for word in sent:
    new_word = word.lower()
    if new_word[0] not in string.punctuation:
      new_sent.append(new_word)
  if len(new_sent) > 0:
    data.append(new_sent)
 


57158


In [18]:
print(len(data))
print(len(data[0]))
print(data[0])

57158
22
['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place']


In [20]:
# Creating Word2Vec
model = Word2Vec(
    sentences = data,
    vector_size = 50,
    window = 10,
    epochs = 20,
    workers=6
)

model["love"]

<gensim.models.word2vec.Word2Vec at 0x7f4880394e20>

In [24]:
print(model.wv["love"])

# [ 1.0745513  -1.8171308  -2.4329011  -0.3691842  -0.95292336 -0.54824775
#   1.1184701  -1.2525641  -0.7875846  -3.7816436  -1.341159    2.6486464
#  -0.30800238 -2.7417247   0.17696398 -2.9048784   1.621813    0.49121374
#   0.4354661  -1.6528435  -2.4828649   0.4085583  -0.7043962   2.8490443
#  -0.98837584  1.6951126  -1.607722    1.3588951  -0.03844598 -0.4779845
#  -3.2942739   1.3696849   0.07875736  1.0799417  -1.6086684   0.6993245
#   1.5824703   1.5176587   1.626068    1.7591808  -1.3893017  -2.4028397
#  -0.36541265  0.71958435  2.0678997  -1.6587187   1.6821662  -3.3152702
#  -1.6718794  -1.6396806 ]

[ 1.0745513  -1.8171308  -2.4329011  -0.3691842  -0.95292336 -0.54824775
  1.1184701  -1.2525641  -0.7875846  -3.7816436  -1.341159    2.6486464
 -0.30800238 -2.7417247   0.17696398 -2.9048784   1.621813    0.49121374
  0.4354661  -1.6528435  -2.4828649   0.4085583  -0.7043962   2.8490443
 -0.98837584  1.6951126  -1.607722    1.3588951  -0.03844598 -0.4779845
 -3.2942739   1.3696849   0.07875736  1.0799417  -1.6086684   0.6993245
  1.5824703   1.5176587   1.626068    1.7591808  -1.3893017  -2.4028397
 -0.36541265  0.71958435  2.0678997  -1.6587187   1.6821662  -3.3152702
 -1.6718794  -1.6396806 ]


In [None]:
corpus = [
    "I like deep learning.", 
    "I like NLP.", 
    "I enjoy flying."
]

for document