In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus: three short English sentences.
docs = [
    "My name is Mahtab.",
    "The programming language name is Python.",
    "I love programming in Python.",
]

# English stop words are removed before weighting; fit_transform fits the
# vectorizer on the corpus and encodes it in a single call.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(docs)

# The feature names double as the printed vocabulary and as the column labels
# of the table below, so fetch them once.
feature_names = vectorizer.get_feature_names_out()
print("Vocbulary:", feature_names)

# Densify the matrix so it can be shown as a table: one row per document,
# one column per feature name.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print(df.round(3))


Vocbulary: ['language' 'love' 'mahtab' 'programming' 'python']
   language   love  mahtab  programming  python
0     0.000  0.000     1.0        0.000   0.000
1     0.681  0.000     0.0        0.518   0.518
2     0.000  0.681     0.0        0.518   0.518


In [50]:
import torch

def getDocMatrix(documents=None):
    """Build a bag-of-words count matrix for a list of tokenized documents.

    Every token is lower-cased before it is looked up in or added to the
    vocabulary, so "The" and "the" always share a single column regardless of
    the order in which they are encountered. Vocabulary columns follow
    first-occurrence order across the documents.

    Args:
        documents: Optional list of documents, each one a list of string
            tokens. When omitted (None), the built-in two-document example
            corpus is used.

    Returns:
        A float32 tensor of shape (number of documents, vocabulary size)
        whose entry [d, w] is how many times vocabulary word w occurs in
        document d. An empty document produces an all-zero row.

    Side effects:
        Prints every vocabulary word, the index tensor of each document and
        the final matrix.
    """
    if documents is None:
        documents = [["the","dog","is","a","nice","dog"],["the","ant","is","no","dog"]]
    nrDocs = len(documents)

    # Map each distinct lower-cased word to its column index. The word is
    # normalised BEFORE the membership test, otherwise a capitalised token
    # seen after its lower-case form would be registered a second time.
    # A dict keeps insertion order and makes the lookup O(1).
    word2idx = {}
    for doc in documents:
        for word in doc:
            word2idx.setdefault(word.lower(), len(word2idx))

    for word in word2idx:
        print(word)

    nrWords = len(word2idx)
    # one row per document, one column per vocabulary word
    docMatrix = torch.zeros((nrDocs, nrWords), dtype=torch.float32)
    for docIdx, doc in enumerate(documents):
        # replace words with their indices and transform the list to torch tensor
        tensorizedDoc = torch.tensor([word2idx[word.lower()] for word in doc], dtype=torch.long)
        print("Tensorized Document:", tensorizedDoc)
        # count how often each index occurs; minlength pads the result to the
        # full vocabulary so it fits the matrix row (also for empty documents)
        docMatrix[docIdx] = torch.bincount(tensorizedDoc, minlength=nrWords).to(torch.float32)

    print("Document Matrix:\n", docMatrix)
    return docMatrix





# Build the count matrix for the built-in example corpus; the TF-IDF cell
# below reads this module-level docMatrix.
docMatrix = getDocMatrix()

the
dog
is
a
nice
ant
no
Tensorized Document: tensor([0, 1, 2, 3, 4, 1])
Tensorized Document: tensor([0, 5, 2, 6, 1])
Document Matrix:
 tensor([[1., 2., 1., 1., 1., 0., 0.],
        [1., 1., 1., 0., 0., 1., 1.]])


In [53]:
# NOTE: this cell reads docMatrix (documents x vocabulary counts) produced by
# the getDocMatrix() cell; run that cell first.

# Total occurrences of each vocabulary word over all documents (column sums).
wordFreq = docMatrix.sum(dim=0)
print("Word Frequencies:", wordFreq)
# Number of tokens in each document (row sums).
wordCount = docMatrix.sum(dim=1)
print("Word Counts per Document:", wordCount)

# Term frequency: every row divided by its own document length. The extra
# trailing axis on wordCount makes the division broadcast row-wise.
termFreqMatrix = docMatrix / wordCount.unsqueeze(1)
print("Term Frequency Matrix:\n", termFreqMatrix)

# Document frequency: in how many documents each word has a non-zero count.
docFreq = docMatrix.count_nonzero(dim=0)
print("Document Frequencies:", docFreq)

# Inverse document frequency, computed here as log(N / df + 1) with
# N = number of documents (rows of docMatrix).
numDocs = torch.tensor(docMatrix.size(0), dtype=torch.float32)
inverseDocFreq = torch.log(numDocs / docFreq + 1)
print("Inverse Document Frequencies:", inverseDocFreq)

# TF-IDF: the per-word IDF vector broadcasts across the document rows.
tfidfMatrix = termFreqMatrix * inverseDocFreq
print("TF-IDF Matrix:\n", tfidfMatrix)

Word Frequencies: tensor([2., 3., 2., 1., 1., 1., 1.])
Word Counts per Document: tensor([6., 5.])
Term Frequency Matrix:
 tensor([[0.1667, 0.3333, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.2000, 0.2000]])
Document Frequencies: tensor([2, 2, 2, 1, 1, 1, 1])
Inverse Document Frequencies: tensor([0.6931, 0.6931, 0.6931, 1.0986, 1.0986, 1.0986, 1.0986])
TF-IDF Matrix:
 tensor([[0.1155, 0.2310, 0.1155, 0.1831, 0.1831, 0.0000, 0.0000],
        [0.1386, 0.1386, 0.1386, 0.0000, 0.0000, 0.2197, 0.2197]])
