In [None]:
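# Installation (uncomment to run). You can pin a particular version as well, e.g. scikit-learn==1.0.2.
# Note: this notebook calls get_feature_names_out(), which requires scikit-learn >= 1.0.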
#!pip install scikit-learn
#!pip install pandas

## Preparing Corpus

In [27]:
# Toy corpus: three short sentences used by the vectorizer examples below.
documents = [
    'the cat sat on the mat',
    'the mat was red',
    'the cat liked the mat',
]

# Normalise the text: lowercase each document, then strip full stops.
processed_docs = []
for raw_doc in documents:
    processed_docs.append(raw_doc.lower().replace(".", ""))

# Show the cleaned corpus
print("Our corpus: ", processed_docs)

Our corpus:  ['the cat sat on the mat', 'the mat was red', 'the cat liked the mat']


## Bag of Words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer with its default settings
count_vect = CountVectorizer()

# Learn the vocabulary from the corpus and encode each document as a row of term counts
bow_rep = count_vect.fit_transform(processed_docs)

# The fitted vocabulary maps each term to its column index
vocabulary = count_vect.vocabulary_
print("Vocabulary", vocabulary, "\n")

print("Vocabulary index for cat", vocabulary.get("cat"), "\n")

# Print the BoW vector of every document, one row of the matrix at a time
for doc_idx in range(bow_rep.shape[0]):
    doc_vector = bow_rep[doc_idx].toarray()
    print("BoW representation for Document {}: ".format(doc_idx), doc_vector)

Vocabulary {'the': 6, 'cat': 0, 'sat': 5, 'on': 3, 'mat': 2, 'was': 7, 'red': 4, 'liked': 1} 

Vocabulary index for cat 0 

BoW representation for Document 0:  [[1 0 1 1 0 1 2 0]]
BoW representation for Document 1:  [[0 0 1 0 1 0 1 1]]
BoW representation for Document 2:  [[1 1 1 0 0 0 2 0]]


You can also make a dataframe representation to summarize.

In [29]:
import pandas as pd

# Tabulate the counts: one row per document, one column per vocabulary term
feature_names = count_vect.get_feature_names_out()
df_bow = pd.DataFrame(data=bow_rep.toarray(), columns=feature_names)

print(df_bow)

   cat  liked  mat  on  red  sat  the  was
0    1      0    1   1    0    1    2    0
1    0      0    1   0    1    0    1    1
2    1      1    1   0    0    0    2    0


**Exercise:** Try adding another document to this BoW representation. Do you notice anything?

In [30]:
# Encode an unseen sentence with the vocabulary that was already fitted.
# transform() does not refit, so the vector keeps the same columns as the corpus vectors.
unseen_docs = ["cat and cat are friends"]
temp = count_vect.transform(unseen_docs)
print("Bow representation for 'cat and cat are friends':", temp.toarray())

Bow representation for 'cat and cat are friends': [[2 0 0 0 0 0 0 0]]


## Bag of N-grams

In [6]:
# Bag of n-grams: count unigrams, bigrams and trigrams (n = 1..3) with CountVectorizer.
# NOTE: this rebinds `count_vect` and `bow_rep`, replacing the unigram-only objects
# created in the Bag of Words section.
count_vect = CountVectorizer(ngram_range=(1, 3))

# Fit on the corpus and encode it, just as in the BoW cell
bow_rep = count_vect.fit_transform(processed_docs)

# Term -> column index mapping, now including multi-word n-grams
ngram_vocabulary = count_vect.vocabulary_
print("Vocabulary: ", ngram_vocabulary)

# Compare this vocabulary with the unigram-only one from the BoW section

Vocabulary:  {'the': 18, 'cat': 0, 'sat': 15, 'on': 11, 'mat': 8, 'the cat': 19, 'cat sat': 3, 'sat on': 16, 'on the': 12, 'the mat': 22, 'the cat sat': 21, 'cat sat on': 4, 'sat on the': 17, 'on the mat': 13, 'was': 24, 'red': 14, 'mat was': 9, 'was red': 25, 'the mat was': 23, 'mat was red': 10, 'liked': 5, 'cat liked': 1, 'liked the': 6, 'the cat liked': 20, 'cat liked the': 2, 'liked the mat': 7}


## TF-IDF

TF-IDF allows us to consider that some words in a document are more important than others. This measure quantifies the importance of a given word relative to other words in the document and in the corpus. It is a commonly used representation scheme in information retrieval systems, for extracting relevant documents from a corpus for a particular query.

It is relatively easy to calculate using TfidfVectorizer in scikit-learn.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer with default settings (n-grams can be used with TF-IDF as well)
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

# Pull out the pieces that are printed (and reused) below
vocab_terms = tfidf.get_feature_names_out()
dense_tfidf = bow_rep_tfidf.toarray()

# Every term in the fitted vocabulary
print("All words in the vocabulary",vocab_terms)
print("\n")
# IDF weight of each vocabulary term
print("IDF for all words in the vocabulary",tfidf.idf_)
print("\n")

# TF-IDF vectors, one row per document of the corpus
print("TFIDF representation for all documents in our corpus\n",dense_tfidf)
print("\n")

# Same matrix as a labelled table
df_tfidf = pd.DataFrame(dense_tfidf, columns=vocab_terms)

print(df_tfidf)

All words in the vocabulary ['cat' 'liked' 'mat' 'on' 'red' 'sat' 'the' 'was']


IDF for all words in the vocabulary [1.28768207 1.69314718 1.         1.69314718 1.69314718 1.69314718
 1.         1.69314718]


TFIDF representation for all documents in our corpus
 [[0.36580076 0.         0.28407693 0.48098405 0.         0.48098405
  0.56815385 0.        ]
 [0.         0.         0.35959372 0.         0.6088451  0.
  0.35959372 0.6088451 ]
 [0.4172334  0.54861178 0.32401895 0.         0.         0.
  0.64803791 0.        ]]


        cat     liked       mat        on       red       sat       the  \
0  0.365801  0.000000  0.284077  0.480984  0.000000  0.480984  0.568154   
1  0.000000  0.000000  0.359594  0.000000  0.608845  0.000000  0.359594   
2  0.417233  0.548612  0.324019  0.000000  0.000000  0.000000  0.648038   

        was  
0  0.000000  
1  0.608845  
2  0.000000  


How is scikit-learn actually calculating these numbers? It doesn't look like what we got from the exercise in the slides.

One thing scikit-learn does differently is that for calculating term frequency it just takes the number of times the term appears in a document by default.

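Secondly, it smooths the IDF by default: it adds 1 to both the number of documents and each term's document frequency, and then adds 1 to the resulting IDF. As a result, a term that appears in every document of the corpus still gets a non-zero weight (IDF = 1) instead of being zeroed out.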

Finally, it normalizes its Tf-IDF vectors (L2/Euclidean norm by default) for each document in the corpus.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example documents (same three sentences as the corpus prepared at the top)
documents = [
    "the cat sat on the mat",
    "the mat was red",
    "the cat liked the mat"
]

# Create a TfidfVectorizer without normalization.
# It gets its own name on purpose: rebinding `tfidf` here would overwrite the
# default (L2-normalized) vectorizer fitted earlier, and the later cells that call
# `tfidf.transform(...)` would then print un-normalized values on a top-to-bottom run.
tfidf_unnormalized = TfidfVectorizer(norm=None)

# Fit and transform the documents
tfidf_matrix_unnormalized = tfidf_unnormalized.fit_transform(documents)

# Convert the sparse TF-IDF matrix to a dense array for better visualization
tfidf_array_unnormalized = tfidf_matrix_unnormalized.toarray()

# Un-normalized weights: one row per document, one column per vocabulary term
print("Un-normalized TF-IDF matrix:")
print(
    pd.DataFrame(
        tfidf_array_unnormalized,
        columns=tfidf_unnormalized.get_feature_names_out(),
    )
)



Un-normalized TF-IDF matrix:
        cat     liked  mat        on       red       sat  the       was
0  1.287682  0.000000  1.0  1.693147  0.000000  1.693147  2.0  0.000000
1  0.000000  0.000000  1.0  0.000000  1.693147  0.000000  1.0  1.693147
2  1.287682  1.693147  1.0  0.000000  0.000000  0.000000  2.0  0.000000


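IDF: log((1 + N) / (1 + df(term))) + 1, where log is the natural logarithm, N is the number of documents in the corpus, and df(term) is the number of documents that contain the term.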

For term cat:

              log(3/2) = 0.405465 (traditional calculation)

              log(4/3) + 1 = 1.28768207 (scikit-learn)

For term mat:
              
              log(3/3) = 0 (traditional calculation)

              log(4/4) + 1 = 1.0 (scikit-learn)


What if a new document contains words that are not in the vocabulary?

In [17]:
# Case 1: a document transformed with the vocabulary that `tfidf` already learned.
new_document = ['a new document with OOV words']

# transform() reuses the fitted vocabulary; nothing is refitted here
tfidf_new = tfidf.transform(new_document)
dense_new = tfidf_new.toarray()
print(dense_new)

# Case 2: a document that mixes known words with an unseen one.
new_document = ['the red cat was nice']

tfidf_new = tfidf.transform(new_document)
feature_names = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_new.toarray(), columns=feature_names)

print(df_tfidf)

[[0. 0. 0. 0. 0. 0. 0. 0.]]
        cat  liked  mat   on       red  sat       the       was
0  0.444514    0.0  0.0  0.0  0.584483  0.0  0.345205  0.584483
