# Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The corpus: one document per list element
corpus = [
    "Racing games",
    "This document describes racing cars",
    "This document is about video games in general",
    "This is a nice racing video game",
]

# Vectorizer with default settings
vectorizer = CountVectorizer()

# Fit the vectorizer on the corpus and build the document-term matrix
model = vectorizer.fit_transform(corpus)

In [None]:
#pairwise cosine similarity between all the rows (documents) of the model
cos=cosine_similarity(model)
print(cos)

[[1.         0.31622777 0.25       0.28867513]
 [0.31622777 1.         0.31622777 0.36514837]
 [0.25       0.31622777 1.         0.4330127 ]
 [0.28867513 0.36514837 0.4330127  1.        ]]


In [None]:
#dimensions of the matrix: (number of documents, number of terms)
print(model.shape)
#the terms, in the same order as the matrix columns
print(vectorizer.get_feature_names_out())
#the matrix in dense form, one row per document
print(model.toarray())

(4, 13)
['about' 'cars' 'describes' 'document' 'game' 'games' 'general' 'in' 'is'
 'nice' 'racing' 'this' 'video']
[[0 0 0 0 0 1 0 0 0 0 1 0 0]
 [0 1 1 1 0 0 0 0 0 0 1 1 0]
 [1 0 0 1 0 1 1 1 1 0 0 1 1]
 [0 0 0 0 1 0 0 0 1 1 1 1 1]]


In [None]:
from nltk.corpus import stopwords

# English stop-word list shipped with NLTK
sw = stopwords.words("english")

# Vectorizer configured with that stop-word list
vectorizer = CountVectorizer(stop_words=sw)

In [None]:
# The corpus: one document per list element
corpus = [
    "Racing games",
    "This document describes racing cars",
    "This document is about video games in general",
    "This is a nice racing video game",
]

# Fit the vectorizer on the corpus and build the document-term matrix
model = vectorizer.fit_transform(corpus)

In [None]:
#dimensions of the matrix: (number of documents, number of terms)
print(model.shape)
#the terms, in the same order as the matrix columns
print(vectorizer.get_feature_names_out())
#the matrix in dense form, one row per document
print(model.toarray())

(4, 9)
['cars' 'describes' 'document' 'game' 'games' 'general' 'nice' 'racing'
 'video']
[[0 0 0 0 1 0 0 1 0]
 [1 1 1 0 0 0 0 1 0]
 [0 0 1 0 1 1 0 0 1]
 [0 0 0 1 0 0 1 1 1]]


In [None]:
from nltk.stem import snowball
from nltk.tokenize import word_tokenize
import re

def my_tokenizer(text):
    """Tokenize *text* into stemmed word tokens without English stop words.

    The text is split with NLTK's ``word_tokenize``. Tokens that do not start
    with a word character (e.g. punctuation) are discarded, as are English
    stop words; the remaining tokens are lower-cased and stemmed with the
    Snowball stemmer.

    Stop words are matched on the lower-cased token, so capitalised forms
    such as "This" are filtered even when the caller has not lower-cased the
    text beforehand.

    Args:
        text: The document to tokenize.

    Returns:
        A list with the stems of the retained tokens, in document order.
    """
    # A set gives constant-time membership tests instead of a list scan per token
    sw = set(stopwords.words('english'))
    stemmer = snowball.SnowballStemmer(language="english")
    # Keep only tokens starting with a word character, normalised to lower case
    words = (t.lower() for t in word_tokenize(text) if re.search(r"^\w", t))
    return [stemmer.stem(w) for w in words if w not in sw]

In [None]:
#initialize the vectorizer with the custom tokenizer; token_pattern is set to
#None because it is not used when a tokenizer is supplied (leaving the default
#makes scikit-learn emit a UserWarning)
vectorizer=CountVectorizer(tokenizer=my_tokenizer, token_pattern=None)

#the corpus: one document per list element
corpus=["Racing games",
        "This document describes racing cars",
        "This document is about video games in general",
        "This is a nice racing video game"]

#creates the model
model=vectorizer.fit_transform(corpus)

#prints the model
print(model.shape)
print(vectorizer.get_feature_names_out())
print(model.toarray())

(4, 8)
['car' 'describ' 'document' 'game' 'general' 'nice' 'race' 'video']
[[0 0 0 1 0 0 1 0]
 [1 1 1 0 0 0 1 0]
 [0 0 1 1 1 0 0 1]
 [0 0 0 1 0 1 1 1]]




In [None]:
#pairwise cosine similarity between all the rows (documents) of the model
cos=cosine_similarity(model)
print(cos)

[[1.         0.35355339 0.35355339 0.70710678]
 [0.35355339 1.         0.25       0.25      ]
 [0.35355339 0.25       1.         0.5       ]
 [0.70710678 0.25       0.5        1.        ]]


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import snowball
import re

def my_tokenizer(text):
    """Tokenize *text* into stemmed word tokens without English stop words.

    The text is split with NLTK's ``word_tokenize``. Tokens that do not start
    with a word character (e.g. punctuation) are discarded, as are English
    stop words; the remaining tokens are lower-cased and stemmed with the
    Snowball stemmer.

    Stop words are matched on the lower-cased token, so capitalised forms
    such as "This" are filtered even when the caller has not lower-cased the
    text beforehand.

    Args:
        text: The document to tokenize.

    Returns:
        A list with the stems of the retained tokens, in document order.
    """
    # A set gives constant-time membership tests instead of a list scan per token
    sw = set(stopwords.words('english'))
    stemmer = snowball.SnowballStemmer(language="english")
    # Keep only tokens starting with a word character, normalised to lower case
    words = (t.lower() for t in word_tokenize(text) if re.search(r"^\w", t))
    return [stemmer.stem(w) for w in words if w not in sw]

#initialize the vectorizer with the custom tokenizer; token_pattern is set to
#None because it is not used when a tokenizer is supplied (leaving the default
#makes scikit-learn emit a UserWarning)
vectorizer=TfidfVectorizer(tokenizer=my_tokenizer, token_pattern=None)

#the corpus: one document per list element
corpus=["Racing games",
        "This document describes racing cars",
        "This document is about video games in general",
        "This is a nice racing video game"]

#creates the model
model=vectorizer.fit_transform(corpus)


In [None]:
#dimensions of the matrix: (number of documents, number of terms)
print(model.shape)
#the terms, in the same order as the matrix columns
print(vectorizer.get_feature_names_out())
#the matrix in dense form, one row per document
print(model.toarray())

(4, 8)
['car' 'describ' 'document' 'game' 'general' 'nice' 'race' 'video']
[[0.         0.         0.         0.70710678 0.         0.
  0.70710678 0.        ]
 [0.57457953 0.57457953 0.4530051  0.         0.         0.
  0.36674667 0.        ]
 [0.         0.         0.4842629  0.39205255 0.61422608 0.
  0.         0.4842629 ]
 [0.         0.         0.         0.40892206 0.         0.64065543
  0.40892206 0.5051001 ]]


In [None]:
#pairwise cosine similarity between all the rows (documents) of the model
cos=cosine_similarity(model)
print(cos)

[[1.         0.25932906 0.27722302 0.57830313]
 [0.25932906 1.         0.21937356 0.1499708 ]
 [0.27722302 0.21937356 1.         0.40492018]
 [0.57830313 0.1499708  0.40492018 1.        ]]


In [None]:
# Represent the query with the vectorizer already fitted on the corpus
query = vectorizer.transform(["racing game GT7"])
print(query.toarray())

# Cosine similarity between the query and every document of the model
cos = cosine_similarity(query, model)
print(cos)

[[0.         0.         0.         0.70710678 0.         0.
  0.70710678 0.        ]]
[[1.         0.25932906 0.27722302 0.57830313]]


# Gensim

In [None]:
from gensim import corpora
from gensim import models
from gensim import similarities
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import snowball
import re

def my_tokenizer(text):
    """Return the lower-cased stems of the tokens of *text*.

    Tokens that do not start with a word character and tokens whose
    lower-cased form is an English stop word are left out.
    """
    stop_words = stopwords.words('english')
    stemmer = snowball.SnowballStemmer(language="english")
    kept = []
    for token in word_tokenize(text):
        # skip tokens that do not start with a word character
        if not re.search(r"^\w", token):
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(stemmer.stem(lowered))
    return kept

# The document collection: one document per list element
documents = [
    "This document describes racing cars",
    "This document is about video games in general",
    "This is a nice racing video game",
]


In [None]:
# Tokenize every document
texts = [my_tokenizer(doc) for doc in documents]

# Dictionary built from the tokens of the document corpus
dictionary = corpora.Dictionary(texts)
# Bag-of-words representation of each document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# TF-IDF model built from the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)
# Index over the TF-IDF weighted documents, used to compute similarities
index = similarities.SparseMatrixSimilarity(
    tfidf[bow_corpus], num_features=len(dictionary)
)


[array([1.0000001 , 0.07613309, 0.07613309], dtype=float32), array([0.07613309, 1.        , 0.19339646], dtype=float32), array([0.07613309, 0.19339646, 1.        ], dtype=float32)]


In [None]:
#bag-of-words of the first document, as (token id, count) pairs
print(bow_corpus[0])
#NOTE: the result of this expression is neither printed nor assigned, so the
#tf-idf weights of the first document do not appear in the output
tfidf[bow_corpus[0]]
#the tokenized documents
print(texts)
#iterating over the index yields, for each indexed document, its similarity to all the documents
print(list(index))
#similarities between the first document and all the indexed documents
print(index[tfidf[bow_corpus[0]]])

[(0, 1), (1, 1), (2, 1), (3, 1)]
[['document', 'describ', 'race', 'car'], ['document', 'video', 'game', 'general'], ['nice', 'race', 'video', 'game']]
[array([1.0000001 , 0.07613309, 0.07613309], dtype=float32), array([0.07613309, 1.        , 0.19339646], dtype=float32), array([0.07613309, 0.19339646, 1.        ], dtype=float32)]
[1.0000001  0.07613309 0.07613309]


In [None]:
# Tokenize the query with the same tokenizer used for the documents
query_document = my_tokenizer("racing games")
# Bag-of-words of the query, built with the documents' dictionary
query_bow = dictionary.doc2bow(query_document)

# Similarity between the TF-IDF weighted query and each indexed document
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

[(0, 0.17312077), (1, 0.21988432), (2, 0.43976864)]
