# Vectorizer
___

In [1]:
import pandas as pd

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import nltk
# Fetch the NLTK data the notebook relies on: the stop-word lists, WordNet
# for the lemmatizer, and the Punkt models used by word_tokenize.
# Older NLTK releases load Punkt from 'punkt'; NLTK 3.9 and later load it
# from 'punkt_tab', so both are requested to keep the notebook runnable on
# either. quiet=True silences the downloader's progress output.
for modulo in ['stopwords', 'wordnet', 'punkt', 'punkt_tab']:
    nltk.download(modulo, quiet=True)

In [4]:
# Spanish stop words, stored as a set because lemmatize() tests membership
# once per token.
stop_words = set(stopwords.words("spanish"))
# NOTE(review): WordNetLemmatizer is backed by the English WordNet, while the
# stop words and sample texts are Spanish; it presumably leaves most Spanish
# words unchanged. Confirm this is intended.
lemmatizer = WordNetLemmatizer()

In [5]:
# Two sample Spanish sentences; the second extends the first, so the pair
# shares part of its vocabulary.
text_1 = 'Hola soy Luciano'
text_2 = 'Hola soy Luciano y me gusta jugar futbol'

In [6]:
def lemmatize(text):
    """Return ``text`` reduced to a space-separated string of lemmas.

    The text is split with NLTK's ``word_tokenize``. A token is kept only if
    it is purely alphanumeric and its lowercase form is not in the
    module-level ``stop_words`` set; kept tokens are lowercased and passed
    through the module-level ``lemmatizer``.

    Args:
        text: Raw sentence or document to normalize.

    Returns:
        The surviving lemmas joined by single spaces, in their original
        order. An empty string is returned when no token survives.

    NOTE(review): ``lemmatizer`` is a ``WordNetLemmatizer``, which is
    presumably English-only; confirm whether Spanish lemmatization is wanted.
    """
    # word_tokenize defaults to the English Punkt model. The stop words are
    # Spanish, so select the matching Spanish model explicitly.
    tokens = word_tokenize(text, language="spanish")

    # Lowercase each alphanumeric token once and reuse the result for both
    # the stop-word test and the lemmatizer call.
    lowered = (token.lower() for token in tokens if token.isalnum())
    return ' '.join(
        lemmatizer.lemmatize(word) for word in lowered if word not in stop_words
    )

In [7]:
# Normalize both sample sentences, then show the results one per line.
lemmatized_text_1 = lemmatize(text_1)
lemmatized_text_2 = lemmatize(text_2)
print(lemmatized_text_1, lemmatized_text_2, sep='\n')

hola luciano
hola luciano gusta jugar futbol


In [9]:
# Fit a TF-IDF model on the two lemmatized sentences and encode them in one
# step. The result is a sparse matrix with one row per sentence and one
# column per vocabulary term.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([lemmatized_text_1, lemmatized_text_2])

In [10]:
# Display the sparse matrix summary (shape, dtype, stored-element count).
tfidf_matrix

<2x5 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [11]:
# Print the stored (non-zero) entries as "(row, column)  weight" lines.
print(tfidf_matrix)

  (0, 4)	0.7071067811865475
  (0, 2)	0.7071067811865475
  (1, 0)	0.4992213265230509
  (1, 3)	0.4992213265230509
  (1, 1)	0.4992213265230509
  (1, 4)	0.35520008546852583
  (1, 2)	0.35520008546852583


In [12]:
# Cosine similarity between the two sentences. TfidfVectorizer L2-normalizes
# each row by default, so the dot product of two rows is already their cosine
# similarity; entry [0, 1] of the product with the transpose is that value
# for sentence 1 vs. sentence 2.
# The earlier `1 - ...` form produced the cosine *distance* (0 for identical
# texts), which contradicted the variable name.
# `@` is matrix multiplication for both scipy sparse matrices and sparse
# arrays, whereas `*` is element-wise on sparse arrays.
similitud = (tfidf_matrix @ tfidf_matrix.T).toarray()[0, 1]
similitud

0.5023287782256717