# Document Similarity: TF-IDF

In [125]:
import numpy as np
import pandas as pd
import re
import os
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import ssl
# Workaround for SSL certificate errors during the downloads below: if this
# Python build exposes ssl._create_unverified_context, install it as the
# factory used for default HTTPS contexts.
# NOTE(review): this looks like it disables certificate verification for every
# default HTTPS context in this process, not only for the downloads — confirm
# that is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The attribute is missing on this Python build; keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK stop-word lists, then import the corpus reader for them.
nltk.download("stopwords")
from nltk.corpus import stopwords
import stanza
# Fetch the English stanza models used by the tokenize/lemma pipeline.
print("Downloading English model...")
stanza.download('en')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/esapalosaari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading English model...


In [72]:
def _get_stanza_pipeline():
    """Return the shared English tokenize+lemma pipeline, building it on first use."""
    pipeline = getattr(_get_stanza_pipeline, "_pipeline", None)
    if pipeline is None:
        pipeline = stanza.Pipeline(lang='en', processors='tokenize, lemma',  verbose=False)
        # Cache on the function object so repeated calls reuse the same pipeline
        # instead of constructing a new one for every document.
        _get_stanza_pipeline._pipeline = pipeline
    return pipeline


def tokenize_and_normalize(doc_str, stopwords):
    """Tokenize, lemmatize and lowercase a document, then drop stop words.

    The text is run through a stanza pipeline (tokenize + lemma). Every word
    is replaced by its lowercased lemma, and lemmas that appear in
    ``stopwords`` are removed.

    Parameters
    ----------
    doc_str : str
        the raw text of one document
    stopwords : iterable of strings
        lowercase words that should be removed
        (note: inside this function the name shadows any module-level
        ``stopwords`` import)

    Returns
    -------
    normalized_doc : str
        the remaining lemmas joined by single spaces, in document order;
        call ``.split()`` on the result to get a list of lemmas
    """
    # Set membership is O(1); the stop-word collection is checked once per word.
    stop_set = set(stopwords)

    doc = _get_stanza_pipeline()(doc_str)

    kept_lemmas = []
    for word in doc.iter_words():
        # Fall back to the surface form if no lemma was assigned, so that
        # calling .lower() on None cannot crash the whole run.
        lemma = word.lemma if word.lemma is not None else word.text
        lemma = lemma.lower()
        if lemma not in stop_set:
            kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)


In [151]:
## Remove the numbers at the start and end of the documents.
DATAFILE = "./Data/LeePincombeWelshDocuments.txt"
CLEANFILE = "./Data/cleanLPW.txt"
INDIVIDUAL_DOCS = "./Data/Docs"
stopwords_english = stopwords.words('english')
normalized_docs = []

# Raw strings: "\d", "\s" and "\(" are invalid escapes in a plain string literal.
LEADING_NUMBER = re.compile(r"(\d*\.\s)")      # e.g. "12. " in front of a document
WORD_COUNT = re.compile(r"\(\d* words\)")      # e.g. "(87 words)" after a document

# The per-document files go into this directory; make sure it exists.
os.makedirs(INDIVIDUAL_DOCS, exist_ok=True)

with open(DATAFILE, 'r', encoding="utf8", errors="ignore") as inputfile:
    lines = inputfile.readlines()

# Mode "w" truncates any previous clean file, so the file is opened once for
# the whole run instead of being deleted and then re-opened in append mode
# for every document. The encoding matches the one used for reading.
with open(CLEANFILE, 'w', encoding="utf8") as outputfile:
    # The first and last line of the data file are skipped
    # (presumably non-document lines — verify against the data file).
    for i, line in enumerate(lines[1:-1]):
        # Strip only the first occurrence of each pattern.
        start_removed = LEADING_NUMBER.sub("", line, count=1)
        end_removed = WORD_COUNT.sub("", start_removed, count=1)

        normalized_docs.append(tokenize_and_normalize(end_removed, stopwords_english).split())

        outputfile.write(end_removed)
        # One file per document, numbered from 0 in corpus order.
        with open(os.path.join(INDIVIDUAL_DOCS, f"{i}.txt"), "w", encoding="utf8") as docfile:
            docfile.write(end_removed)

     

In [141]:
   
# Preview only: printing the entire tokenized corpus floods the cell output.
print(len(normalized_docs))
print(normalized_docs[0][:20] if normalized_docs else [])



In [94]:
def tf_idf(td_matrix):
    """Weight a term-document matrix of raw counts with tf-idf.

    Every raw count is multiplied by the inverse document frequency of its
    term, ``idf = log10(N / df)``, where ``N`` is the number of documents
    (columns) and ``df`` is the number of documents the term occurs in
    (non-zero cells in its row). A term that occurs in no document gets
    an idf of 0.

    Parameters
    ----------
    td_matrix : array-like of shape (vocabulary-size, number-of-documents)
        rows are terms, columns are documents, cells are raw counts

    Returns
    -------
    tf_idf_matrix : numpy array of shape (vocabulary-size, number-of-documents)
        the counts weighted by the idf of their row

    idf_vector : numpy array of shape (vocabulary-size, 1)
        the idf value of every term; can be used to weight new query documents

    Raises
    ------
    ValueError
        if td_matrix is not two-dimensional
    """
    # NOTE: tf_idf is defined a second time in a later cell; whichever
    # definition was executed last is the one in effect.
    counts = np.asarray(td_matrix)
    if counts.ndim == 1 and counts.size == 0:
        # An empty vocabulary may arrive as a flat empty array.
        counts = counts.reshape(0, 0)
    elif counts.ndim != 2:
        raise ValueError(
            f"td_matrix must be 2-dimensional (terms x documents), got {counts.ndim} dimension(s)"
        )

    n_documents = counts.shape[1]
    # Document frequency of every term, computed for all rows at once.
    doc_freq = np.count_nonzero(counts, axis=1)

    idf_vector = np.zeros((counts.shape[0], 1), dtype=float)
    seen = doc_freq > 0
    # Only divide where df > 0; unseen terms keep idf 0 (avoids division by zero).
    idf_vector[seen, 0] = np.log10(n_documents / doc_freq[seen])

    # (V, 1) * (V, D) broadcasts the idf of each term across its row.
    tf_idf_matrix = idf_vector * counts

    return tf_idf_matrix, idf_vector

In [95]:


def create_term_doc_matrix(docs_normalized):
    """Construct a term-document matrix of raw lemma counts.

    Rows are lemma types sorted alphabetically, columns are documents in the
    order given by ``docs_normalized``. A cell holds the number of times the
    lemma occurs in the document, and 0 if it does not occur.

    Parameters
    ----------
    docs_normalized : a list of lists of strings [['a','a','b'], ['a','b','c']]
        a list of documents, each represented as a list of lemmas

    Returns
    -------
    matrix : numpy array of shape (vocabulary-size, number-of-documents)
        lemma counts per document; for the example above it will be:
            np.array([[2,1],
                      [1,1],
                      [0,1]])
        with an empty vocabulary the shape is (0, number-of-documents)

    sorted_vocab : list of strings
        all lemma types used in the documents (the rows of the matrix),
        sorted alphabetically; for the example above it is ['a','b','c']
    """
    documents = list(docs_normalized)

    # 1. one alphabetically sorted list of all distinct lemmas
    sorted_vocab = sorted({lemma for doc in documents for lemma in doc})
    row_of = {lemma: row for row, lemma in enumerate(sorted_vocab)}

    # 2. count in a single pass over the tokens; re-scanning every document
    #    once per vocabulary entry with list.count would be quadratic
    matrix = np.zeros((len(sorted_vocab), len(documents)), dtype=int)
    for col, doc in enumerate(documents):
        for lemma in doc:
            matrix[row_of[lemma], col] += 1

    return matrix, sorted_vocab

In [142]:
# Build the raw-count term-document matrix from the tokenized documents,
# together with the sorted vocabulary that labels its rows.
term_doc_matrix, sorted_vocab = create_term_doc_matrix(normalized_docs)

In [143]:
# Sanity check: (vocabulary size, number of documents).
print(term_doc_matrix.shape)

(1391, 50)


In [144]:
def tf_idf(td_matrix):
    """Weight a term-document matrix of raw counts with tf-idf.

    Every raw count is multiplied by the inverse document frequency of its
    term, ``idf = log10(N / df)``, where ``N`` is the number of documents
    (columns) and ``df`` is the number of documents the term occurs in
    (non-zero cells in its row). A term that occurs in no document gets
    an idf of 0.

    Parameters
    ----------
    td_matrix : array-like of shape (vocabulary-size, number-of-documents)
        rows are terms, columns are documents, cells are raw counts

    Returns
    -------
    tf_idf_matrix : numpy array of shape (vocabulary-size, number-of-documents)
        the counts weighted by the idf of their row

    idf_vector : numpy array of shape (vocabulary-size, 1)
        the idf value of every term; can be used to weight new query documents

    Raises
    ------
    ValueError
        if td_matrix is not two-dimensional
    """
    # NOTE: this cell re-defines tf_idf from an earlier cell; being executed
    # later, this definition is the one the cells below call.
    counts = np.asarray(td_matrix)
    if counts.ndim == 1 and counts.size == 0:
        # An empty vocabulary may arrive as a flat empty array.
        counts = counts.reshape(0, 0)
    elif counts.ndim != 2:
        raise ValueError(
            f"td_matrix must be 2-dimensional (terms x documents), got {counts.ndim} dimension(s)"
        )

    n_documents = counts.shape[1]
    # Document frequency of every term, computed for all rows at once.
    doc_freq = np.count_nonzero(counts, axis=1)

    idf_vector = np.zeros((counts.shape[0], 1), dtype=float)
    seen = doc_freq > 0
    # Only divide where df > 0; unseen terms keep idf 0 (avoids division by zero).
    idf_vector[seen, 0] = np.log10(n_documents / doc_freq[seen])

    # (V, 1) * (V, D) broadcasts the idf of each term across its row.
    tf_idf_matrix = idf_vector * counts

    return tf_idf_matrix, idf_vector

In [145]:
# Weight the raw counts with tf-idf; the idf vector is returned alongside the weighted matrix.
tf_idf_matrix, idf_vector = tf_idf(term_doc_matrix)

In [146]:
# Sanity check: weighting should leave the shape equal to that of term_doc_matrix.
print(tf_idf_matrix.shape)

(1391, 50)


In [147]:
# Same shape check again, displayed as the cell's result instead of via print.
tf_idf_matrix.shape

(1391, 50)

In [153]:
# Documents are the columns of tf_idf_matrix, so transpose to get one document
# per row before computing the pairwise cosine similarities between documents.
similarities = cosine_similarity(tf_idf_matrix.T)

In [154]:
# Number of scores in the first row: one per document in the corpus.
len(similarities[0])

50

In [120]:
# Similarity of document 0 to every document (its similarity to itself is 1).
print(similarities[0])

[1.00000000e+00 2.13781479e-02 4.72442066e-03 2.98296072e-02
 1.59332365e-03 1.36881180e-02 1.72153297e-02 2.27744278e-02
 6.69741566e-02 3.92486390e-03 3.45599622e-03 1.84738436e-03
 1.77662790e-02 3.84084772e-01 4.82750390e-02 1.95348520e-03
 3.23847472e-03 2.03652701e-02 1.47581451e-02 1.82154977e-02
 7.73358714e-03 7.21878314e-07 1.93592903e-03 1.32865111e-02
 2.21279922e-03 1.71801996e-02 1.76944690e-03 3.24662851e-03
 1.00893892e-02 1.57550568e-03 5.02011734e-03 1.27807306e-02
 1.79229233e-01 8.82599272e-05 7.65711903e-05 1.95474355e-03
 1.93600134e-02 2.27611466e-02 1.83126119e-02 1.37678122e-03
 2.76650369e-03 3.44635866e-03 1.40324171e-04 3.50182704e-03
 1.03476441e-02 4.06431049e-02 8.62332080e-05 2.14074821e-03
 3.35951525e-02 6.69528123e-02]


In [121]:
# Similarity of document 1 to every document (its similarity to itself is 1).
print(similarities[1])

[2.13781479e-02 1.00000000e+00 7.14311070e-03 1.97872424e-02
 4.86577468e-03 7.12496677e-03 9.91989641e-02 1.40579191e-02
 1.97446385e-02 6.89966106e-03 3.69667137e-02 5.08562291e-03
 6.99804289e-03 9.59810692e-03 2.18703851e-03 8.00589238e-03
 6.08308715e-03 2.94489900e-02 4.73513675e-02 1.84346628e-02
 3.36389579e-02 1.64930833e-02 8.29278060e-03 2.23322244e-03
 1.89262575e-03 8.79847042e-03 7.86197728e-03 6.64036234e-03
 2.65797727e-02 5.23253759e-02 1.71504699e-02 4.28433400e-02
 7.34790090e-03 6.22605164e-03 4.26442771e-05 1.84420128e-02
 4.37043202e-03 7.45375972e-07 1.40573214e-02 2.28396767e-02
 1.23007146e-02 6.18042117e-03 4.15132757e-02 1.53574194e-02
 8.92836443e-03 1.88414593e-02 2.51387025e-02 5.32193151e-03
 1.25730666e-01 6.17773625e-03]


In [156]:
# Human similarity ratings, one row per document pair.
human_evaluation_data = pd.read_csv("Data/AverageSimilarities_fixed.csv")

# Turn the document ids of each pair into 0-based row/column positions in the
# similarity matrix (the ids appear to be 1-based, hence the "- 1").
pair_rows = human_evaluation_data["Document_1"] - 1
pair_cols = human_evaluation_data["Document_2"] - 1

# Element-wise lookup: one tf-idf similarity per rated pair.
tf_idf_similarities = similarities[pair_rows, pair_cols]

# Both lengths should agree: one model score for every human rating.
print(len(tf_idf_similarities))
print(len(human_evaluation_data["Similarity_avg_normalized"]))

1225
1225


In [157]:
# Store the tf-idf scores as a column next to the human ratings.
human_evaluation_data["Similarity_tf_idf"] = tf_idf_similarities

In [158]:
# Preview the first rows to confirm the new column sits next to the existing scores.
human_evaluation_data.head()

Unnamed: 0,Document_1,Document_2,Similarity_avg,Similarity_avg_normalized,Similarity_word2vec,Similarity_doc2vec,Similarity_tf_idf
0,1,2,1.5,0.125,0.180149,0.387408,0.021084
1,1,3,1.2,0.05,0.539679,0.241664,0.004666
2,1,4,1.0,0.0,0.274009,0.16937,0.028945
3,1,5,1.5,0.125,0.392186,0.274045,0.001599
4,1,6,2.5,0.375,0.25207,0.385343,0.013378


In [159]:
# Persist the table including the new tf-idf column.
# NOTE(review): this overwrites the same file the table was read from, so the
# input changes on every run — confirm this is intended.
human_evaluation_data.to_csv('Data/AverageSimilarities_fixed.csv', index=False)

In [160]:
# Correlation between the normalized human ratings and the tf-idf similarities
# (the off-diagonal entry of the 2x2 result).
np.corrcoef(human_evaluation_data.Similarity_avg_normalized, tf_idf_similarities)

array([[1.        , 0.56675568],
       [0.56675568, 1.        ]])

In [161]:
# Same comparison for the doc2vec scores already stored in the table.
np.corrcoef(human_evaluation_data.Similarity_avg_normalized, human_evaluation_data.Similarity_doc2vec)

array([[1.        , 0.44519815],
       [0.44519815, 1.        ]])

In [162]:
# Same comparison for the word2vec scores already stored in the table.
np.corrcoef(human_evaluation_data.Similarity_avg_normalized, human_evaluation_data.Similarity_word2vec)

array([[1.        , 0.46886663],
       [0.46886663, 1.        ]])

In [163]:
# word2vec again, this time against the un-normalized average ratings; the
# recorded output shows the same coefficient as with the normalized ratings.
np.corrcoef(human_evaluation_data.Similarity_avg, human_evaluation_data.Similarity_word2vec)

array([[1.        , 0.46886663],
       [0.46886663, 1.        ]])