# Document-term matrix generation

In this exercise, you'll have to generate a document-term matrix from an input list of preprocessed documents.

In [1]:
from typing import List, Tuple
import ipytest
import pytest

# Apply ipytest's default configuration so that the `%%ipytest` test cell
# further down can run the pytest-style tests inside this notebook.
ipytest.autoconfig()

Input documents are given as lists of tokenized terms.

In [2]:
# Toy corpus used by the tests below: each inner list is one document given
# as its sequence of tokens; a term may occur several times in a document.
DOCUMENTS = [
    ["aaa", "bbb", "ccc"],
    ["eee", "fff"],
    ["aaa", "eee", "aaa", "ccc", "fff", "fff", "ggg", "aaa"],
    ["bbb", "bbb", "bbb"],
    ["ggg", "fff", "ccc", "aaa", "ccc"],
]

Your task is to complete this function:

In [15]:
def get_doc_term_matrix(docs: List[List[str]]) -> Tuple[List[List[int]], List[str]]:
    """Generates a document-term matrix and the corresponding vocabulary.

    The vocabulary lists every distinct term exactly once, in order of first
    appearance across `docs`. The function does not print anything and does
    not modify `docs`.

    Args:
        docs: List of documents, each given by a list of tokenized terms.
            It is traversed only once, so any iterable of token iterables
            is accepted as well.

    Returns:
        Tuple consisting of the document-term matrix and the corresponding vocabulary.
        In the document-term matrix row `i` corresponds to `docs[i]` and column `j`
        corresponds to the jth element of the vocabulary. Values represent the number
        of times the term appears in the document (0 if it does not appear).
        For an empty `docs` both the matrix and the vocabulary are empty lists.
    """
    # Maps each term to its column. A dict gives O(1) membership checks and,
    # because dicts preserve insertion order, also records the vocabulary order.
    term_to_column = {}
    # One {column: count} mapping per document. The rows can only be made
    # dense once the final vocabulary size is known.
    sparse_rows = []

    for doc in docs:
        counts = {}
        for token in doc:
            # Unseen terms get the next free column index.
            column = term_to_column.setdefault(token, len(term_to_column))
            counts[column] = counts.get(column, 0) + 1
        sparse_rows.append(counts)

    vocabulary = list(term_to_column)

    doc_term_matrix = []
    for counts in sparse_rows:
        row = [0] * len(vocabulary)
        for column, count in counts.items():
            row[column] = count
        doc_term_matrix.append(row)

    return doc_term_matrix, vocabulary

Tests.

In [16]:
%%ipytest

def test_num_docs():
    """The matrix must contain exactly one row per input document."""
    matrix = get_doc_term_matrix(DOCUMENTS)[0]
    row_count = len(matrix)
    assert row_count == len(DOCUMENTS)
    
def test_vocabulary():
    """The vocabulary must hold every distinct corpus term; order is free."""
    expected_terms = {"aaa", "bbb", "ccc", "eee", "fff", "ggg"}
    vocab = get_doc_term_matrix(DOCUMENTS)[1]
    assert set(vocab) == expected_terms
    
def test_term_counts():
    """Spot-check the counts of three terms in the first and third documents."""
    matrix, vocab = get_doc_term_matrix(DOCUMENTS)
    # Resolve columns through the returned vocabulary, since its order is free.
    column_of = {term: vocab.index(term) for term in ("aaa", "ccc", "fff")}
    # (document index, term, expected number of occurrences)
    expected_counts = (
        (0, "aaa", 1),
        (0, "ccc", 1),
        (0, "fff", 0),
        (2, "aaa", 3),
        (2, "ccc", 1),
        (2, "fff", 2),
    )
    for doc_idx, term, expected in expected_counts:
        assert matrix[doc_idx][column_of[term]] == expected

[32m.[0m[32m.[0m[32m.[0m[32m                                                                                          [100%][0m
[32m[32m[1m3 passed[0m[32m in 0.03s[0m[0m
