In [11]:
# Import-hook setup: lets Indice_Invertido.ipynb be imported like a regular Python module
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell


def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file backing a (possibly dotted) module name.

    Only the last dotted component of ``fullname`` is used as the file name.
    Every directory in ``path`` is probed first for ``<name>.ipynb`` and then
    for the same file name with underscores turned into spaces, so that
    ``import My_Notebook`` can resolve to ``My Notebook.ipynb``.

    Args:
        fullname: Module name, e.g. ``"pkg.My_Notebook"``.
        path: Iterable of directories to search. When empty or ``None`` the
            relative directory ``''`` (the current working directory) is used.

    Returns:
        The path of the first matching notebook, or ``None`` when no
        candidate file exists.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']

    # Candidate file names in priority order. The underscore-to-space
    # substitution is applied to the file name only: applying it to the joined
    # path would also rewrite directory names that contain underscores and
    # make the fallback lookup miss.
    candidates = [name + ".ipynb"]
    spaced = name.replace("_", " ") + ".ipynb"
    if spaced != candidates[0]:
        candidates.append(spaced)

    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module loader that runs the code cells of a Jupyter notebook.

    The notebook is located with ``find_notebook``, read through the legacy
    ``nbformat.current`` API and each Python code cell is executed inside a
    fresh module object.
    """

    def __init__(self, path=None):
        # Shared IPython shell; its input transformer is used to turn cell
        # input into executable Python before running it.
        self.Shell = InteractiveShell.instance()
        # Directories handed to find_notebook (None means the default search).
        self.path = path

    def load_module(self, fullname):
        """Import the notebook matching ``fullname`` as a module.

        Args:
            fullname: Module name being imported.

        Returns:
            The module whose namespace holds everything the notebook defined.

        Raises:
            ImportError: If no notebook file can be found for ``fullname``.
            Exception: Whatever a notebook cell raises; in that case the
                entry in ``sys.modules`` is rolled back before re-raising.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail with an import-specific error instead of the TypeError
            # that io.open(None) would raise.
            raise ImportError("no notebook found for module %r" % fullname)

        print ("importando notebook de %s" % path)

        with io.open(path, 'r', encoding='utf-8') as f:
            nb = current.read(f, 'json')

        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython

        # Register the module before executing it so imports of the same name
        # made from inside the notebook find it; remember what it replaces so
        # a failed load can be undone.
        previous = sys.modules.get(fullname)
        sys.modules[fullname] = mod

        # Point the shell at the module namespace while the cells run.
        save_user_ns = self.Shell.user_ns
        self.Shell.user_ns = mod.__dict__

        try:
            # NOTE(review): only the first worksheet is read, as in the
            # original code; this is the layout exposed by nbformat.current.
            for cell in nb.worksheets[0].cells:
                if cell.cell_type == 'code' and cell.language == 'python':
                    # Transform the cell input so it is executable as Python.
                    code = self.Shell.input_transformer_manager.transform_cell(cell.input)
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-initialised module behind: a later import
            # would otherwise silently return it.
            if previous is None:
                sys.modules.pop(fullname, None)
            else:
                sys.modules[fullname] = previous
            raise
        finally:
            self.Shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Meta path finder that resolves imports to Jupyter notebooks.

    Implements both the legacy ``find_module`` hook and the ``find_spec``
    hook; the latter is the only one consulted by Python 3.12 and newer.
    """

    def __init__(self):
        # Cache of NotebookLoader instances, one per distinct search path.
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` or ``None`` if no notebook exists.

        Args:
            fullname: Module name being imported.
            path: Search path supplied by the import system (``None`` for
                top-level imports).
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None

        # The search path is an unhashable sequence, so it is flattened into a
        # string to serve as cache key; every empty path shares the None key.
        key = os.path.sep.join(path) if path else None

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_spec(self, fullname, path=None, target=None):
        """Return a module spec for ``fullname`` or ``None``.

        Delegates the lookup to ``find_module`` and wraps the resulting
        loader in a spec. ``target`` is accepted for protocol compatibility
        and is not used.
        """
        # Local import keeps this class independent of the file's import cell.
        import importlib.util

        loader = self.find_module(fullname, path)
        if loader is None:
            return None
        return importlib.util.spec_from_loader(fullname, loader)

# Register the notebook finder at the end of the list of import hooks.
sys.meta_path.extend([NotebookFinder()])

# The notebook under test is imported through the hook registered above.
import Indice_Invertido

In [12]:
import math
import numpy as np
import re
import argparse
import os
import glob
from pathlib import Path
import time

# Note: "assert" is silent when a check passes; it raises AssertionError only when a check fails

In [13]:
#classe com as funções de teste principais
class Test:
    
    #função pra testar "formataString"    
    def test_formataString(s):
        assert Indice_Invertido.Indice_Invertido.formataString(s) == ['abc', 'defs', 'lmnopq', 'fdfdj',
        'jfkjdfdd', '2gh3h4vggl', '058', '23848l9', '9ssd8fd43', 'as', '379', 'd', 'gf', 'aaa', 'bbdd', 
        'sja', '3kj3', '984jd', 'cc', 'akjs', 'jsa', 'dskk', 'llll', 'sks', 'aall', 'eee', 'be', 'aa',
        'hh', '3237l']
    

    #função pra testar "indiceInvertido"
    def test_indiceInvertido(arquivo):
        assert Indice_Invertido.Indice_Invertido.indiceInvertido(arquivo) == {'palavra': [[1, 2]],
        'Digo': [[1, 1]], 'digo': [[1, 2]], 'batAta': [[1, 2]], 'feijão': [[1, 1]],
        'batata1': [[1, 1], [2, 1]], 'BATATA': [[1, 1]], 'feijao': [[1, 1]], 'batata': [[1, 1], [2, 1]],
        'repete': [[1, 1], [2, 2]], 'arroz': [[2, 1]], 'ARROZ': [[2, 1]], 'FEIJÃO': [[2, 1]],
        'figo': [[2, 1]]}
    
    #função pra testar "coordenadas"
    def test_coordenadas(indInv, numDocs):
        resultTest = np.array([[0.,2.1972245773362196,0.], [0.,1.0986122886681098,0.],
                               [0.,2.1972245773362196,0.], [0.,2.1972245773362196,0.],
                               [0.,1.0986122886681098,0.], [0.,0.4054651081081644,0.4054651081081644],
                               [0.,1.0986122886681098,0.], [0.,1.0986122886681098,0.],
                               [0.,0.4054651081081644,0.4054651081081644],
                               [0.,0.4054651081081644,0.8109302162163288], [0.,0.,1.0986122886681098],
                               [0.,0.,1.0986122886681098], [0.,0.,1.0986122886681098],
                               [0.,0.,1.0986122886681098]])

        resultFunc = Indice_Invertido.Indice_Invertido.coordenadas(indInv, numDocs)
        
        for i in range(14):
            for j in range(3):
                assert resultTest[i][j] == resultFunc[i][j]
                
                
    #funções pra testar "formataQuery"
    def test_formataQuery(indInv, indQ, numDocs):
        mqResultTest = np.array([[2.1972245773362196], [0.], [2.1972245773362196], [2.1972245773362196]])
        mqResultFunc = Indice_Invertido.Indice_Invertido.formataQuery(indInv, indQ, numDocs)
        
        for i in range(4):
            for j in range(1):
                assert mqResultTest[i][j] == mqResultFunc[i][j]
        
    #função pra testar "verificaConsulta"
    def test_verificaConsulta(coords, Q):
        assert Indice_Invertido.Indice_Invertido.verificaConsulta(coords, Q) == {0: 0.9707253433941511, 1: 1.0000000000000002, 2: 0.9707253433941511}

In [14]:
class ExtraTest:
    """Step-by-step checks mirroring the CURRENT code of ``Indice_Invertido``.

    Each method reproduces one step of a function from ``Indice_Invertido``
    and asserts the intermediate values for the fixtures used in this
    notebook, to confirm that every part of the function behaves as expected.
    The mirrored algorithms are kept as they are on purpose. Methods are
    static, so they can be called on the class or on an instance.
    """

    # Tolerances for floating-point comparisons (avoids exact float equality).
    RTOL = 1e-9
    ATOL = 1e-12

    # ---- steps of "formataString" ----
    @staticmethod
    def test_lowerFormStr(sLower):
        """Check the lower-casing step."""
        assert sLower.lower() == "abdkdfkjook"

    @staticmethod
    def test_sub1FormStr(sSub):
        """Check the first substitution: listed punctuation becomes a space."""
        assert re.sub("[:,'|.@()?!#$&]"," ", sSub) == "aaaa bbb   b cc aaa b    cccd   aa     bb dda"

    @staticmethod
    def test_paragFormStr(sParag):
        """Check that line breaks are replaced by spaces."""
        assert sParag.replace("\n", " ") == "te st a ndo"

    @staticmethod
    def test_sub2FormStr(sSub):
        """Check the second substitution: non-alphanumeric ASCII is dropped."""
        assert re.sub('[^A-Za-z0-9 ]+', '', sSub) == "testando"

    @staticmethod
    def test_splitFormStr(sSplit):
        """Check the whitespace split step."""
        assert sSplit.split() == ['aa', 'abc', 'adelsk', 'KArf0123', '57']

    # ---- step of "indiceInvertido" ----
    @staticmethod
    def test_vocabularioIndInv(arquivo):
        """Check the ``[word, key, frequency]`` list built from ``arquivo``.

        Args:
            arquivo: Mapping whose values are lists of words; the expected
                result below is specific to the notebook fixture.
        """
        vocabulario = []
        # Iterate over the argument itself, not over a notebook global.
        for k, v in arquivo.items():
            ignorarIndice = []
            for i in range(len(v)):
                frequenciaNesteDoc = 1
                # Positions already counted as a repetition are skipped.
                if (ignorarIndice.count(i) != 0):
                    continue
                for j in range(i+1, len(v)):
                    if (v[i] == v[j]):
                        frequenciaNesteDoc+=1
                        ignorarIndice.append(j)

                vocabulario.append([v[i], k, frequenciaNesteDoc])

        assert vocabulario == [['palavra', 1, 2], ['Digo', 1, 1], ['digo', 1, 2], ['batAta', 1, 2],
                               ['feijão', 1, 1], ['batata1', 1, 1], ['BATATA', 1, 1], ['feijao', 1, 1],
                               ['batata', 1, 1], ['repete', 1, 1], ['arroz', 2, 1], ['ARROZ', 2, 1],
                               ['batata', 2, 1], ['FEIJÃO', 2, 1], ['repete', 2, 2], ['figo', 2, 1],
                               ['batata1', 2, 1]]

    # ---- steps of "coordenadas" ----
    @staticmethod
    def test_matrizCoord(indInv, numDocs):
        """Check that the initial matrix is all zeros with one row per entry
        of ``indInv`` and ``numDocs`` columns."""
        coords = np.zeros((len(indInv), numDocs))

        assert coords.shape == (len(indInv), numDocs)
        assert not coords.any()

    @staticmethod
    def test_forCoord(indFor, numDocs):
        """Check every intermediate value of the main loop of "coordenadas".

        The expected values are specific to the four-word notebook fixture
        with ``numDocs == 3``.
        """
        coords = np.zeros((len(indFor), numDocs))

        vResult = []
        kResult = ""
        nxResult = []
        idfResult = []
        wResult = []
        indPalResult = []

        indicePalavra = 0

        for k, v in indFor.items():
            vResult.append(v)
            kResult += k
            nx = len(v)
            nxResult.append(nx)
            idf = math.log(numDocs/nx)
            idfResult.append(idf)

            for dados in v:
                w = dados[1] * idf
                wResult.append(w)
                coords[indicePalavra][dados[0]] = w

            indicePalavra+=1
            indPalResult.append(indicePalavra)

        assert vResult == [[[1, 2]], [[1, 1]], [[1, 2]], [[1, 2]]]
        assert kResult == "palavraDigodigobatAta"
        assert nxResult == [1, 1, 1, 1]
        np.testing.assert_allclose(
            idfResult,
            [1.0986122886681098, 1.0986122886681098, 1.0986122886681098,
             1.0986122886681098],
            rtol=ExtraTest.RTOL, atol=ExtraTest.ATOL)
        np.testing.assert_allclose(
            wResult,
            [2.1972245773362196, 1.0986122886681098, 2.1972245773362196,
             2.1972245773362196],
            rtol=ExtraTest.RTOL, atol=ExtraTest.ATOL)
        assert indPalResult == [1, 2, 3, 4]

    # ---- steps of "formataQuery" ----
    @staticmethod
    def test_matrizFormQ(indInv):
        """Check that the initial query vector is an all-zero column with one
        row per entry of ``indInv``."""
        Q = np.zeros((len(indInv),1))

        assert Q.shape == (len(indInv), 1)
        assert not Q.any()

    @staticmethod
    def test_forFormQ(indInv, indQ, numDocs):
        """Check every intermediate value of the main loop of "formataQuery".

        The expected values are specific to the notebook fixture with
        ``numDocs == 3``.
        """
        Q = np.zeros((len(indInv),1))

        vResult = []
        kResult = ""
        nxResult = []
        idfResult = []
        wResult = []
        indPalResult = []

        indicePalavra = 0

        for k, v in indInv.items():
            vResult.append(v)
            kResult += k
            # Only words that also occur in the query receive a weight.
            if k in indQ.keys():
                nx = len(v)
                nxResult.append(nx)
                idf = math.log(numDocs/nx)
                idfResult.append(idf)
                w = (indQ[k])[0][1] * idf
                wResult.append(w)
                Q[indicePalavra] = w
            indicePalavra+=1
            indPalResult.append(indicePalavra)

        assert vResult == [[[1, 2]], [[1, 1]], [[1, 2]], [[1, 2]]]
        assert kResult == "palavraDigodigobatAta"
        assert nxResult == [1, 1, 1]
        np.testing.assert_allclose(
            idfResult,
            [1.0986122886681098, 1.0986122886681098, 1.0986122886681098],
            rtol=ExtraTest.RTOL, atol=ExtraTest.ATOL)
        np.testing.assert_allclose(
            wResult,
            [2.1972245773362196, 2.1972245773362196, 2.1972245773362196],
            rtol=ExtraTest.RTOL, atol=ExtraTest.ATOL)
        assert indPalResult == [1, 2, 3, 4]

In [15]:
# Exercise "formataString" with every kind of character it may receive.

# Lower-casing step.
strTest_lower = "ABDKdfkjOOk"
ExtraTest.test_lowerFormStr(strTest_lower)

# First substitution step.
strTest_sub1 = "aaaa:bbb ' b.cc aaa|b @ (cccd )?aa! # $bb&dda"
ExtraTest.test_sub1FormStr(strTest_sub1)

# Replacement of line breaks.
strTest_parag = '''te
st
a
ndo'''
ExtraTest.test_paragFormStr(strTest_parag)

# Second substitution step.
strTest_sub2 = "têeéstüaâãndoò"
ExtraTest.test_sub2FormStr(strTest_sub2)

# Split step.
strTest_split = "aa abc adelsk KArf0123 57"
ExtraTest.test_splitFormStr(strTest_split)

# Check the output of the whole function.
# The backslash in the fixture is written as "\\" because "\ " is an invalid
# escape sequence; the resulting string is unchanged.
strTest_formStr = '''ABC defs lmNOpQ áfdFDj íJFKjdêfddã 2gh3h4vgÁg~L 058 23848l9 9ÁSSD´8fd43
'!as!é379'] /d|gf\\ #aaa#bbdd %sja &3kj3 (984jd )cc *akjs +JSA ds-KK .llLL :sks aa;LL <ee~e =ã>? @báe[
àaa] _è_HH_ { 323}7l
'''

Test.test_formataString(strTest_formStr)

In [16]:
# Fixtures and checks for "indiceInvertido".

txtTest1 = [
    'palavra', 'Digo', 'digo', 'batAta', 'feijão', 'batata1', 'palavra',
    'BATATA', 'feijao', 'batAta', 'digo', 'batata', 'repete',
]
txtTest2 = [
    'arroz', 'ARROZ', 'batata', 'FEIJÃO', 'repete', 'figo', 'repete', 'batata1',
]

# Word lists keyed 1 and 2, in that order.
arqTest = dict(enumerate((txtTest1, txtTest2), start=1))

# Check the part of "indiceInvertido" that builds the "vocabulario" list.
ExtraTest.test_vocabularioIndInv(arqTest)

# Check the output of the whole function.
Test.test_indiceInvertido(arqTest)

In [17]:
# Fixtures and checks for "coordenadas".

indTest_coords = {
    'palavra': [[1, 2]],
    'Digo': [[1, 1]],
    'digo': [[1, 2]],
    'batAta': [[1, 2]],
    'feijão': [[1, 1]],
    'batata1': [[1, 1], [2, 1]],
    'BATATA': [[1, 1]],
    'feijao': [[1, 1]],
    'batata': [[1, 1], [2, 1]],
    'repete': [[1, 1], [2, 2]],
    'arroz': [[2, 1]],
    'ARROZ': [[2, 1]],
    'FEIJÃO': [[2, 1]],
    'figo': [[2, 1]],
}

# Check the initial "coords" matrix.
ExtraTest.test_matrizCoord(indTest_coords, 3)

# Check the loop of the function on a smaller, four-word index.
indTest_for = dict(
    palavra=[[1, 2]],
    Digo=[[1, 1]],
    digo=[[1, 2]],
    batAta=[[1, 2]],
)

ExtraTest.test_forCoord(indTest_for, 3)

# Check the output of the whole function.
Test.test_coordenadas(indTest_coords, 3)

In [18]:
# Fixtures and checks for "formataQuery".

indInvTest = dict(
    palavra=[[1, 2]],
    Digo=[[1, 1]],
    digo=[[1, 2]],
    batAta=[[1, 2]],
)
indQTest = dict(
    batAta=[[1, 2]],
    palavra=[[1, 2]],
    figo=[[1, 1]],
    digo=[[1, 2]],
)

# Check the initial "Q" matrix.
ExtraTest.test_matrizFormQ(indInvTest)

# Check the loop of the function.
ExtraTest.test_forFormQ(indInvTest, indQTest, 3)

# Check the output of the whole function.
Test.test_formataQuery(indInvTest, indQTest, 3)

In [19]:
# Fixtures and check for "verificaConsulta".

# 4x3 matrix of ones whose middle column is 2 in rows 0, 2 and 3.
coordTest = np.ones((4, 3))
coordTest[[0, 2, 3], 1] = 2.0

# 4x1 query column vector.
QTest = np.array([
    [2.0],
    [1.0],
    [2.0],
    [2.0],
])

# Check the output of the function.
Test.test_verificaConsulta(coordTest, QTest)