In [691]:
from scipy.sparse import *
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import datasets
from importlib import reload as rl
import aux

In [578]:
# Candidate subset of newsgroup names.
# NOTE(review): this list is never passed to fetch_20newsgroups below, so the
# fetch is not restricted to it -- confirm whether `categories=categories`
# was intended.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Fetch the 20 Newsgroups data (subset='all') with headers, footers and
# quoted text removed from each post.
news_bunch = datasets.fetch_20newsgroups(subset='all',
                              remove=('headers', 'footers', 'quotes'))
 
# TF-IDF representation of the fetched posts (default vectorizer settings).
X_news_all = TfidfVectorizer().fit_transform(news_bunch.data)

# Targets exactly as provided by the fetched bunch.
Y_news_all = news_bunch.target


#### Inicializa corpus e também a versão Tfidf dele

In [683]:
# Toy corpus: four short Portuguese sentences used for quick experiments.
cp = [ 
       'o sapo nao lava op peh nao lava porque nao quer', 
       'ele mora la na lagoa não lava o pé porque não quer', 
       'mas que chulé!', 
       'Ele mora lá na lagoa não lava o pé porque não quer' 
       ]
# TF-IDF matrix of the toy corpus (default vectorizer settings).
mcp = TfidfVectorizer().fit_transform(cp)
# Independent copy of the TF-IDF matrix.
mcopy = mcp.copy()
# First row of the matrix; used below as the row subtracted from the others.
mc0 = mcp[0]
# NOTE(review): np.array() applied to a SciPy sparse row presumably wraps the
# sparse object instead of producing a dense row; use mc0.toarray() if a dense
# array was intended -- confirm.
mc0ar = np.array(mc0)
def corpus_small():
    """Point the working globals at the toy corpus.

    Rebuilds the TF-IDF matrix from ``cp`` and rebinds the module-level
    names ``mcp``, ``mcopy``, ``mc0`` and ``mc0ar`` accordingly.
    """
    global mcp, mcopy, mc0, mc0ar
    tfidf = TfidfVectorizer().fit_transform(cp)
    mcp, mcopy = tfidf, tfidf.copy()
    first_row = mcp[0]
    mc0, mc0ar = first_row, np.array(first_row)
def corpus_big():
    """Point the working globals at the 20 Newsgroups TF-IDF matrix.

    Rebinds the module-level names ``mcp``, ``mcopy``, ``mc0`` and ``mc0ar``
    from ``X_news_all``, mirroring what ``corpus_small`` does for the toy
    corpus.
    """
    # Without this declaration every assignment below would only create
    # function-local names and the call would have no visible effect.
    global mcp, mcopy, mc0, mc0ar
    mcp = X_news_all
    mc0 = mcp[0]
    mc0ar = np.array(mc0)
    mcopy = mcp.copy()


In [543]:
def time_function(func, *args, **kargs):
    """Call ``func`` once and return how long the call took.

    Parameters
    ----------
    func : callable
        The function to time.
    *args
        Positional arguments forwarded to ``func``.
    **kargs
        Keyword arguments forwarded to ``func`` as keywords.

    Returns
    -------
    float
        Elapsed wall-clock seconds, measured with ``time.time()``.

    Any exception raised by ``func`` propagates to the caller.
    """
    t0 = time()
    # Unpack kargs with ** so they reach func as keyword arguments rather
    # than as a single positional dict.
    func(*args, **kargs)
    return time() - t0

def loop_time_function(n, func, *args, **kargs):
    """Run ``func`` ``n`` times, forwarding ``args`` and ``kargs`` each time.

    Returns the sum of the execution times reported by ``time_function``
    for the individual calls.
    """
    total = 0
    for _ in range(n):
        total = total + time_function(func, *args, **kargs)
    return total

## Código para subtração manual, elemento a elemento
    + executou de forma consistentemente mais lenta que as versões vetorizadas

In [544]:
def manual_sub(cpy=mcopy, dest=mcp, idx = 0):
    """Subtract ``mc0`` from rows 0..3 of ``mcp``, one row at a time.

    The differences are computed and discarded; nothing is returned. The
    parameters are accepted but not used -- the function reads the
    module-level ``mcp`` and ``mc0`` and assumes exactly four rows.
    """
    for row in (0, 1, 2, 3):
        mcp[row] - mc0

## Subtração ao se converter tudo para np.array foi a mais veloz
    
    Contudo, é inviável para conjuntos maiores de dados (alta complexidade de espaço), conforme demonstrado ao se tentar utilizar essa estratégia com o dataset de notícias 20 Newsgroups.
```python
# PROBLEMA: COMPLEXIDADE DE ESPAÇO
def numpy_array_sub(cpy=mcopy, dest=mcp, idx = 0):
    cpy.toarray() - cpy[idx].toarray()
```

In [804]:
# PROBLEM: space complexity (both operands are converted with .toarray()).
def numpy_array_sub(cpy=mcopy, dest=mcp, idx = 0):
    """Return ``cpy.toarray()`` minus the ``toarray()`` form of row ``idx``.

    ``dest`` is accepted but not used.
    """
    everything = cpy.toarray()
    reference_row = cpy[idx].toarray()
    return everything - reference_row


In [848]:
# Candidate functions to replace the subtraction that converts the sparse
# matrix directly to an array.

# OK.
def numpy_gambs_sub(cpy=mcopy, dest=mcp, idx = 0):
    """Subtract row ``idx`` of the global ``mcp`` from each of its rows.

    Builds one array from the rows of ``mcp`` and another holding
    ``mcp[idx]`` repeated once per row, then returns their difference.
    ``cpy`` and ``dest`` are accepted but not used.
    """
    rows = np.array(list(mcp))
    repeated_reference = np.array([mcp[idx]] * mcp.shape[0])
    return rows - repeated_reference

_sub = lambda a,b: a - b
_vsub = np.frompyfunc(_sub, 2, 1)
# OK, however in 'random' order (correct submatrix).
def vectorized_sub(cpy=mcopy, dest=mcp, idx = 0):
    """Apply ``_vsub`` between an array of the rows of ``mcp`` and ``mc0``.

    The parameters are accepted but not used; the function reads the
    module-level ``mcp`` and ``mc0``.
    """
    rows = np.array(list(mcp))
    return _vsub(rows, mc0)

# OK. Same as 'vectorized_sub', but passes a plain list of rows.
def vectorized_gambs(cpy=mcopy, dest=mcp, idx = 0):
    """Apply ``_vsub`` between the list of rows of ``mcp`` and ``mc0``.

    The parameters are accepted but not used; the function reads the
    module-level ``mcp`` and ``mc0``.
    """
    rows = list(mcp)
    return _vsub(rows, mc0)

# TESTANTO
def _sub_stupid(a, b):
    return a - b

# Elementwise wrapper around _sub_stupid: two inputs, one output.
_stupid_vec = np.frompyfunc(_sub_stupid, 2, 1)
def stupid_method(cpy=mcopy, dest=mcp, idx = 0):
    """Apply ``_stupid_vec`` between the rows of ``mcp`` and ``np.array(mc0)``.

    The parameters are accepted but not used; the function reads the
    module-level ``mcp`` and ``mc0``.
    """
    reference = np.array(mc0)
    rows = list(mcp)
    return _stupid_vec(rows, reference)

# ORIGINAL IDEA
# Pre-build, once at cell execution time, a 1-D array with one slot per row
# of `mcp`.
# NOTE(review): dtype=csr_matrix is a class, not a NumPy scalar type;
# presumably NumPy treats it as the generic object dtype -- confirm.
_to_sub = np.arange( 0, mcp.shape[0], dtype=csr_matrix )

# Assign the reference row mc0 to every slot. The array is not rebuilt when
# the corpus globals are switched later, so re-run this cell after a switch.
_to_sub[np.array(range(mcp.shape[0]))] = mc0
def original_idea(cpy=mcopy, dest=mcp, idx = 0):
    """Subtract the prebuilt ``_to_sub`` array from the list of rows of ``mcp``.

    The parameters are accepted but not used; the function reads the
    module-level ``mcp`` and ``_to_sub``.
    """
    rows = list(mcp)
    return rows - _to_sub

# Seems SLOW.
def original_lil(cpy=mcopy, dest=mcp, idx = 0):
    """Subtract ``np.array([mc0])`` from the rows of ``mcp.tolil()``.

    The parameters are accepted but not used; the function reads the
    module-level ``mcp`` and ``mc0``.
    """
    lil_rows = list(mcp.tolil())
    reference = np.array([mc0])
    return lil_rows - reference
# OK. Simple, and it WORKS.
def simple_sub(cpy=mcopy, dest=mcp, idx = 0):
    """Subtract row ``idx`` of the global ``mcp`` from an array of its rows.

    Parameters
    ----------
    cpy, dest
        Accepted but not used; the function reads the module-level ``mcp``.
    idx : int, default 0
        Index of the row of ``mcp`` to subtract.

    Returns
    -------
    The result of ``np.array([*mcp]) - [mcp[idx]]``.
    """
    # Honour idx instead of a hard-coded 0; the default keeps the old result.
    return np.array([*mcp]) - [mcp[idx]]

# Sanity check: first dimension (row count) of the active TF-IDF matrix.
print(mcp.shape[0])

4


In [830]:
# Accumulated benchmark results; re-running this cell discards them.
benchmark = {}

In [831]:
# For benchmarking purposes only, use a small corpus.
# The selector functions defined in this notebook are corpus_small() and
# corpus_big(); the previous mc_small()/mc_big() names are not defined
# anywhere in it.
corpus_small()
# corpus_big()

In [837]:
# nai = time_function(naive_sub)
# Reload the local `aux` helper module so edits to it are picked up.
rl(aux)
# Passed as `n` to loop_time_function: calls per candidate in each round.
nn = 100
# Number of benchmark rounds.
loop = 100
# (label, callable) pairs to benchmark; the label is the key under which the
# timings are stored and displayed.
lista = [
    ('numpy_gambs_sub', numpy_gambs_sub),
    ('vectorized_sub', vectorized_sub),
    ('vectorized_gambs', vectorized_gambs),
    ('stupid_method', stupid_method),
    ('original_idea', original_idea),
    ('original_lil', original_lil),
    ('simple_sub', simple_sub),
]

In [839]:
nn = 100
for _ in range(loop):
    # Make sure benchmark[nn] holds a result list for every candidate before
    # any timing starts; an absent or empty entry is (re)created from scratch.
    if not benchmark.get(nn):
        benchmark[nn] = {}
    for key, func in lista:
        benchmark[nn].setdefault(key, [])

    # One round: time nn calls of each candidate, keeping both this round's
    # figures (dict_times) and the running history (benchmark[nn]).
    dict_times = {}
    for key, func in lista:
        spent = loop_time_function(nn, func)
        dict_times[key] = spent
        benchmark[nn][key].append(spent)

    # Disabled:
    # dict_times['numpy_sub'] = loop_time_function(nn, numpy_array_sub)
    rl(aux)
    aux.ShowOrderedDict(dict_times, option='values')
    print('\n')

vectorized_sub       => 0.0568938
original_idea        => 0.0578701
vectorized_gambs     => 0.0595384
stupid_method        => 0.0620527
simple_sub           => 0.0664518
numpy_gambs_sub      => 0.0854075
original_lil         => 0.0958378


vectorized_sub       => 0.0595737
original_idea        => 0.0608733
vectorized_gambs     => 0.0609503
numpy_gambs_sub      => 0.0674441
stupid_method        => 0.0713587
simple_sub           => 0.0719380
original_lil         => 0.0962408


original_idea        => 0.0588691
vectorized_gambs     => 0.0591803
vectorized_sub       => 0.0592220
simple_sub           => 0.0674806
numpy_gambs_sub      => 0.0689807
stupid_method        => 0.0735266
original_lil         => 0.0955808


original_idea        => 0.0566275
vectorized_gambs     => 0.0599828
vectorized_sub       => 0.0626724
simple_sub           => 0.0680666
numpy_gambs_sub      => 0.0694425
stupid_method        => 0.0759423
original_lil         => 0.0957763


original_idea        => 0.0578203
vector

original_idea        => 0.0535107
vectorized_gambs     => 0.0565937
vectorized_sub       => 0.0579708
stupid_method        => 0.0646889
simple_sub           => 0.0659344
numpy_gambs_sub      => 0.0696929
original_lil         => 0.0986214


vectorized_gambs     => 0.0823605
original_idea        => 0.0970299
vectorized_sub       => 0.0981812
stupid_method        => 0.0984378
numpy_gambs_sub      => 0.1120129
original_lil         => 0.1328158
simple_sub           => 0.1406779


vectorized_sub       => 0.0609856
original_idea        => 0.0691078
simple_sub           => 0.0768931
numpy_gambs_sub      => 0.0904326
vectorized_gambs     => 0.1069179
stupid_method        => 0.1083415
original_lil         => 0.1111934


stupid_method        => 0.0623782
simple_sub           => 0.0749536
original_idea        => 0.0815501
vectorized_gambs     => 0.0944545
numpy_gambs_sub      => 0.1113408
vectorized_sub       => 0.1153426
original_lil         => 0.1697495


vectorized_sub       => 0.0733221
numpy_

vectorized_sub       => 0.0644724
vectorized_gambs     => 0.0750527
numpy_gambs_sub      => 0.0771365
original_idea        => 0.0791199
simple_sub           => 0.0804198
stupid_method        => 0.1000431
original_lil         => 0.1081285


vectorized_sub       => 0.0719748
vectorized_gambs     => 0.0745194
numpy_gambs_sub      => 0.0775712
original_idea        => 0.0877049
stupid_method        => 0.0912571
simple_sub           => 0.1011755
original_lil         => 0.1312194


original_idea        => 0.0645459
simple_sub           => 0.0724998
vectorized_sub       => 0.0792844
vectorized_gambs     => 0.0800717
numpy_gambs_sub      => 0.0922043
stupid_method        => 0.0940626
original_lil         => 0.1095529


original_idea        => 0.0611036
vectorized_sub       => 0.0666559
vectorized_gambs     => 0.0687387
simple_sub           => 0.0710225
numpy_gambs_sub      => 0.0752172
stupid_method        => 0.0853479
original_lil         => 0.1126981


original_idea        => 0.0607584
vector

In [846]:
# Reload the local `aux` helper module so edits to it are picked up.
rl(aux)
# Show the timings accumulated for the current nn twice, once with
# smallTime=True and once with smallTime=False, with a separator line between
# them. What smallTime selects is defined in aux, which is not shown here.
aux.ShowOrderedDictList(benchmark.get(nn, {}), option='values', smallTime=True)
print('====================')
aux.ShowOrderedDictList(benchmark.get(nn, {}), option='values', smallTime=False)

original_idea        => 0.0506625
vectorized_gambs     => 0.0508165
vectorized_sub       => 0.0519657
stupid_method        => 0.0530596
simple_sub           => 0.0591900
numpy_gambs_sub      => 0.0605369
original_lil         => 0.0830522
original_idea        => 0.0647902
vectorized_sub       => 0.0650796
vectorized_gambs     => 0.0669319
stupid_method        => 0.0693266
simple_sub           => 0.0739791
numpy_gambs_sub      => 0.0761492
original_lil         => 0.1068727
