In [33]:
from scipy.sparse import *
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import datasets
from importlib import reload as rl
import aux
import json  # salvar benchmarks
import timeit
import scipy


In [3]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

news_bunch = datasets.fetch_20newsgroups(subset='all',
                              remove=('headers', 'footers', 'quotes'))
 
X_news_all = TfidfVectorizer().fit_transform(news_bunch.data)

Y_news_all = news_bunch.target


#### Inicializa corpus e também a versão Tfidf dele

In [40]:
def shrink_cp():
    global cp
    cp = [ 
        'o sapo nao lava op peh nao lava porque nao quer', 
        'ele mora la na lagoa não lava o pé porque não quer', 
        'mas que chulé!', 
        'Ele mora lá na lagoa não lava o pé porque não quer' 
    ]
    _complementary_cp()
def expand_cp():
    global cp
    cp = news_bunch.data
    _complementary_cp()

def _complementary_cp():
    global mcp, mcopy, mc0, mc0ar
    mcp = TfidfVectorizer().fit_transform(cp)
    mcopy = mcp.copy()
    mc0 = mcp[0]
    mc0ar = mc0.toarray()

In [46]:
def time_function(func, *args, **kargs):
    def wrap():
        if kargs:
            return func(*args, kargs)
        else:
            return func(*args)
        
    return timeit.timeit(wrap)

def loop_time_function(n, func, *args, **kargs):
    """" Executa n vezes a função func, passando args e kargs como argumento.
    Seu retorno é a soma dos tempos de execução de cada chamada à função.
    """
    return sum( 
        timeit.repeat(stmt=f'func(*args, **kargs)', 
                      number=1, repeat=n, setup=f'func = {func}', globals=globals())
#         (time_function(func, *args, **kargs) for _ in range(n) ) 
    )
loop_time_function(1, print, 3)

SyntaxError: invalid syntax (<timeit-src>, line 1)

In [6]:
expand_cp()

## Código para subtração manual, elemento a elemento
    + executou de forma consistentemente mais lenta que as versões vetorizadas

In [7]:
def manual_sub(cpy=mcopy, dest=mcp, idx = 0):
    global mc0
    mcp[0] - mc0
    mcp[1] - mc0
    mcp[2] - mc0
    mcp[3] - mc0

## Subtração ao se converter tudo para np.array foi a mais veloz
    
    Contudo, inviável para conjuntos maiores de dados (alta complexidade de espaço) conforme demonstrado ao se tentar utilizar essa estratégia com o dataset de notícias
    20newsgroup.
```python
# PROBLEMA: COMPLEXIDADE DE ESPAÇO
def numpy_array_sub(cpy=mcopy, dest=mcp, idx = 0):
    cpy.toarray() - cpy[idx].toarray()
```

In [8]:
# PROBLEMA: COMPLEXIDADE DE ESPAÇO
def numpy_array_sub(cpy=mcopy, dest=mcp, idx = 0):
    return mcp.toarray() - mcp[idx].toarray()


In [9]:
# Define algumas funções candidatas a substituir a subtração com conversão direta
# para array a partir de matriz esparsa.
mc0ToArray = mc0.toarray()
# Ok.
def numpy_gambs_sub(cpy=mcopy, dest=mcp, idx = 0):
    return np.array([*mcp]) - np.array(mcp.shape[0]*[mcp[idx]])

_sub = lambda a,b: a - b
_vsub = np.frompyfunc(_sub, 2, 1)
# Ok, however in 'ramdom' order (correct submatrix)
def vectorizedpy_sub(cpy=mcopy, dest=mcp, idx = 0):
    return _vsub(np.array([*mcp]), mc0)

# Ok. Same as 'vectorized_sub' 
def vectorized_gambs(cpy=mcopy, dest=mcp, idx = 0):
    return _vsub([*mcp], mc0)

# TESTANTO
def _sub_stupid(a, b):
    return a - b

_stupid_vec = np.frompyfunc(_sub_stupid, 2, 1)
def stupid_method(cpy=mcopy, dest=mcp, idx = 0):
    mc0ar = np.array(mc0)
    return _stupid_vec([*mcp], mc0ar)

# IDEIA ORIGINALL
_to_sub = np.arange( 0, mcp.shape[0], dtype=csr_matrix )

_to_sub[np.array(range(mcp.shape[0]))] = mc0
def original_idea(cpy=mcopy, dest=mcp, idx = 0):
    return [*mcp] - _to_sub

# Parece LENTO
def original_lil(cpy=mcopy, dest=mcp, idx = 0):
    return [*mcp.tolil()] -np.array([mc0])
# Ok. Simples e FUNCIONA
def simple_sub(cpy=mcopy, dest=mcp, idx = 0):
    return np.array([*mcp]) - [mcp[0]]

In [10]:
print(mcp[0].shape)
print(mcp.shape[0])

_sub = lambda a,b: a - b
_vsub = np.frompyfunc(_sub, 2, 1)
# Ok, however in 'ramdom' order (correct submatrix)
def vectorizedpy_sub(cpy=mcopy, dest=mcp, idx = 0):
    return _vsub(np.array([*mcp]), mc0)

# vectorized_sub()
# vectorize_toarray()
# np.array([*mcp])

(1, 134410)
18846


In [11]:
benchmark_small = {}
benchmark_big = {}

In [12]:
# Apenas para efeitos de benchmark, usar um corpus pequeno
# mc_small()
corpus_big()

In [35]:
# print(mcp[0])

# MLE para os dois abaixo...
ArraySub = lambda a, b:  a.toarray() - b
ArraySubQuick = lambda a, b:  a.toarray() - mc0ar

# Testar versão esparsa dos dois acima
ArraySubCsr = lambda a, b:  csr_matrix(a.toarray() - b)
ArraySubQuickCsr = lambda a, b:  csr_matrix(a.toarray() - mc0ar)

frompyfuncArraySub = np.frompyfunc(ArraySub, 2, 1)
frompyfuncArraySubQuick = np.frompyfunc(ArraySubQuick, 2, 1)
frompyfuncArraySubCsr = np.frompyfunc(ArraySubCsr, 2, 1)
frompyfuncArraySubQuickCsr = np.frompyfunc(ArraySubQuickCsr, 2, 1)
_ = np.frompyfunc(_to_array, 2, 1)

# MLE
def frompysub1_mle(cpy=mcopy, dest=mcp, idx = 0):
  return frompyfuncArraySub([*mcp], mc0)

# Bons resultados, mas MLE
def frompysub2_mle(cpy=mcopy, dest=mcp, idx = 0):
  return frompyfuncArraySubQuick([*mcp], mc0)
# Bons resultados, mas MLE
def frompysub3_big(cpy=mcopy, dest=mcp, idx = 0):
  return frompyfuncArraySubCsr([*mcp], mc0)
def frompysub4_big(cpy=mcopy, dest=mcp, idx = 0):
  return frompyfuncArraySubQuickCsr([*mcp], mc0)


In [14]:
expand_cp()

In [58]:
lista = [
    ('numpy_gambs_sub', numpy_gambs_sub),
    ('vectorizedpy_sub', vectorizedpy_sub),
    ('vectorized_gambs', vectorized_gambs),
    ('stupid_method', stupid_method),
    ('original_idea', original_idea),
    ('original_lil', original_lil),
    ('simple_sub', simple_sub),
    ('numpy_array_sub', numpy_array_sub),
    ('frompysub1_mle', frompysub1_mle),
    ('frompysub2_mle', frompysub2_mle),
    ('frompysub3_big', frompysub3_big),
    ('frompysub4_big', frompysub4_big),
]

to_run = [
    ('numpy_gambs_sub', numpy_gambs_sub),
    ('vectorizedpy_sub', vectorizedpy_sub),
    ('vectorized_gambs', vectorized_gambs),
    ('stupid_method', stupid_method),
    ('original_idea', original_idea),
    ('original_lil', original_lil),
    ('simple_sub', simple_sub),
#     ('frompysub1_mle', frompysub1_mle),
#     ('frompysub2_mle', frompysub2_mle),
    ('frompysub3_big', frompysub3_big),
    ('frompysub4_big', frompysub4_big),
#     ('numpy_array_sub', numpy_array_sub), # MLE
]

In [54]:
def timeSubtractions(repeat = 100, number = 100, pairs_name_func=to_run):
  d = {}
  for k, v in to_run:
    d[k] = timeit.repeat(
        stmt=v,
        globals=globals(), repeat=repeat, number=number)
  return repeat, number, d

# benchmark[big|small] -> nome -> (num, rep) -> (minimo, tempos)
benchmark = benchmark_small if mcp.shape[0] < 1000 else benchmark_big
for name, _ in to_run:
    if name not in benchmark:
        benchmark[name] = {}

rep, num , dic = timeSubtractions(repeat = 1, number=1)
# print(dic)
for name, _ in to_run:
    if (rep, num) not in benchmark[name]:
        benchmark[name][(rep, num)] = []
    benchmark[name][(rep, num)].extend( dic[name] )

# del benchmark
print(f"rep: {rep:3d}, num: {num:3d}")
for k,v in sorted(dic.items(), key=lambda x: min(x[1])):
    print(f"{k:25s}: {min(v):20f}")

rep:   1, num:   1
numpy_gambs_sub          :             3.316244


In [59]:
nn = 1
for _ in range(1):
    if not benchmark.get(nn):
      benchmark[nn] = {}
      for key, _ in to_run:
        benchmark[nn][key] = []
    else:
      d = benchmark[nn]
      for key, fun in to_run:
        if key not in d:
            benchmark[nn][key] = []

    dict_times = {}
    for key, func in to_run:
      globals()['func'] = func
      spent = timeit.repeat(stmt=f'func()', 
                      number=1, repeat=nn, globals=globals())
      dict_times[key] = spent
      benchmark[nn][key].append(spent)



    # dict_times['numpy_sub'] = loop_time_function(nn, numpy_array_sub)
    rl(aux)
    print(dict_times)
    aux.ShowOrderedDictList(dict_times, option='values')
    print('\n')

{'numpy_gambs_sub': [2.670005959000264], 'vectorizedpy_sub': [4.44353371500074], 'vectorized_gambs': [4.421214942000006], 'stupid_method': [4.582733500000359], 'original_idea': [4.570779548000246], 'original_lil': [6.863401900999634], 'simple_sub': [2.901210232000267], 'frompysub3_big': [27.09735780199935], 'frompysub4_big': [25.182184639000297]}
numpy_gambs_sub      => 2.6700060
simple_sub           => 2.9012102
vectorized_gambs     => 4.4212149
vectorizedpy_sub     => 4.4435337
original_idea        => 4.5707795
stupid_method        => 4.5827335
original_lil         => 6.8634019
frompysub4_big       => 25.1821846
frompysub3_big       => 27.0973578




In [None]:
rl(aux)
aux.ShowOrderedDictList(benchmark.get(nn, {}), option='values', smallTime=True)
print('====================')
aux.ShowOrderedDictList(benchmark.get(nn, {}), option='values', smallTime=False)