In [1]:
import pandas as pd
import os
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
import helpers


items_dict = helpers.load_items()
interactions_train = helpers.load_interactions_df()

In [10]:
items_dict[1748830]

{'title': 'Relógio Medidor Inteligente Pulso Freqüência Cardíaca M3',
 'domain_id': 'MLB-SMARTWATCHES',
 'product_id': nan,
 'price': 90.0,
 'category_id': 'MLB135384',
 'condition': 'new'}

In [11]:
domain_item_dict = {}
for item in items_dict.keys():
    domain = items_dict[item]['domain_id']
    if domain not in domain_item_dict:
        domain_item_dict[domain] = []
    domain_item_dict[domain].append(item)

In [16]:
len(domain_item_dict['MLM-INDIVIDUAL_HOUSES_FOR_SALE'])

723

In [21]:
' '.join([items_dict[x]['title'] for x in domain_item_dict['MLM-INDIVIDUAL_HOUSES_FOR_SALE']])

'Casa Sola En Venta Con Gran Patio Solo Pago De Contado. Se Vende Hotel Boutique De 5 Estrellas, Con 37 Suites Con Vistas Al Mar Caribe Casa En Col. Club De Golf Bellavista, Tlalnepantla. Edo. Mex Casa De 3 Niveles A La Venta En Tultitlán, Izcalli Del Valle Casas En Venta En Fovissste, Manzanillo Majestuosa Residencia Con Inmejorable Ubicación Querétaro, Paseos Del Marqués, Casita En Venta! Casa Remodelada, Excelente Ubicación En Toluca Casa En Condominio - Villas Del Campo Casa En Venta En Centrio Casa En Venta En Bosques De La Florida, San Luis Potosi Casa De Campo En Venta En Villa Guerrero, Ixtapan De La Sal, Estado De México Hermosa Casa Sola En Privada Con Portón Eléctrico En Ahuatepec Bosque De Framboyanes Casa Nueva En Venta (vw) Villa Maya Kankabal, Carretera Izamal-tunkas Km. 21 Linda Casa.  Recuperación Bancaria  Cdmx Desierto De Los Leones, Preciosa Casa Con Salón De Juegos Y Hermoso Jardín Plano Apizaco, Fracc. Parque Ingles, Vendo Recidencia Nueva Casas Nuevas Fraccionami

In [3]:
interactions_train[interactions_train['event_type'] == 'search']

Unnamed: 0,user_id,item_id,event_type,event_timestamp,target
2,0,RELOGIO SMARTWATCH,search,2019-10-19T11:26:07.063-0400,1748830
20,1,DESMAMADEIRA ELETRICA,search,2019-10-07T09:45:29.322-0400,228737
22,1,DESMAMADEIRA ELETRICA,search,2019-10-07T09:46:17.100-0400,228737
23,1,DESMAMADEIRA ELETRICA,search,2019-10-07T09:46:19.173-0400,228737
25,1,DESMAMADEIRA ELETRICA,search,2019-10-07T18:53:20.113-0400,228737
...,...,...,...,...,...
11999155,413160,ALUGUEL BOB CAT ESCAVADEIRA,search,2019-10-15T07:14:39.241-0400,2022477
11999156,413161,XAOMI,search,2019-10-03T21:15:49.220-0400,1111021
11999157,413161,XAOMI,search,2019-10-03T21:15:52.335-0400,1111021
11999158,413161,XAOMI,search,2019-10-03T21:16:33.369-0400,1111021


In [None]:
len([x for x in items_dict.keys() if items_dict[x].condition == 'new'])

In [2]:
keys = list(items_dict.keys())

def normalize(item_title):
    return item_title.upper().strip().replace('.', '')

document_to_item = collections.defaultdict(list)
for x in keys:
    document_to_item[normalize(items_dict[x]['title'])].append(x)
    
documents = list(document_to_item.keys())

vectorizer = TfidfVectorizer(stop_words=['spanish', 'portuguese'], strip_accents='unicode', min_df=5)
X = vectorizer.fit_transform(documents)

In [3]:
search_queries = list(interactions_train[interactions_train['event_type'] == 'search']['item_id'].dropna().apply(normalize).unique())
len(search_queries)

847430

In [4]:
q = vectorizer.transform(search_queries)
q

<847430x377332 sparse matrix of type '<class 'numpy.float64'>'
	with 2752723 stored elements in Compressed Sparse Row format>

In [10]:
%%time
n = 10
heap = []
indexed_results = {}

for j in range(200):#q.shape[0])
    heap = []
    y = q.getrow(j)
    r = X.multiply(y).sum(1)
    rows, cols = r.nonzero()
    for i in range(len(rows)):
        val = r[rows[i], cols[i]]
        heapq.heappush(heap, (val, document_to_item[documents[rows[i]]]))
        if len(heap) > n:
            heapq.heappop(heap)
    indexed_results[search_queries[j]] = sorted(heap, key=lambda x: -x[0])
    if j % 1000 == 0 and j > 0:
        print('Saving pickle...')
        with open('./search.pickle', 'wb') as handle:
            pickle.dump(indexed_results, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Saved pickle!')
    
        

CPU times: user 55.8 s, sys: 4.43 s, total: 1min
Wall time: 1min


In [35]:
%%time
n = 10
indexed_results = {}
step = 1000

def f(j):
    y = q.getrow(j)
    r = X.multiply(y).sum(1)
    #rows, cols = r.nonzero()
    #for i in range(len(rows)):
    #    val = r[rows[i], cols[i]]
    #    heapq.heappush(heap, (val, document_to_item[documents[rows[i]]]))
    #    if len(heap) > n:
    #        heapq.heappop(heap)
    indexed_results[search_queries[j]] = document_to_item[documents[np.argmax(r)]][0]#sorted(heap, key=lambda x: -x[0])
    #print(f"Finished {j}")
    if j % 1000 == 0 and j > 0:
        print('Saving pickle...')
        with open('./search.pickle', 'wb') as handle:
            pickle.dump(indexed_results, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Saved pickle!')

#f(0)
for i in range(100):
    f(i)
    

CPU times: user 17 s, sys: 2.24 s, total: 19.2 s
Wall time: 19.2 s


In [34]:
indexed_results

{'RELOGIO SMARTWATCH': [95155, 313617, 1512214, 814788, 1469577],
 'DESMAMADEIRA ELETRICA': [1156086],
 'RADIOBOSS': [1230082],
 'SOUND FORGE': [337971],
 'SOUND FORGE PLUGINS': [937557],
 'AMAZFIT BIP': [1968144, 1258600, 1364544, 1821786],
 'AMAZFIT BIPAMAZFIT BIP LITE': [66654],
 'AMAZFIT BIPAMAZFIT BIP': [1968144, 1258600, 1364544, 1821786],
 'PULA PULA': [2058570, 2014165],
 'CABANA': [634923],
 'CARREGADOR 34A': [939286],
 'KINGS 34': [909618],
 'KINGS 34A': [939286],
 'GALAXY BUDS': [1087758, 1520178],
 'FONE XIAOMI': [1886003],
 'TAMPA TRASEIRA IPHONE8': [68885],
 'CAMBIO C4 PALLAS': [1782636],
 'LEECO COOL 1 DISPLAY': [1855680],
 'LEECO COOL 1 DISPLAY C103': [1855680],
 'SECADOR TAIFF': [40480],
 'AUDIFONOS BASS': [426544],
 'ILUMINADOR LED RING LIGHT MAQUIAGEM MAKE FOTOS': [426711],
 'HARMAN KARDON': [1745702],
 'MEMORIA DDR4': [852207],
 'MEMORIA DDR4 NOTEBOOK': [1796465],
 'CARABINAS CHUMBINHO 55': [1091164],
 'CARABINAS CHUMBINHO 45': [830159],
 'ESPINGARDA CHUMBINHO 55': 

In [None]:
results[i] = np.argsort(r.flatten())[-3:][::-1]
print('Saving pickle...')
with open('./search.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved pickle!')

In [1]:
from numba import jit, cuda 
import numpy as np 
# to measure exec time 
from timeit import default_timer as timer    
  
# normal function to run on cpu 
def func(a):                                 
    for i in range(10000000): 
        a[i]+= 1      
  
# function optimized to run on gpu  
@jit(target ="cuda")                          
def func2(a):
    for i in range(10000000): 
        a[i]+= 1
if __name__=="__main__": 
    n = 10000000                            
    a = np.ones(n, dtype = np.float64) 
    b = np.ones(n, dtype = np.float32) 
      
    start = timer() 
    func(a) 
    print("without GPU:", timer()-start)     
      
    start = timer() 
    func2(a) 
    print("with GPU:", timer()-start) 

ModuleNotFoundError: No module named 'numba'

In [None]:
%%time

data = {
    'user_id': [],
    'item_id': [],
    'event_type': [],
    'event_timestamp': [],
    'target': []
}

for row in interactions_train[interactions_train['event_type'] == 'search'].iterrows():
    if row['item_id'] not in search_map:
        continue
        
    for r in search_map[row['item_id']]:
        data['user_id'].append(row['user_id'])
        data['item_id'].append(r) 
        data['event_type'].append(row['event_type'])
        data['event_timestamp'].append(row['event_timestamp'])
        data['target'].append(row['target'])

In [None]:
results