In [1]:
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
import pickle
import datetime

import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
from scipy.sparse import csr_matrix, coo_matrix, vstack, save_npz

import vaex

#from tqdm.notebook import tqdm

In [2]:
# --- Input locations -------------------------------------------------------
LOCAL_DATA_PATH = 'context_data'          # data directory name (used as ../<dir>)
DATA_FILE = 'competition_data_final_pqt'  # dataset opened with vaex
TARGET_FILE = 'public_train.pqt'          # not referenced in the visible cells

# --- Output / run settings -------------------------------------------------
SUBMISSION_FILE = 'submit_2.pqt'          # not referenced in the visible cells
SPLIT_SEED = 42                           # not referenced in the visible cells

In [3]:
# Open the dataset with vaex; the location is resolved relative to the
# parent of the current working directory.
dataset_path = f'../{LOCAL_DATA_PATH}/{DATA_FILE}'
df = vaex.open(dataset_path)

In [4]:
%%time
# Collapse the log to one row per (user_id, url_host) pair, summing request_cnt.
data_agg = (
    df[['user_id', 'url_host', 'request_cnt']]
    .groupby(['user_id', 'url_host'])
    .agg([('request_cnt', "sum")])
)

CPU times: total: 5min 13s
Wall time: 39.2 s


In [5]:
# Build contiguous integer ids for every distinct host (matrix columns) and
# every distinct user (matrix rows) found in the aggregated table.
#
# The ids are assigned from the *sorted* distinct values rather than from raw
# set iteration order: Python randomizes string hashing per process (unless
# PYTHONHASHSEED is pinned), so enumerating a set of strings would give a
# different host -> column assignment on every kernel restart and make the
# saved matrix impossible to reproduce.
# NOTE(review): sorting assumes the values are mutually comparable, i.e. the
# columns contain no missing entries — confirm against the dataset.
url_set = set(data_agg['url_host'].unique())
print(f'{len(url_set)} urls')
url_dict = {url: idurl for idurl, url in enumerate(sorted(url_set))}

usr_set = set(data_agg['user_id'].unique())
print(f'{len(usr_set)} users')
usr_dict = {usr: user_id for user_id, usr in enumerate(sorted(usr_set))}

199683 urls
415317 users


In [6]:
%%time
# Assemble the user x host sparse matrix of summed request counts.

# Summed counts can exceed the uint16 range (65535); a blind cast would wrap
# around silently. Keep uint16 whenever the data fits (compact, and identical
# to the previous behaviour) and widen only when it does not.
raw_counts = data_agg['request_cnt'].values
count_max = raw_counts.max() if raw_counts.size else 0
value_dtype = next(
    dt for dt in (np.uint16, np.uint32, np.uint64)
    if count_max <= np.iinfo(dt).max
)
values = raw_counts.astype(value_dtype)

# Translate the original keys into row / column indices via the lookup tables.
rows = data_agg['user_id'].map(usr_dict).to_numpy()
cols = data_agg['url_host'].map(url_dict).to_numpy()

# Size the matrix from the lookup tables so its dimensions always agree with
# the dictionaries that are saved next to it (and an empty table does not
# crash on .max()).
mat = csr_matrix((values, (rows, cols)), shape=(len(usr_dict), len(url_dict)))
mat

CPU times: total: 19.9 s
Wall time: 4.05 s


<415317x199683 sparse matrix of type '<class 'numpy.uint16'>'
	with 32277669 stored elements in Compressed Sparse Row format>

In [7]:
# Persist the sparse matrix in the working directory.
save_npz('mat.npz', mat)

# Store both lookup tables together as a single (url_dict, usr_dict) tuple.
with open('url_usr_dicts.pickle', 'wb') as dict_file:
    pickle.dump((url_dict, usr_dict), dict_file, protocol=pickle.HIGHEST_PROTOCOL)