In [11]:
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
import pickle

import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
#from sklearn.preprocessing import MinMaxScaler
import vaex
import seaborn as sns
from sklearn.decomposition import TruncatedSVD

In [2]:
# --- Run configuration ------------------------------------------------------
# Input location: the dataset is opened from '../<LOCAL_DATA_PATH>/<DATA_FILE>'.
LOCAL_DATA_PATH = "context_data"
DATA_FILE = "competition_data_final_pqt"

# File names that are not referenced by the cells visible here.
TARGET_FILE = "public_train.pqt"
SUBMISSION_FILE = "submit_2.pqt"

# Seed handed to TruncatedSVD as random_state.
SPLIT_SEED = 42

In [3]:
# Open the dataset with vaex from the folder one level above the working directory.
df = vaex.open(
    f'../{LOCAL_DATA_PATH}/{DATA_FILE}'
)

In [4]:
%%time
# Group the three needed columns by (user_id, url_host) and aggregate
# request_cnt with "sum".
data_agg = (
    df[['user_id', 'url_host', 'request_cnt']]
    .groupby(['user_id', 'url_host'])
    .agg([('request_cnt', "sum")])
)

CPU times: total: 5min 34s
Wall time: 43.5 s


In [5]:
def _dense_index(values):
    """Return a dict mapping each distinct value to a 0-based position.

    Positions follow sorted order, so the same input always yields the same
    mapping. Iterating a bare ``set`` gives no such guarantee: for strings
    the iteration order changes with Python's per-process hash seed, which
    would make the matrix row/column assignment differ between kernel runs.
    """
    distinct = set(values)
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Values that cannot be compared with each other (e.g. a missing
        # value next to strings): order by repr instead, still deterministic.
        ordered = sorted(distinct, key=repr)
    return {value: position for position, value in enumerate(ordered)}


# Distinct url hosts -> column index.
url_set = set(data_agg['url_host'].unique())
print(f'{len(url_set)} urls')
url_dict = _dense_index(url_set)

# Distinct user ids -> row index.
usr_set = set(data_agg['user_id'].unique())
print(f'{len(usr_set)} users')
usr_dict = _dense_index(usr_set)

199683 urls
415317 users


In [8]:
%%time
# Assemble a CSR matrix from the aggregated table: one entry per row of
# data_agg, positioned by the usr_dict (row) and url_dict (column) lookups.
rows = data_agg['user_id'].map(usr_dict).to_numpy()
cols = data_agg['url_host'].map(url_dict).to_numpy()
values = data_agg['request_cnt'].values

# Shape is derived from the largest row / column index present.
mat = scipy.sparse.csr_matrix(
    (values, (rows, cols)),
    shape=(rows.max() + 1, cols.max() + 1),
)
mat

CPU times: total: 21.8 s
Wall time: 4.62 s


<415317x199683 sparse matrix of type '<class 'numpy.int64'>'
	with 32277669 stored elements in Compressed Sparse Row format>

In [9]:
%%time
# Reduce the sparse matrix to 1000 components with truncated SVD;
# random_state is fixed through SPLIT_SEED.
svd = TruncatedSVD(
    n_components=1000,
    n_iter=7,
    n_oversamples=15,
    random_state=SPLIT_SEED,
)
emb = svd.fit_transform(mat)

# Print the sum of the per-component explained variance ratios.
explained_total = svd.explained_variance_ratio_.sum()
print(explained_total)

0.9987050120259241
CPU times: total: 19min 59s
Wall time: 6min 44s


In [12]:
# Persist the embedding array. The target folder is created first so that the
# result of the long SVD step is not lost to FileNotFoundError when 'tsvd'
# does not exist yet; exist_ok=True makes re-running this cell safe.
# NOTE(review): only `emb` is written here. If it is loaded in another
# session, the row order (taken from `mat`) must be recoverable — confirm
# that the user-id lookup is stored somewhere as well.
os.makedirs('tsvd', exist_ok=True)
with open('tsvd/emb.pickle', 'wb') as outp:
    pickle.dump(emb, outp, pickle.HIGHEST_PROTOCOL)