In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
import numpy as np
# Scratch check: nunique() counts distinct values, so the repeated 3 is counted once.
pd.Series([1,2,3,3,4]).nunique()

4

# Prepare data

In [52]:
%%time
# Load only the columns used in this notebook. Passing `columns=` lets the
# Parquet reader skip every other column instead of materialising the whole
# table first and dropping columns afterwards.
load_columns = ['user_id', 'url_host', 'request_cnt', 'part_of_day', 'date', 'price']
df = (
    pq.read_table('data/competition_data_final_pqt', columns=load_columns)
    .select(load_columns)  # cheap on an already-pruned table; pins the column order
    .to_pandas()
)

CPU times: user 1min 19s, sys: 1min 22s, total: 2min 41s
Wall time: 25.4 s


In [48]:
# (rows, columns) of the loaded frame.
# NOTE(review): the saved output reports 5 columns, but the load cell selects 6
# (including 'price') -- the output looks stale; re-run to confirm.
df.shape

(322899435, 5)

In [4]:
# Column names of the loaded frame as a plain Python list.
df.columns.tolist()

['user_id', 'url_host', 'request_cnt', 'part_of_day', 'date']

In [5]:
import tqdm

# Report how many distinct values each column holds, skipping the id column.
print("Unique values in column.")
for column_name in df.columns:
    if column_name == 'user_id':
        continue
    print(column_name, ":", len(df[column_name].unique()))

Unique values in column.
url_host : 199683
request_cnt : 15
part_of_day : 4
date : 396


In [6]:
# Number of rows per part_of_day value.
df['part_of_day'].value_counts()

day        107328399
evening     96239286
morning     85236015
night       34095735
Name: part_of_day, dtype: int64

### Impute price and build `event_time` from date + part of day

In [7]:
# Missing prices become 0, then each price is replaced by the integer index of
# its quantile bin (labels=False returns bin numbers instead of intervals).
#
# Bracket access is used for the assignments: attribute-style assignment only
# updates a column when that column already exists.
#
# duplicates='drop': after filling NaNs with 0 a single value can cover more
# than one decile edge, in which case qcut would raise on non-unique bin
# edges. Dropping duplicate edges keeps the result identical whenever the
# edges are unique and otherwise yields fewer than 10 bins instead of failing.
#
# NOTE(review): 'price' appears only in the commented-out cols_category list of
# the preprocessor cell, not in the active one -- confirm this binning is still needed.
df['price'] = df['price'].fillna(0)
df['price'] = pd.qcut(df['price'], 10, labels=False, duplicates='drop')

In [8]:
# Convert each row's date to an integer count of hours (datetime64[h] viewed as int64).
df['event_time'] = (
    pd.to_datetime(df["date"])
    .values
    .astype('datetime64[h]')
    .view('int64')
)

In [9]:
# Shift the hourly timestamp by a fixed offset per part of day.
# Note: this is an in-place `+=`, so running the cell twice adds the offset twice.
part_of_day_offset_hours = {'morning': 0, 'day': 6, 'evening': 12, 'night': 18}
df['event_time'] += df['part_of_day'].map(part_of_day_offset_hours)

In [10]:
# Preview the first rows, now including the event_time column.
df.head()

Unnamed: 0,user_id,url_host,request_cnt,part_of_day,date,event_time
0,45098,ad.adriver.ru,1,morning,2022-06-15,459792
1,45098,apple.com,1,morning,2022-06-19,459888
2,45098,avatars.mds.yandex.net,1,day,2022-06-12,459726
3,45098,googleads.g.doubleclick.net,1,day,2022-05-16,459078
4,45098,googleads.g.doubleclick.net,1,day,2022-05-30,459414


# Filter transactions

In [11]:
%%time
# Distinct users per host. The built-in grouped `nunique` replaces
# `agg` with a Python lambda, which is called once per group; the resulting
# frame has the same two columns: 'url_host' and 'user_id' (the distinct count).
df_urls = df.groupby('url_host')['user_id'].nunique().reset_index()
# Hosts seen for more than one distinct user.
urls_set = set(df_urls.loc[df_urls['user_id'] > 1, 'url_host'])

CPU times: user 52.1 s, sys: 8.68 s, total: 1min
Wall time: 1min


In [12]:
# (number of distinct hosts, number of hosts kept by the >1 user filter)
len(df_urls), len(urls_set)

(199683, 132025)

In [13]:
# Keep only rows whose host belongs to urls_set.
df = df.loc[df['url_host'].isin(urls_set)]

In [14]:
# Drop the 'date' column (event_time was derived from it) and the per-host
# counts frame, neither of which is used by the cells below.
del df['date']
del df_urls

# Create transactional data

In [16]:
# Preview the frame after host filtering and removal of the 'date' column.
df.head()

Unnamed: 0,user_id,url_host,request_cnt,part_of_day,event_time
0,45098,ad.adriver.ru,1,morning,459792
1,45098,apple.com,1,morning,459888
2,45098,avatars.mds.yandex.net,1,day,459726
3,45098,googleads.g.doubleclick.net,1,day,459078
4,45098,googleads.g.doubleclick.net,1,day,459414


In [17]:
from ptls.preprocessing import PandasDataPreprocessor

# Columns handed to the preprocessor as categorical and numerical features.
category_columns = ['url_host', 'part_of_day']
numeric_columns = ['request_cnt']

# Records are keyed by user_id and use event_time as the time column;
# 'none' presumably leaves event_time untransformed -- confirm against the ptls docs.
# return_records=False: the result is later written with .to_parquet, i.e. it is frame-like.
preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='event_time',
    event_time_transformation='none',
    cols_category=category_columns,
    #cols_category=['price', 'region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'url_host', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day'],
    cols_numerical=numeric_columns,
    return_records=False,
)

In [18]:
%%time
import pickle

# Fit the preprocessor on the filtered frame and transform it in one pass.
trans = preprocessor.fit_transform(df)

# Persist the fitted preprocessor next to the notebook.
with open('preprocessor.p', mode='wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

CPU times: user 5min 55s, sys: 2min 11s, total: 8min 6s
Wall time: 8min 6s


In [19]:
%%time

import pickle
import numpy as np


# Columns whose cell values are passed through np.array before writing.
cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']

# trans_save is a second name for `trans`, not a copy: the loop below
# rewrites the columns of `trans` itself.
trans_save = trans
for column_name in tqdm.tqdm(cols):
    converted = trans_save[column_name].apply(np.array)
    trans_save[column_name] = converted

# Write the transformed data as Parquet via the pyarrow engine.
trans_save.to_parquet('./data/trans_filtered.pq', engine='pyarrow')

100%|█████████████████████████████████████████████████████████████████████████████████| 4/4 [00:09<00:00,  2.50s/it]


CPU times: user 34.7 s, sys: 16 s, total: 50.7 s
Wall time: 50.4 s
