In [1]:
import sys
sys.path.append('../')

from typing import List, Dict, Tuple, Optional, Union
import os
import requests
from glob import glob
import json
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from dataset.utils import *

# Root directory holding the data files read and written by this notebook.
DATA_DIR = "/opt/datasets/thebackend"
# Endpoint URLs built on BASE_URL. BASE_URL is not defined in this notebook;
# presumably it comes from the wildcard import of dataset.utils — TODO confirm.
PUSH_URL = BASE_URL + "/push-data"
LOGIN_URL = BASE_URL + "/login-data"

# Lower-case three-letter month abbreviations, January through December.
MONTHS_STR = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

# Path of the push parquet file inside DATA_DIR.
# NOTE(review): the download log printed by download_push mentions
# "push.parquet.gzip", which differs from this file name — confirm which is used.
PUSH_DATA = os.path.join(DATA_DIR, "push.parquet")

In [2]:
# Push records come from the project helper, which is asked not to overwrite an
# existing file in DATA_DIR.
push_df = download_push(save_dir=DATA_DIR, overwrite=False, verbose=True)

# Login and CRUD logs are read from gzip-suffixed parquet files in DATA_DIR,
# in that order.
login_df, crud_df = (
    pd.read_parquet(os.path.join(DATA_DIR, file_name))
    for file_name in ('login_APR_JUL.parquet.gzip', 'crud_APR_JUL.parquet.gzip')
)

File /opt/datasets/thebackend/push.parquet.gzip already exists. Skipping...


In [9]:
# Positional row numbers (not labels) of the push rows used to build datasets.
# NOTE(review): these positions are only meaningful for one particular row
# order of push_df — confirm they still point at the intended rows whenever
# the push file is re-downloaded.
PUSH_ROW_POSITIONS = [21961, 24526, 27890, 28192, 30293, 29276]

avail_push = push_df.iloc[PUSH_ROW_POSITIONS]
avail_push

Unnamed: 0,timestamp,game_id,is_ad
21961,2022-04-16 12:00:00,2097,True
24526,2022-05-14 12:00:00,2097,True
27890,2022-06-18 12:00:00,2097,True
28192,2022-06-21 12:30:00,3065,True
30293,2022-07-12 12:30:00,3065,True
29276,2022-07-02 12:30:00,1585,True


In [10]:
def create_thebackend_dataset(t, g):
    """Build the dataset for one push event with this notebook's fixed settings.

    Thin wrapper around ``create_dataset`` that pins every argument except the
    push timestamp and the game id. ``login_df`` and ``crud_df`` are looked up
    in the notebook namespace at call time, so the loading cell must have been
    run first.

    Args:
        t: Value forwarded as ``timestamp``.
        g: Value forwarded as ``game_id``.

    Returns:
        Whatever ``create_dataset`` returns for these arguments.
    """
    # NOTE(review): units of duration / before_day / after_hours are defined by
    # create_dataset, which is not visible here — presumably days and hours.
    return create_dataset(
        timestamp=t,
        game_id=g,
        duration=7,
        after_hours=[3, 6, 12],
        before_day=7,
        login_df=login_df,
        crud_df=crud_df,
    )

In [11]:
# NOTE(review): DISABLE_TQDM is assigned here but not read anywhere in the
# visible notebook code — confirm whether anything actually consumes it.
DISABLE_TQDM = True

# Build one dataset per selected push row. zip() has no length, so tqdm is told
# the total explicitly; otherwise the bar can only show "0it [00:00, ?it/s]".
all_datasets = [
    {'timestamp': t, 'game_id': g, 'data': create_thebackend_dataset(t, g)}
    for t, g in tqdm(
        zip(avail_push['timestamp'], avail_push['game_id']),
        total=len(avail_push),
    )
]

0it [00:00, ?it/s]

Number of gamers for T=1 in game_id 2097: 936


  0%|          | 0/936 [00:00<?, ?it/s]

Number of gamers for T=0 in game_id 2097: 1098


  0%|          | 0/1098 [00:00<?, ?it/s]

Number of gamers for T=1 in game_id 2097: 581


  0%|          | 0/581 [00:00<?, ?it/s]

Number of gamers for T=0 in game_id 2097: 722


  0%|          | 0/722 [00:00<?, ?it/s]

Number of gamers for T=1 in game_id 2097: 393


  0%|          | 0/393 [00:00<?, ?it/s]

Number of gamers for T=0 in game_id 2097: 428


  0%|          | 0/428 [00:00<?, ?it/s]

Number of gamers for T=1 in game_id 3065: 204


  0%|          | 0/204 [00:00<?, ?it/s]

Number of gamers for T=0 in game_id 3065: 91


  0%|          | 0/91 [00:00<?, ?it/s]

Number of gamers for T=1 in game_id 3065: 187


  0%|          | 0/187 [00:00<?, ?it/s]

Number of gamers for T=0 in game_id 3065: 147


  0%|          | 0/147 [00:00<?, ?it/s]

Number of gamers for T=1 in game_id 1585: 1278


  0%|          | 0/1278 [00:00<?, ?it/s]

Number of gamers for T=0 in game_id 1585: 1261


  0%|          | 0/1261 [00:00<?, ?it/s]

In [28]:
# Per-dataset list of len(entry['X']) for every entry, computed once and reused
# by all length statistics below instead of being rebuilt for each column.
lengths_per_dataset = [
    [len(entry['X']) for entry in d['data']]
    for d in all_datasets
]

# One summary row per dataset in all_datasets.
dataset_info = pd.DataFrame({
    'timestamp': [d['timestamp'] for d in all_datasets],
    'game_id': [d['game_id'] for d in all_datasets],
    # Number of entries vs. number of distinct gamer_id values among them.
    'total_users': [len(d['data']) for d in all_datasets],
    'unique_users': [len(np.unique([entry['gamer_id'] for entry in d['data']])) for d in all_datasets],
    # Statistics over the per-entry lengths of X.
    'min_length': [np.min(lengths) for lengths in lengths_per_dataset],
    'max_length': [np.max(lengths) for lengths in lengths_per_dataset],
    'avg_length': [np.mean(lengths) for lengths in lengths_per_dataset],
    'std_length': [np.std(lengths) for lengths in lengths_per_dataset],
    'total_length': [np.sum(lengths) for lengths in lengths_per_dataset],
    # Entry counts per T value; the arithmetic assumes T is 0 or 1.
    'users_t1': [sum(entry['T'] for entry in d['data']) for d in all_datasets],
    'users_t0': [sum(1 - entry['T'] for entry in d['data']) for d in all_datasets],
    # Mean of the first component of Y across entries.
    # NOTE(review): presumably the first after_hours horizon — confirm.
    'p(Y=1)': [np.array([entry['Y'] for entry in d['data']]).mean(axis=0)[0] for d in all_datasets],
})
dataset_info

Unnamed: 0,timestamp,game_id,total_users,unique_users,min_length,max_length,avg_length,std_length,total_length,users_t1,users_t0,p(Y=1)
0,2022-04-16 12:00:00,2097,2034,1523,1,32492,562.267453,1169.9168,1143652,936,1098,0.114553
1,2022-05-14 12:00:00,2097,1303,992,1,105907,701.260936,3248.876255,913743,581,722,0.124328
2,2022-06-18 12:00:00,2097,821,632,1,31904,697.386114,1549.618272,572554,393,428,0.121803
3,2022-06-21 12:30:00,3065,295,279,4,5054,604.789831,891.370636,178413,204,91,0.010169
4,2022-07-12 12:30:00,3065,334,297,1,8599,339.128743,988.845406,113269,187,147,0.05988
5,2022-07-02 12:30:00,1585,2539,1637,1,28146,5416.778259,5186.774487,13753200,1278,1261,0.224498


In [22]:
# Look at the first entry of the first dataset and show its gamer_id.
# NOTE(review): the output recorded under this cell shows a whole entry dict
# (gamer_id, X, Y, T), so it appears to come from a different version of this
# expression — re-run to refresh.
first_entry = all_datasets[0]['data'][0]
first_entry['gamer_id']

{'gamer_id': 'DB35469139C04074A9F7B3CD30026AFCBCB1BE7067EFA330471B074C74401894',
 'X':                    timestamp method  action
 0    2022-04-09 12:02:01.000  LOGIN       1
 1    2022-04-09 12:03:17.838   POST      11
 2    2022-04-09 12:23:56.065   POST      11
 3    2022-04-09 12:45:28.907   POST      11
 4    2022-04-09 13:02:12.790    GET      24
 ...                      ...    ...     ...
 1756 2022-04-16 11:41:19.154    GET       7
 1757 2022-04-16 11:41:19.355    GET       7
 1758 2022-04-16 11:41:19.399    GET       7
 1759 2022-04-16 11:41:19.405    GET       7
 1760 2022-04-16 11:42:32.630   POST      11
 
 [1761 rows x 3 columns],
 'Y': array([1., 1., 1.]),
 'T': 1}

In [52]:
# Versioned output directory; resolves to "/opt/datasets/thebackend/v0.1".
SAVE_DIR = os.path.join(DATA_DIR, "v0.1")
os.makedirs(SAVE_DIR, exist_ok=True)

# Integer codes written to the `method` column of the saved frames.
METHOD_CODES = {'MASK': 0, 'LOGIN': 1, 'GET': 2, 'POST': 3, 'PUT': 4, 'DELETE': 5}

# Layout: <SAVE_DIR>/<push timestamp>_<game_id>/<gamer_id>_T=<T>.parquet plus
# an info.json in each folder listing file name, Y and T for every entry.
for d in all_datasets:
    timestamp = d['timestamp']
    game_id = d['game_id']
    save_path = os.path.join(SAVE_DIR, f"{timestamp.strftime('%Y%m%d-%H%M%S')}_{game_id}")
    os.makedirs(save_path, exist_ok=True)

    infos = []
    for entry in d['data']:
        gamer_id = entry['gamer_id']
        raw_X = entry['X']
        Y = entry['Y']
        T = entry['T']

        # Fail with a readable message instead of an opaque cast error when a
        # method label has no code.
        unknown_methods = set(raw_X['method'].unique()) - set(METHOD_CODES)
        if unknown_methods:
            raise ValueError(
                f"Unknown method value(s) {sorted(map(str, unknown_methods))} "
                f"for gamer {gamer_id} (T={T})"
            )

        # Encode `method` into its own column. Previously the encoded methods
        # were assigned to `action`, which discarded the action ids and left
        # `method` as raw strings. assign() returns a new frame, so the
        # in-memory dataset is untouched and this cell can be re-run safely.
        X = raw_X.assign(
            action=raw_X['action'].astype(np.int8),
            method=raw_X['method'].map(METHOD_CODES).astype(np.int8),
        )

        # NOTE(review): the name depends only on gamer_id and T; if the same
        # gamer appears twice with the same T in one dataset, the later file
        # overwrites the earlier one — confirm this cannot happen.
        file_name = f"{gamer_id}_T={T}.parquet"
        infos.append({'X': file_name, 'Y': Y.tolist(), 'T': T})
        X.to_parquet(os.path.join(save_path, file_name))

    with open(os.path.join(save_path, "info.json"), "w") as f:
        json.dump(infos, f)