# Preprocessing Pipelines

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Default figure size (width, height) applied to the plots drawn in this notebook.
sns.set(rc={'figure.figsize':(12,5)})
# Load IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local ones under ../modules) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2 

In [2]:
import random
import re
from itertools import combinations

from tqdm import tqdm

In [3]:
import sys
import os
sys.path.append(os.path.abspath('../modules'))
from lnds import longest_non_decreasing_subsequence as lnds

## Always

In [5]:
# Columns to read from the raw CSV; every other column is skipped at load time.
needed_columns = [
    'timestamp',
    'maid',
    'hh_id',
    'iiqid',
    'partner_id',
    'ip',
    'iscellip',
    'cellispid',
    'domain',
    'is_house_ip_or_source_ip',
    'brand',
    'model',
    'os',
    'osversion',
    'browser',
    'advertisedbrowser',
    'browserversion',
    'type',
    'is_best_ip',
]

In [17]:
# Explicit pandas dtypes for the CSV load. Columns not listed here are left to
# pandas' own type inference.
dtype_dict = {
    'maid': 'category',
    'hh_id': 'string',
    'iscellip': 'boolean',
    'domain': 'string',
    'is_house_ip_or_source_ip': 'boolean',
    'brand': 'category',
    'model': 'category',
    'os': 'category',
    'osversion': 'string',
    'browser': 'category',
    'advertisedbrowser': 'string',
    'browserversion': 'string',
    'is_best_ip': 'boolean',
}

In [21]:
# Load only the needed columns from the sample CSV, using the explicit dtypes.
data = pd.read_csv(
    "../Data/random_10k.csv",
    usecols=needed_columns,
    dtype=dtype_dict,
)

In [22]:
# Discard every row that has a missing value in any of the loaded columns.
data.dropna(axis=0, how='any', inplace=True)

In [77]:
# data = data[(data['maid'] == 4) | (data['maid'] == 7)]

In [23]:
# Convert the millisecond epoch in `timestamp` into a datetime column `time`.
timestamps_ms = data['timestamp']
data['time'] = pd.to_datetime(timestamps_ms, unit='ms')

In [24]:
# Keep only rows whose OS version is purely dotted-numeric (e.g. "10" or "10.3.1").
pattern = re.compile(r'^\d+(\.\d+)*$')
os_version_ok = data['osversion'].str.match(pattern)
data = data.loc[os_version_ok]

In [25]:
# Keep only rows whose browser version is purely dotted-numeric (e.g. "96" or "96.0.4664").
pattern = re.compile(r'^\d+(\.\d+)*$')
browser_version_ok = data['browserversion'].str.match(pattern)
data = data.loc[browser_version_ok]

In [44]:
# Flag rows whose household id consists only of numeric characters.
# `data` is the result of boolean filtering at this point, so the column is
# added with `assign` (which returns a new frame) rather than by item
# assignment, which can raise SettingWithCopyWarning. The vectorized
# `.str.isnumeric()` replaces the per-row Python lambda; `astype(bool)` keeps
# the plain bool dtype that the lambda-based `apply` produced.
data = data.assign(is_hh=data['hh_id'].str.isnumeric().astype(bool))

In [45]:
# Keep only events recorded on or after the cutoff date.
cutoff = pd.to_datetime("2022-12-01")
data = data.loc[data["time"] >= cutoff]

Remove cookies with mixed constants, i.e. cookies whose `brand`, `model`, `os` or `browser` takes more than one value.

In [46]:
# For each (household, cookie) pair, count the distinct values seen in each column.
unique_values_per_cookie = (
    data
    .groupby(["hh_id", "iiqid"])[["timestamp", "brand", "model", "os", "browser"]]
    .nunique()
)

In [47]:
# Total number of distinct values across the four device attributes of each cookie.
# The columns are selected by name instead of by position (`iloc[:, 1:]`), so that
# re-running this cell does not add the already-existing "sum" column into itself.
unique_values_per_cookie["sum"] = unique_values_per_cookie[["brand", "model", "os", "browser"]].sum(axis=1)

In [48]:
# A cookie is "mixed" when any of its device attributes takes more than one value.
# The previous thresholds (== 5 / > 5) assumed five summed columns, but `timestamp`
# is excluded from the sum, so a fully consistent cookie sums to 4. Cookies with
# exactly one conflicting attribute (sum == 5) were therefore never removed.
# Testing the attributes directly avoids depending on a magic total.
# The names `equal_5` / `more_than_5` are kept because later cells refer to them:
#   equal_5      -> consistent cookies
#   more_than_5  -> mixed cookies
constant_columns = ["brand", "model", "os", "browser"]
is_mixed = unique_values_per_cookie[constant_columns].gt(1).any(axis=1)
equal_5 = unique_values_per_cookie[~is_mixed]
more_than_5 = unique_values_per_cookie[is_mixed]

In [71]:
# Cookie ids (second index level, "iiqid") of the groups held in `more_than_5`.
mixed_iiqids = more_than_5.index.get_level_values("iiqid").tolist()

In [76]:
# Drop every row that belongs to one of the mixed cookies.
belongs_to_mixed_cookie = data['iiqid'].isin(mixed_iiqids)
data = data.loc[~belongs_to_mixed_cookie]

Save the results

In [82]:
# Persist the preprocessed frame; the relative path resolves against the
# notebook's working directory.
data.to_pickle(path="random_10k.pickle")

## Case based

Only third-party cookies and MAIDs:

In [38]:
# data[(data['maid'] == 4) | (data['maid'] == 7)]

Randomize 

In [10]:
# Distinct household ids remaining after preprocessing.
houses = data['hh_id'].unique()
# Every unordered pair of households, materialized as a list of 2-tuples.
# `combinations` is itertools.combinations and has to be imported in the
# notebook's import cells. The list has n*(n-1)/2 entries for n households,
# so it grows quadratically.
pairs = list(combinations(houses, 2))

In [12]:
# Number of household pairs to draw (upper bound).
sample_size = 2000
# A dedicated, seeded generator makes the drawn pairs reproducible across
# kernel restarts without touching the global `random` state.
pair_rng = random.Random(42)
# `sample` raises ValueError when asked for more items than are available, so
# the request is capped at the number of existing pairs.
random_pairs = pair_rng.sample(pairs, min(sample_size, len(pairs)))

In [13]:
# Work on the first sampled household pair.
pair = random_pairs[0]

# Rows of the two households, ordered chronologically.
in_pair = data['hh_id'].isin(pair)
pair_data = data.loc[in_pair].copy()
pair_data = pair_data.sort_values(by='time')

# Pick one random cookie from each household (first household first, so the
# random draws happen in the same order as before).
iiqids_per_house = [
    pair_data.loc[pair_data['hh_id'] == house, 'iiqid'].unique()
    for house in pair
]
random_iiqid_0, random_iiqid_1 = (np.random.choice(ids) for ids in iiqids_per_house)

# Map the two chosen cookies to the labels 0 and 1.
serialize_id = dict(zip((random_iiqid_0, random_iiqid_1), range(2)))