# Сохраним данные о покупках для валидации

In [1]:
import os
import csv
import json
import random
import pickle
import glob
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this
# installation actually provides.
_poster_style = (
    'seaborn-v0_8-poster'
    if 'seaborn-v0_8-poster' in plt.style.available
    else 'seaborn-poster'
)
plt.style.use(_poster_style)

%matplotlib inline

## Все в один большой словарь

In [2]:
%%time
# Build one nested dict from the flat purchases file:
#   purchase_data[client_id][transaction_id] = {
#       'datetime', 'purchase_sum', 'store_id', 'products': [{'product_id', 'quantity'}, ...]
#   }
# All values stay as the raw strings read from the CSV.
purchase_data = {}

# The csv module asks for files to be opened with newline='' so that the
# reader itself handles line endings (including ones inside quoted fields).
with open('../../uplift_modeling/retailhero-uplift/data/purchases.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in tqdm(reader):
        # one dict of transactions per client, created on first sight
        client_transactions = purchase_data.setdefault(row['client_id'], {})

        # the transaction header is taken from the first row of that transaction
        transaction = client_transactions.get(row['transaction_id'])
        if transaction is None:
            transaction = {
                'datetime': row['transaction_datetime'],
                'purchase_sum': row['purchase_sum'],
                'store_id': row['store_id'],
                'products': [],
            }
            client_transactions[row['transaction_id']] = transaction

        # every CSV row contributes one product line to its transaction
        transaction['products'].append({
            'product_id': row['product_id'],
            'quantity': row['product_quantity'],
        })

print(len(purchase_data))

45786568it [03:37, 210089.07it/s]

400162
CPU times: user 3min 41s, sys: 7.51 s, total: 3min 49s
Wall time: 3min 37s





In [4]:
# Spot-check: display the stored transactions of a single client id.
purchase_data['000012768d']

{'7e3e2e3984': {'datetime': '2018-12-01 07:12:45',
  'purchase_sum': '1007.0',
  'store_id': '54a4a11a29',
  'products': [{'product_id': '9a80204f78', 'quantity': '2.0'},
   {'product_id': 'da89ebd374', 'quantity': '1.0'},
   {'product_id': '0a95e1151d', 'quantity': '1.0'},
   {'product_id': '4055b15e4a', 'quantity': '2.0'},
   {'product_id': 'a685f1916b', 'quantity': '1.0'},
   {'product_id': '21db5dbe53', 'quantity': '1.0'},
   {'product_id': '1e208d0b4c', 'quantity': '1.0'},
   {'product_id': '15ccaa8685', 'quantity': '1.0'},
   {'product_id': '45389bb5b0', 'quantity': '1.0'},
   {'product_id': 'cb4c804130', 'quantity': '1.0'},
   {'product_id': '7c39f1d12c', 'quantity': '1.0'},
   {'product_id': '63e2eac70d', 'quantity': '1.0'},
   {'product_id': 'c7cc613e79', 'quantity': '1.0'},
   {'product_id': 'ad865591c6', 'quantity': '1.0'},
   {'product_id': '7118c66f7f', 'quantity': '1.0'},
   {'product_id': 'c55ed13ebd', 'quantity': '1.0'},
   {'product_id': '9cdad65286', 'quantity': '1.0'

## Сохраняем большой словарь по чанкам

In [3]:
%%time
# Dump purchase_data to TSV files of at most `chunk_size` clients each.
# Every line is: json(client_id) <TAB> json(client transactions).
chunk_size = 20000
file_name_template = '../_processed_data/client_tr_history_{num}.tsv'

client_ids = list(purchase_data)

with tqdm(total=len(client_ids)) as progress:
    for chunk_ind, chunk_start in enumerate(range(0, len(client_ids), chunk_size)):
        file_name = file_name_template.format(num=str(chunk_ind))

        # Each chunk file is opened exactly once. 'w' (not 'a') keeps the cell
        # idempotent: re-running it rewrites the chunk instead of appending a
        # second copy of every client.
        with open(file_name, 'w') as chunk_file:
            for client_id in client_ids[chunk_start:chunk_start + chunk_size]:
                chunk_file.write(
                    json.dumps(client_id) + '\t' + json.dumps(purchase_data[client_id]) + '\n'
                )
                progress.update(1)

100%|██████████| 400162/400162 [00:40<00:00, 9919.20it/s] 

CPU times: user 36 s, sys: 4.36 s, total: 40.4 s
Wall time: 40.3 s





## Выделяем клиентов для валидации check_queries_01_11

- Подходят те, кто совершал транзакции после 2019-03-01
- При этом каждая транзакция после 2019-03-01 может быть использована как валидационная

In [2]:
%%time
# Build validation queries (check_queries_01_11_<chunk>.tsv).
# For every client in the sampled chunks that has at least one transaction
# after `valid_time`, one such transaction is picked at random as the target;
# everything strictly before it becomes the query history.
seed = 7777
num_chunks_valid = 5
dt_format = '%Y-%m-%d %H:%M:%S'
valid_time = datetime.strptime('2019-03-01 00:00:00', dt_format)

# take several chunks only for validation
# NOTE: glob.glob() returns paths in arbitrary order, so the seeded sample is
# reproducible only while the directory listing order stays the same.
random.seed(seed)
valid_chunks = random.sample(glob.glob('../_processed_data/client_tr_history_*'), k=num_chunks_valid)
print(valid_chunks)

for chunk_file in valid_chunks:
    ind = chunk_file.split('_')[-1].split('.')[0]
    valid_file_name = '../_processed_data/check_queries_01_11_{num}.tsv'.format(num=ind)

    # The output is opened once per chunk and closed by the context manager
    # even on errors. 'w' (not 'a') makes re-running the cell rewrite the
    # file instead of appending duplicate queries.
    with open(chunk_file, 'r') as chunk, open(valid_file_name, 'w') as valid_file:
        for row in tqdm(chunk):
            raw_client_id, raw_history = row.split('\t')
            client_id = json.loads(raw_client_id)
            transaction_history = json.loads(raw_history)

            # tr history: dict -> list sorted by the datetime string
            sorted_transactions = sorted(transaction_history.values(), key=lambda tr: tr['datetime'])
            # parse every timestamp exactly once, aligned with sorted_transactions
            transaction_times = [
                datetime.strptime(tr['datetime'], dt_format) for tr in sorted_transactions
            ]

            # candidates for the validation transaction; skip client if none
            split_candidates = [tr_time for tr_time in transaction_times if tr_time > valid_time]
            if not split_candidates:
                continue

            # random split point -> history strictly before it, target at it
            split_time = random.choice(split_candidates)
            query_trans_history = [
                tr for tr, tr_time in zip(sorted_transactions, transaction_times)
                if tr_time < split_time
            ]
            # first transaction (in sorted order) that happened exactly at split_time
            _next_transaction = next(
                tr for tr, tr_time in zip(sorted_transactions, transaction_times)
                if tr_time == split_time
            )

            next_transaction = {
                'datetime': _next_transaction['datetime'],
                'store_id': _next_transaction['store_id'],
                'purchase_sum': _next_transaction['purchase_sum'],
                'product_ids': [pr['product_id'] for pr in _next_transaction['products']],
            }
            query = {
                'client_id': client_id,
                'transaction_history': query_trans_history,
            }

            valid_file.write(json.dumps(query) + '\t' + json.dumps(next_transaction) + '\n')

146it [00:00, 1458.27it/s]

['../_processed_data/client_tr_history_4.tsv', '../_processed_data/client_tr_history_0.tsv', '../_processed_data/client_tr_history_15.tsv', '../_processed_data/client_tr_history_16.tsv', '../_processed_data/client_tr_history_6.tsv']


20000it [00:11, 1703.76it/s]
20000it [00:11, 1757.62it/s]
20000it [00:11, 1697.25it/s]
20000it [00:11, 1700.66it/s]
20000it [00:11, 1689.57it/s]

CPU times: user 57.5 s, sys: 1.33 s, total: 58.8 s
Wall time: 58.5 s





## Выделяем клиентов для валидации check_queries_02_01

- Подходят те, кто совершал транзакции после 2019-03-01
- При этом каждая транзакция после 2019-03-01 может быть использована как валидационная

In [2]:
def dt_format_change(dt):
    """
    Re-format a timestamp string from 'YYYY-MM-DD HH:MM:SS' to
    'YYYY-MM-DDTHH:MM:SS' (space separator replaced by 'T').

    dt: str in '%Y-%m-%d %H:%M:%S'
    returns: str in '%Y-%m-%dT%H:%M:%S'
    raises: ValueError (from datetime.strptime) if dt does not match the
            input format.
    """
    source_format = '%Y-%m-%d %H:%M:%S'
    target_format = '%Y-%m-%dT%H:%M:%S'
    parsed = datetime.strptime(dt, source_format)
    return parsed.strftime(target_format)

# Client ids to keep out of the check_queries_02_01 files: the generation
# loop skips every client whose id appears in this list.
check_clients = [
    '001e840150', '00078c508d', '0008b2cb41', '00035a21d9', '0015aa77ce',
    '00184e8b0a', '000f3b9860', '001c2b565f', '000b0559be', '0016b0d9ad',
    '000a400848', '000940f00a', '000d599743', '00211fcfaa', '002283ef29',
    '001b8d6788', '00071890c8', '0007b4ca21', '000ddb6229', '0006b9ad75',
    '00042a927a', '00213be6fb', '0012d1d4aa', '00209f873d', '00184aab1b',
    '000702109b', '00031cbbe6', '001175d51b', '000bc820f6', '001c8984f0',
    '0010f1f8ca', '0009e6bafa', '000c049a1a', '0007667c60', '000ca87889',
    '00020e7b18', '001fb70769', '001566f916', '00010925a5', '001de90d21',
    '000efde438', '001392b297', '00134e091b', '0001f552b0', '00167a61e2',
    '000df9078a', '0018d2efac', '00047b3720', '00083b5b14', '0021e07838',
    '000c216adb', '001a2412c6', '0004315e57', '0019a16b6b', '000990be82',
    '0019fb86cb', '00184f3b10', '0010082ab3', '001dac232d', '0019ca361b',
    '0017fdd057', '000b9905d8', '0006fca4bf', '00140e5d34', '001d642f66',
    '001cef2991', '000bf8ff33', '00127b29bb', '0019e0f07d', '001c25b9e3',
    '000a9d12ff', '001f46aa2c', '000b45b7ac', '0018650c30', '0008244fb3',
    '0002ce2217', '00174b3561', '0004254599', '00068fd5dc', '001162084a',
    '000220a0a7', '000a00419c', '000bd5f2f1', '000012768d', '00022fd34f',
    '0018dea0ba', '0004e1e14e', '001d004e5e', '0004231e2a', '000bc94494',
    '00184df0c9', '000036f903', '0006f24465', '000ac12729', '0013c0cbab',
    '00177cee3e', '0020f90a83', '00065f11c7', '000f46bbfc', '00038f9200',
    '0017a7ebcb']
# Print the number of listed ids as a quick size check.
print(len(check_clients))

101


In [3]:
%%time
# Build validation queries (check_queries_02_01_<chunk>.tsv).
# Same procedure as for check_queries_01_11, with three differences:
#   * clients listed in `check_clients` are skipped,
#   * timestamps are written in '%Y-%m-%dT%H:%M:%S' form,
#   * every query carries a 'query_time' equal to the target transaction time.
seed = 7777
num_chunks_valid = 5
dt_format = '%Y-%m-%d %H:%M:%S'
valid_time = datetime.strptime('2019-03-01 00:00:00', dt_format)

# take several chunks only for validation (fixed list of chunk files)
random.seed(seed)
valid_chunks = [
    '../_processed_data/client_tr_history_4.tsv',
    '../_processed_data/client_tr_history_0.tsv',
    '../_processed_data/client_tr_history_15.tsv',
    '../_processed_data/client_tr_history_16.tsv',
    '../_processed_data/client_tr_history_6.tsv',
]

# set membership is O(1); the list itself is scanned on every `in` test
excluded_clients = set(check_clients)

for chunk_file in valid_chunks:
    ind = chunk_file.split('_')[-1].split('.')[0]
    valid_file_name = '../_processed_data/check_queries_02_01_{num}.tsv'.format(num=ind)

    # The output is opened once per chunk and closed by the context manager
    # even on errors. 'w' (not 'a') makes re-running the cell rewrite the
    # file instead of appending duplicate queries.
    with open(chunk_file, 'r') as chunk, open(valid_file_name, 'w') as valid_file:
        for row in tqdm(chunk):
            raw_client_id, raw_history = row.split('\t')
            client_id = json.loads(raw_client_id)
            transaction_history = json.loads(raw_history)

            # drop clients from check file (must happen before any random
            # call so the seeded sequence of choices stays the same)
            if client_id in excluded_clients:
                continue

            # tr history: dict -> list sorted by the datetime string
            sorted_transactions = sorted(transaction_history.values(), key=lambda tr: tr['datetime'])
            # parse every timestamp exactly once, aligned with sorted_transactions
            transaction_times = [
                datetime.strptime(tr['datetime'], dt_format) for tr in sorted_transactions
            ]

            # candidates for the validation transaction; skip client if none
            split_candidates = [tr_time for tr_time in transaction_times if tr_time > valid_time]
            if not split_candidates:
                continue

            # random split point -> history strictly before it, target at it
            split_time = random.choice(split_candidates)
            query_trans_history = [
                tr for tr, tr_time in zip(sorted_transactions, transaction_times)
                if tr_time < split_time
            ]
            # first transaction (in sorted order) that happened exactly at split_time
            _next_transaction = next(
                tr for tr, tr_time in zip(sorted_transactions, transaction_times)
                if tr_time == split_time
            )

            # change format of dt; the target transaction is never part of the
            # history (strict '<' above), so its own datetime is still in the
            # original format when it is converted below
            for tr in query_trans_history:
                tr['datetime'] = dt_format_change(tr['datetime'])
            query_time = dt_format_change(_next_transaction['datetime'])

            next_transaction = {
                'datetime': query_time,
                'store_id': _next_transaction['store_id'],
                'purchase_sum': _next_transaction['purchase_sum'],
                'product_ids': [pr['product_id'] for pr in _next_transaction['products']],
            }
            query = {
                'client_id': client_id,
                'query_time': query_time,
                'transaction_history': query_trans_history,
            }

            valid_file.write(json.dumps(query) + '\t' + json.dumps(next_transaction) + '\n')

20000it [00:14, 1339.45it/s]
20000it [00:14, 1353.72it/s]
20000it [00:14, 1363.03it/s]
20000it [00:14, 1342.82it/s]
20000it [00:14, 1352.33it/s]

CPU times: user 1min 13s, sys: 1.28 s, total: 1min 15s
Wall time: 1min 14s



