In [1]:
import gzip
import json
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm

In [2]:
test_df = pd.read_csv('..\\input\\mlboot_test.tsv', index_col=0)
train_df = pd.read_csv('..\\input\\mlboot_train_answers.tsv', delimiter='\t', index_col=0)
train_index = train_df.index

# Relabel the target for VW: class 0 becomes -1.
# Assign through .loc so only the 'target' column is touched; a bare
# boolean-mask assignment would overwrite every column of the matching rows.
train_df.loc[train_df['target'] == 0, 'target'] = -1

In [3]:
# Preview the first two rows of the test frame.
test_df.head(2)

888b238b4d14c03173baa375a739f6bc
ac4b8244f3ae82df511b002257473c11


In [4]:
# Preview the first two rows of the train frame after relabelling the target.
train_df.head(2)

Unnamed: 0_level_0,target
cuid,Unnamed: 1_level_1
dc28aafafcfa71c0bbfa2a5724af6061,-1
a4e12f1cbce08d16f3bb0672cef5193c,-1


# Создание датасета

Подготовка датасета формата Vowpal Wabbit.

In [5]:
def c_parse(counter_row):
    """Convert a counter1/2/3 JSON-like string into VW `token:value` features.

    The row is rewritten with plain string replacement (no JSON decoding):
    the outer braces and all double quotes are dropped, every count gets a
    '.0' suffix and pairs are separated by single spaces.

    Input:
    1. counter_row = '{"809001":2,"848545":2,"565828":1,"490363":1}'
    2. counter_row = '{}'

    Output (note the trailing space on a non-empty feature string):
    1. ('809001:2.0 848545:2.0 565828:1.0 490363:1.0 ', [2, 2, 1, 1])
    2. ('', [])
    """
    body = counter_row[1:-1].replace('"', '')
    if not body:
        return '', []

    # Each ',' separates two pairs: close the left value with '.0' and insert
    # a space; the last value has no comma after it and is closed explicitly.
    vw_tokens = body.replace(',', '.0 ') + '.0'

    counts = []
    for pair in vw_tokens.split(' '):
        value_text = pair.split(':')[1]
        counts.append(int(value_text[:-2]))  # strip the '.0' suffix again

    return vw_tokens + ' ', counts


def c_parse2(counter_row):
    """Convert a counter1/2/3 JSON string into VW features via json.loads.

    Same contract as c_parse(), but the row is decoded into a dict first.
    According to the original author's note this is about 1.5x slower than
    the string-replacement version.

    Args:
        counter_row: JSON object text, e.g. '{"809001":2,"848545":2}' or '{}'.

    Returns:
        A tuple (features, values):
        - features: 'token:value.0 ' pairs joined by spaces with a trailing
          space, or '' for an empty object;
        - values: list of the counts in the same order, or [] when empty.
    """
    d = json.loads(counter_row)
    if not d:
        return '', []

    c_edit_row = ' '.join(f'{token}:{val}.0' for token, val in d.items()) + ' '

    # Return a real list (not a dict view) so the result matches c_parse()
    # and works with helpers that expect a sequence.
    return c_edit_row, list(d.values())


def cat_f(cat_row, val=0):
    """Format a numeric categorical value as a VW float string.

    `cat_row` is converted with int(), shifted by `val` (0 by default, i.e.
    no shift) and rendered with a '.0' suffix, e.g. cat_f('5', 1) -> '6.0'.
    """
    shifted = int(cat_row) + val
    return f"{shifted}.0"

### Создание дополнительных признаков

Дополнительные признаки из токенов (count, sum и проч).

In [6]:
def numc(c_vals):
    """Number of values in `c_vals`, formatted as 'N.0'."""
    count = len(c_vals)
    return f"{count}.0"


def sumc(c_vals):
    """Sum of the values in `c_vals`, formatted as 'N.0' ('0.0' when empty)."""
    total = sum(c_vals)
    return f"{total}.0"


def meanc(c_vals):
    """Mean of `c_vals` rounded to one decimal, as a string ('0.0' when empty)."""
    if not c_vals:
        return '0.0'
    average = np.mean(c_vals)
    return str(np.round(average, 1))


def minc(c_vals):
    """Smallest value in `c_vals`, formatted as 'N.0' ('0.0' when empty)."""
    return f"{min(c_vals)}.0" if c_vals else '0.0'


def maxc(c_vals):
    """Largest value in `c_vals`, formatted as 'N.0' ('0.0' when empty)."""
    return f"{max(c_vals)}.0" if c_vals else '0.0'

### Создание строки формата VW
Формат строки:
- метка класса (label): -1 или 1
- |c catf:value (прибавить единицу)
- |f token:value (counter first)
- |s token:value (counter second)
- |t token:value (counter third)
- |d diff:value (значение day; в текущем коде записывается как есть, единица не прибавляется)

Дополнительные признаки, что можно сделать:
- Количество токенов для каждого counter: |n nucf:value nucs:value nuct:value
- Сумма посещений токенов (сумма values): |p sucf:value sucs:value suct:value
- Среднее посещений токенов: |a mecf:value mecs:value mect:value
- Минимальное посещение токенов: |b micf:value mics:value mict:value
- Максимальное посещение токенов: |m macf:value macs:value mact:value

In [7]:
def to_vw(label, tags, cat_feature, counter1, counter2, counter3, day):
    """Build one Vowpal Wabbit example line (terminated by a newline).

    Layout of the produced line:
    - '<label> <tag>' where the tag is the `tags` items joined with '_'
      (a non-list `tags` is treated as a single item);
    - '|c catf:' with cat_feature shifted by +1;
    - '|f', '|s', '|t' with the tokens of counter1/2/3, each namespace being
      omitted entirely when its counter is empty;
    - '|d diff:' with day (no shift is applied);
    - '|n', '|p', '|a', '|b', '|m' with count, sum, mean, min and max of the
      counter values for the three counters.
    """
    first_tokens, first_counts = c_parse(counter1)
    second_tokens, second_counts = c_parse(counter2)
    third_tokens, third_counts = c_parse(counter3)

    # Token namespaces are only written for non-empty counters.
    first_ns = f"|f {first_tokens}" if first_tokens else ""
    second_ns = f"|s {second_tokens}" if second_tokens else ""
    third_ns = f"|t {third_tokens}" if third_tokens else ""

    tag_items = tags if isinstance(tags, list) else [tags]
    tag = '_'.join(tag_items)

    pieces = [
        f"{label} {tag}",
        f"|c catf:{cat_f(cat_feature, 1)} ",
        first_ns,
        second_ns,
        third_ns,
        f"|d diff:{cat_f(day)} ",
        f"|n nucf:{numc(first_counts)} nucs:{numc(second_counts)} nuct:{numc(third_counts)} ",
        f"|p sucf:{sumc(first_counts)} sucs:{sumc(second_counts)} suct:{sumc(third_counts)} ",
        f"|a mecf:{meanc(first_counts)} mecs:{meanc(second_counts)} mect:{meanc(third_counts)} ",
        f"|b micf:{minc(first_counts)} mics:{minc(second_counts)} mict:{minc(third_counts)} ",
        f"|m macf:{maxc(first_counts)} macs:{maxc(second_counts)} mact:{maxc(third_counts)}",
        "\n",
    ]
    return ''.join(pieces)

### Создание файлов для VW
##### Проверка функций создания строк для VW

In [8]:
# Smoke test: convert only the first record of the raw data.
with gzip.open('..\\input\\mlboot_data.tsv.gz', 'rt') as f:
    # next() reads exactly one line and raises StopIteration on an empty file,
    # instead of silently leaving `row` undefined (or stale from an earlier run).
    row = next(f)

cuid, cat_feature, counter1, counter2, counter3, day = row.split()
label = 1
tags = [f'day{day}', cat_feature, cuid]
to_vw(label, tags, cat_feature, counter1, counter2, counter3, day)

'1 day39_5_00000d2994b6df9239901389031acaac|c catf:6.0 |f 809001:2.0 848545:2.0 565828:1.0 490363:1.0 |s 85789:1.0 238490:1.0 32285:1.0 103987:1.0 16507:2.0 6477:1.0 92797:2.0 |d diff:39.0 |n nucf:4.0 nucs:7.0 nuct:0.0 |p sucf:6.0 sucs:9.0 suct:0.0 |a mecf:1.5 mecs:1.3 mect:0.0 |b micf:1.0 mics:1.0 mict:0.0 |m macf:2.0 macs:2.0 mact:0.0\n'

In [9]:
%%time

niter = 0
input_gzip = '..\\input\\mlboot_data.tsv.gz'

output_dataset = '..\\vw\\dataset.vw'
output_dataset_cuid = '..\\vw\\dataset_cuid.vw'

output_test = '..\\vw\\dataset_test.vw'
output_test_cuid = '..\\vw\\dataset_cuid_test.vw'

with gzip.open(input_gzip, 'rt') as input_data, \
    open(output_dataset, 'w') as dataset, open(output_dataset_cuid, 'w') as dataset_cuid, \
    open(output_test, 'w') as test, open(output_test_cuid, 'w') as test_cuid:

    for row in tqdm(input_data):
        cuid, cat_feature, counter1, counter2, counter3, day = row.split()
        tags = [f'day{day}', cat_feature, cuid]
        
        if cuid in train_index:
            label = train_df['target'][cuid]
            output_row = to_vw(label, tags, cat_feature, counter1, counter2, counter3, day)
            dataset.write(output_row)
            
            # Записать cuid
            dataset_cuid.write(f'{cuid}\t{cat_feature}\t{day}\n')
        else:
            label = '1'
            output_row = to_vw(label, tags, cat_feature, counter1, counter2, counter3, day)
            test.write(output_row)
            
            # Записать cuid
            test_cuid.write(f'{cuid}\t{cat_feature}\t{day}\n')

#         niter += 1
        
#         if niter > 1000:
#             break

19528597it [44:45, 7272.67it/s]


Wall time: 44min 45s


# Разделить выборку на train / valid

In [2]:
# Reload the answers without index_col: the frame gets a default integer
# index and cuid stays a regular column.
train_df = pd.read_csv('..\\input\\mlboot_train_answers.tsv', sep='\t')
train_df.head(2)

Unnamed: 0,cuid,target
0,dc28aafafcfa71c0bbfa2a5724af6061,0
1,a4e12f1cbce08d16f3bb0672cef5193c,0


#### Индексы разделения

In [3]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=17)

# n_splits=1, so the split generator yields exactly one (train, valid) pair of
# row-position arrays; take it directly instead of looping.
train_index, valid_index = next(sss.split(train_df, train_df['target']))
print("TRAIN:", train_index, "TEST:", valid_index)

TRAIN: [134680 141806 199462 ... 287909 267837 309817] TEST: [ 49559 232377  91810 ... 263939 311760 367063]


In [5]:
# Flag column: 1 = row belongs to the train part, 0 = validation part.
train_df['train'] = 0

# train_index contains row positions produced by sss.split, so select rows by
# position (.iloc). Using .loc only works while the index is the default
# RangeIndex and fails once the index has been replaced by cuid (e.g. when
# this cell is re-run).
train_df.iloc[train_index, train_df.columns.get_loc('train')] = 1

# Index by cuid so later cells can look rows up by user id.
train_df.index = train_df['cuid']
train_df.head(2)

Unnamed: 0_level_0,cuid,target,train
cuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dc28aafafcfa71c0bbfa2a5724af6061,dc28aafafcfa71c0bbfa2a5724af6061,0,1
a4e12f1cbce08d16f3bb0672cef5193c,a4e12f1cbce08d16f3bb0672cef5193c,0,1


In [6]:
dataset_file = '..\\vw\\dataset.vw'
train_file = '..\\vw\\dataset_train.vw'
valid_file = '..\\vw\\dataset_valid.vw'

# Build the cuid -> train flag mapping once: a dict lookup per row is much
# cheaper than indexing the pandas Series for each of the millions of lines.
train_flag_by_cuid = train_df['train'].to_dict()

with open(dataset_file, 'r') as d_file, open(train_file, 'w') as t_file, open(valid_file, 'w') as v_file:
    for row in tqdm(d_file):
        # The text before the first '|' is '<label> <tag>' and the tag ends
        # with '_<cuid>'. maxsplit=1 avoids splitting the whole feature list.
        cuid = row.split('|', 1)[0].split()[1].split('_')[-1]

        # An unknown cuid still raises KeyError, as before.
        if train_flag_by_cuid[cuid] == 1:
            t_file.write(row)
        else:
            v_file.write(row)

12874345it [08:17, 25873.31it/s]


# Перемешать данные (WARNING - High Memory Usage)

In [8]:
def shuffle_data(input_file, output_file, seed=None):
    """Shuffle the lines of a text file and write them to another file.

    The whole input is loaded into memory, so memory usage grows with the
    file size.

    Args:
        input_file: path of the file to read.
        output_file: path of the file to write (overwritten if it exists).
        seed: optional seed for a reproducible shuffle. When None (default)
            the module-level `random` state is used, as before.
    """
    # Context managers guarantee both handles are closed (and the output is
    # flushed) even if reading, shuffling or writing fails.
    with open(input_file) as source:
        lines = source.readlines()

    if seed is None:
        random.shuffle(lines)
    else:
        # Dedicated generator: reproducible without touching global state.
        random.Random(seed).shuffle(lines)

    with open(output_file, 'w') as target:
        target.writelines(lines)

    print(f'Data shuffle completed: {input_file}. Saved to {output_file}')

In [51]:
def shuffle_data2(input_file, output_file):
    """Shuffle a file's lines by sorting them on random keys.

    Each line is paired with a random.random() key and the pairs are sorted.
    Per the original author's note this is about twice as slow as
    shuffle_data().
    """
    keyed_lines = []
    with open(input_file, 'r') as source:
        for line in source:
            keyed_lines.append((random.random(), line))

    ordered = sorted(keyed_lines)

    with open(output_file, 'w') as target:
        target.writelines(line for _, line in ordered)

    print(f'Data shuffle completed: {input_file}. Saved to {output_file}')

In [10]:
%%time

# Full (unsplit) dataset
shuffle_data('..\\vw\\dataset.vw', '..\\vw\\dataset_shuffled.vw')

Data shuffle completed: ..\vw\dataset.vw. Saved to ..\vw\dataset_shuffled.vw


In [9]:
%%time

# train
shuffle_data('..\\vw\\dataset_train.vw', '..\\vw\\dataset_train_shuffled.vw')

# valid
shuffle_data('..\\vw\\dataset_valid.vw', '..\\vw\\dataset_valid_shuffled.vw')

Data shuffle completed: ..\vw\dataset_train.vw. Saved to ..\vw\dataset_train_shuffled.vw
Data shuffle completed: ..\vw\dataset_valid.vw. Saved to ..\vw\dataset_valid_shuffled.vw
