In [116]:
import gzip

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import sparse

In [2]:
# NOTE(review): the test file is read with the default comma delimiter and
# default header handling — confirm it is a single-column file with a header.
test_df = pd.read_csv('..\\input\\mlboot_test.tsv', index_col=0)
train_df = pd.read_csv('..\\input\\mlboot_train_answers.tsv', delimiter='\t', index_col=0)
train_index = train_df.index

# Relabel the negative class from 0 to -1 (the label convention used for vw).
# Use .loc with an explicit column so only 'target' is rewritten, even if the
# frame ever carries more columns than the label.
train_df.loc[train_df['target'] == 0, 'target'] = -1

In [3]:
# Preview the first two rows of the test frame.
test_df.head(2)

888b238b4d14c03173baa375a739f6bc
ac4b8244f3ae82df511b002257473c11


In [4]:
# Preview the first two rows of the train frame (targets already relabelled).
train_df.head(2)

Unnamed: 0_level_0,target
cuid,Unnamed: 1_level_1
dc28aafafcfa71c0bbfa2a5724af6061,-1
a4e12f1cbce08d16f3bb0672cef5193c,-1


# Создание датасетов для дальнейшего преобразования данных в sparse

Идея:
- Объединить все признаки одинаковых cuid в одну строку
- Признаки одной категории сгруппировать последовательно в разные файлы

Полученные данные для train:
- dataset_s_cat.csv - категориальные признаки
- dataset_s_counter.csv - токены, преобразованные из исходных dict к виду 809001 809001 848545 848545 565828 490363 ...
- dataset_s_cuid.csv - список cuids
- dataset_s_day.csv - номера дней, пример строки: 39 43 45 46 48 53 55
- dataset_s_labels.csv - метки классов

Полученные данные для test (аналогичны данным для train, за исключением файла с метками классов):
- dataset_s_test_cat.csv - категориальные признаки
- dataset_s_test_counter.csv - токены, преобразованные из исходных dict к виду 809001 809001 848545 848545 565828 490363 ...
- dataset_s_test_cuid.csv - список cuids
- dataset_s_test_day.csv - номера дней, пример строки: 39 43 45 46 48 53 55

Количество строк в каждом из полученных файлов равно количеству уникальных cuid в соответствующей выборке (train или test)

In [55]:
def c_parse(counter_row):
    """Expand one serialized counter dict into repeated-token strings.

    The outer braces and all double quotes are stripped from ``counter_row``,
    the remainder is split on commas into ``key:count`` pairs, and every key
    is repeated ``count`` times.

    Args:
        counter_row: raw text of a counter column, e.g.
            ``'{"809001":2,"848545":2,"565828":1,"490363":1}'`` or ``'{}'``.

    Returns:
        A list with one string per key, the key repeated ``count`` times and
        separated by single spaces, e.g.
        ``['809001 809001', '848545 848545', '565828', '490363']``.
        An empty dict yields ``[]``. Keys whose count is not positive are
        skipped so that joining the result never produces empty tokens.

    Raises:
        ValueError: if a count is not an integer literal.
        IndexError: if a pair contains no ``:`` separator.
    """
    body = counter_row[1:-1].replace('"', '')
    if not body:
        return []

    tokens = []
    for pair in body.split(','):
        parts = pair.split(':')
        key, repeats = parts[0].strip(), int(parts[1])
        # A zero count would contribute an empty string, which turns into a
        # stray double space once the caller joins the tokens.
        if repeats > 0:
            tokens.append(' '.join([key] * repeats))
    return tokens


def cat_f(cat_row, val=0):
    """Return the categorical value as an integer shifted by ``val``, rendered as a string."""
    shifted = int(cat_row) + val
    return str(shifted)

#### Проверка функций

In [59]:
# Sanity check: read only the first line of the gzipped input and run the
# counter parser on it.
with gzip.open('..\\input\\mlboot_data.tsv.gz', 'rt') as f:
    for row in f:
        break
# The unpacking below expects exactly six whitespace-separated fields per row.
cuid, cat_feature, counter1, counter2, counter3, day = row.split()
c1, c2, c3 = c_parse(counter1), c_parse(counter2), c_parse(counter3)

In [62]:
# Concatenate the three parsed counters and join them into one space-separated string.
' '.join(c1 + c2 + c3)

'809001 809001 848545 848545 565828 490363 85789 238490 32285 103987 16507 16507 6477 92797 92797'

## Создание файлов

In [None]:
def to_vw(cat_feature, counter1, counter2, counter3, day):
    """Convert the fields of one input row into three output strings.

    Returns a tuple ``(cat, counter, day)`` where ``cat`` and ``day`` are the
    string forms of ``cat_feature`` and ``day``, and ``counter`` is the
    space-joined concatenation of the tokens that ``c_parse`` produces for
    ``counter1``, ``counter2`` and ``counter3`` (in that order).
    """
    tokens = []
    for raw_counter in (counter1, counter2, counter3):
        tokens.extend(c_parse(raw_counter))

    return str(cat_feature), ' '.join(tokens), str(day)

##### Проверка функций создания строк для VW

In [66]:
# Sanity check: read only the first line of the gzipped input and run it
# through to_vw to inspect the produced strings.
with gzip.open('..\\input\\mlboot_data.tsv.gz', 'rt') as f:
    for row in f:
        break

# The unpacking below expects exactly six whitespace-separated fields per row.
cuid, cat_feature, counter1, counter2, counter3, day = row.split()
to_vw(cat_feature, counter1, counter2, counter3, day)

('5',
 '809001 809001 848545 848545 565828 490363 85789 238490 32285 103987 16507 16507 6477 92797 92797',
 '39')

In [78]:
# %%time

# Aggregate the raw event log into one line per cuid and split the result
# into train / test files. A new group starts whenever the cuid differs from
# the previous row's cuid, so rows of the same cuid are expected to be
# adjacent in the input.

niter = 0  # number of cuid groups started so far
input_gzip = '..\\input\\mlboot_data.tsv.gz'

output_dataset = '..\\sparse\\dataset_s_counter.csv'
output_dataset_cat = '..\\sparse\\dataset_s_cat.csv'
output_dataset_day = '..\\sparse\\dataset_s_day.csv'
output_dataset_cuid = '..\\sparse\\dataset_s_cuid.csv'
output_dataset_labels = '..\\sparse\\dataset_s_labels.csv'

output_test = '..\\sparse\\dataset_s_test_counter.csv'
output_test_cat = '..\\sparse\\dataset_s_test_cat.csv'
output_test_day = '..\\sparse\\dataset_s_test_day.csv'
output_test_cuid = '..\\sparse\\dataset_s_test_cuid.csv'

prev_cuid = ''
prev_cuid_in_train = None  # stays None until the first row has been read

# Accumulators for the cuid group currently being assembled. Defined up front
# so the final flush is safe even when the input contains no rows.
cat_full = counter_full = dayn_full = ''
label = None


def _flush_group(in_train, lines, train_files, test_files, labels_file, label):
    """Append one finished cuid group to the train or test output files.

    in_train: True routes the group to ``train_files``, False to
        ``test_files``; None means no group has been accumulated yet and
        nothing is written.
    lines: ``(counter, cat, day)`` strings, matched positionally with the
        file tuples.
    labels_file, label: the label is written only for train groups.
    """
    if in_train is None:
        return
    targets = train_files if in_train else test_files
    for out, line in zip(targets, lines):
        out.write(line + '\n')
    if in_train:
        labels_file.write(str(label) + '\n')


with gzip.open(input_gzip, 'rt') as input_data, \
    open(output_dataset, 'w') as dataset, \
    open(output_dataset_cat, 'w') as dataset_cat, \
    open(output_dataset_day, 'w') as dataset_day, \
    open(output_dataset_cuid, 'w') as dataset_cuid, \
    open(output_dataset_labels, 'w') as dataset_labels, \
    open(output_test, 'w') as test, \
    open(output_test_cat, 'w') as test_cat, \
    open(output_test_day, 'w') as test_day, \
    open(output_test_cuid, 'w') as test_cuid:

    # Order matches the (counter, cat, day) tuples passed to _flush_group.
    train_files = (dataset, dataset_cat, dataset_day)
    test_files = (test, test_cat, test_day)

    for row in tqdm(input_data):
        cuid, cat_feature, counter1, counter2, counter3, day = row.split()
        cat_feature_vw, counter, dayn_vw = to_vw(cat_feature, counter1, counter2, counter3, day)

        if cuid != prev_cuid:  # a new cuid group starts here
            niter += 1

            # Write out the group that has just ended (no-op on the first row).
            _flush_group(prev_cuid_in_train, (counter_full, cat_full, dayn_full),
                         train_files, test_files, dataset_labels, label)

            cat_full = cat_feature_vw
            counter_full = counter
            dayn_full = dayn_vw

            # The cuid itself is recorded as soon as the group starts.
            cuid_in_train = cuid in train_index
            if cuid_in_train:
                label = train_df.at[cuid, 'target']
                dataset_cuid.write(f'{cuid}\n')
            else:
                test_cuid.write(f'{cuid}\n')

            prev_cuid = cuid
            prev_cuid_in_train = cuid_in_train
        else:
            # Same cuid as the previous row: extend the accumulated lines.
            cat_full += ' ' + cat_feature_vw
            counter_full += ' ' + counter
            dayn_full += ' ' + dayn_vw

    # Write out the last group. Its cuid was already recorded when the group
    # started, so it must not be written again here; a second write would
    # leave the cuid file one line longer than the feature files.
    _flush_group(prev_cuid_in_train, (counter_full, cat_full, dayn_full),
                 train_files, test_files, dataset_labels, label)

19528597it [18:03, 18023.39it/s]
