In [None]:
import collections
import csv
import subprocess
import sys
from datetime import datetime

from utils.common import *

# Configuration

In [None]:
# --- Raw input CSVs ---------------------------------------------------------
RAW_TRAIN_PATH = "../tr.r0.csv"
RAW_VAL_PATH = "../va.r0.csv"

# --- Output of the pre-processing step (raw columns + count/history columns) -
PROCESSED_TRAIN_PATH = "processed_train.csv"
PROCESSED_VAL_PATH = "processed_val.csv"

# --- Output of the hashing step ---------------------------------------------
HASHED_TRAIN_PATH = "hashed_train.csv"
HASHED_VAL_PATH = "hashed_val.csv"

# --- Training knobs ----------------------------------------------------------
ROWS_FOR_TRAINING = -1  # -1 means "train with the entire data"
LEARNING_RATE = 0.03    # forwarded to the trainer as `-r`
EPOCHS = 13             # forwarded to the trainer as `-t`

# Columns copied straight from the raw CSVs.
# 'pub_id', 'pub_domain', 'pub_category' were removed; the separate
# app_* and site_* id/domain/category columns are used instead.
FIELDS = [
    'id', 'click', 'hour', 'banner_pos',
    'device_id', 'device_ip', 'device_model', 'device_conn_type',
    'C14', 'C17', 'C20', 'C21',
    'app_id', 'app_domain', 'app_category',
    'site_id', 'site_domain', 'site_category',
]

# Full header of the processed CSVs: raw columns followed by derived ones.
# NOTE: 'user_click_histroy' is misspelled on purpose here -- it is the actual
# column name written to disk, so it must not be "corrected" in isolation.
NEW_FIELDS = FIELDS + [
    'device_id_count',
    'device_ip_count',
    'user_count',
    'smooth_user_hour_count',
    'user_click_histroy',
]

# Calculate counting features

In [None]:
def scan(path, is_train):
    '''
    Tally occurrence counts over one CSV file (adapted from base/util/gen_data.py).

    For every row read, increments four module-level counters, which must
    already exist when this function is called:
      * id_cnt[device_id]
      * ip_cnt[device_ip]
      * user_cnt[user]
      * user_hour_cnt[user + '-' + hour]
    where `user` is whatever def_user(row) returns (def_user is not defined in
    this notebook; presumably it comes from `from utils.common import *`).

    path     -- CSV file with a header row containing at least 'device_id',
                'device_ip' and 'hour'.
    is_train -- when true, the row cap ROWS_FOR_TRAINING is applied; when
                false, the whole file is scanned.

    Returns None; the result is the mutation of the counters above.
    '''
    # -1 is the documented "use everything" sentinel.  Handle it here so that
    # passing is_train=True with ROWS_FOR_TRAINING == -1 scans the whole file
    # instead of stopping before the first row (1 >= -1).
    row_cap = ROWS_FOR_TRAINING if (is_train and ROWS_FOR_TRAINING != -1) else None

    # `with` guarantees the input handle is closed even if a row raises.
    with open(path) as src:
        for i, row in enumerate(csv.DictReader(src), start=1):
            # NOTE(review): i is 1-based and the comparison is `>=`, so
            # ROWS_FOR_TRAINING - 1 rows are consumed.  Left unchanged on
            # purpose so the cut-off stays identical to gen_data()'s.
            if row_cap is not None and i >= row_cap:
                break
            user = def_user(row)
            id_cnt[row['device_id']] += 1
            ip_cnt[row['device_ip']] += 1
            user_cnt[user] += 1
            user_hour_cnt[user+'-'+row['hour']] += 1

In [None]:
start = datetime.now()
print('Start: {0}'.format(str(start)))

# Four independent, zero-defaulting tallies that scan() increments in place.
id_cnt, ip_cnt, user_cnt, user_hour_cnt = (
    collections.defaultdict(int) for _ in range(4)
)

# Training file: the row cap applies only when ROWS_FOR_TRAINING is not -1.
# Validation file: always scanned without a cap.
scan(RAW_TRAIN_PATH, is_train=(ROWS_FOR_TRAINING != -1))
scan(RAW_VAL_PATH, is_train=False)

print('End: {0}, Elapsed time: {1}'.format(str(datetime.now()), str(datetime.now() - start)))

# Add counting features & history features to new csv file 

In [None]:
def gen_data(src_path, dst_path, is_train):
    '''
    copy from base/util/gen_data.py
    '''
    reader = csv.DictReader(open(src_path))
    writer = csv.DictWriter(open(dst_path, 'w'), NEW_FIELDS)
    writer.writeheader()

    for i, row in enumerate(reader, start=1):
        if i >= ROWS_FOR_TRAINING and is_train:
            break
        new_row = {}
        for field in FIELDS:
            new_row[field] = row[field]

        new_row['device_id_count'] = id_cnt[row['device_id']]
        new_row['device_ip_count'] = ip_cnt[row['device_ip']]

        user, hour = def_user(row), row['hour']
        new_row['user_count'] = user_cnt[user]
        new_row['smooth_user_hour_count'] = str(user_hour_cnt[user+'-'+hour])

        if has_id_info(row):

            if history[user]['prev_hour'] != row['hour']:
                history[user]['history'] = (history[user]['history'] + history[user]['buffer'])[-4:]
                history[user]['buffer'] = ''
                history[user]['prev_hour'] = row['hour']

            new_row['user_click_histroy'] = history[user]['history']

            if is_train:
                history[user]['buffer'] += row['click']
        else:
            new_row['user_click_histroy'] = ''
            
        writer.writerow(new_row)

In [None]:
start = datetime.now()
print('Start: {0}'.format(str(start)))

history = collections.defaultdict(lambda: {'history': '', 'buffer': '', 'prev_hour': ''})

gen_data(src_path=RAW_TRAIN_PATH, dst_path=PROCESSED_TRAIN_PATH,is_train=ROWS_FOR_TRAINING != -1)
gen_data(src_path=RAW_VAL_PATH, dst_path=PROCESSED_VAL_PATH, is_train=False)

print('End: {0}, Elapsed time: {1}'.format(
        str(datetime.now()),
        str(datetime.now() - start))
     )

# Parallelized hashing: hash PROCESSED_TRAIN/VAL_PATH and save the result to HASHED_TRAIN/VAL_PATH

In [None]:
# Passed as `nr_thread` to split(), parallel_convert(), delete() and cat() in
# the cells below; per the comments there, each CSV is split into this many
# part files, which are hashed in parallel and then merged back.
nr_thread = 12

In [None]:
# Break each processed CSV (train first, then validation) into nr_thread part files.
for processed_csv in (PROCESSED_TRAIN_PATH, PROCESSED_VAL_PATH):
    split(path=processed_csv, nr_thread=nr_thread)

In [None]:
# Hash the split part files in parallel via utils/2.py, producing nr_thread
# hashed part files.  Argument order is: processed train, processed val,
# hashed train, hashed val.
hash_sources = [PROCESSED_TRAIN_PATH, PROCESSED_VAL_PATH]
hash_targets = [HASHED_TRAIN_PATH, HASHED_VAL_PATH]
parallel_convert("utils/2.py", hash_sources + hash_targets, nr_thread)

In [None]:
hashed_csvs = (HASHED_TRAIN_PATH, HASHED_VAL_PATH)

# 1) The split processed part files are no longer needed once hashing is done.
for processed_csv in (PROCESSED_TRAIN_PATH, PROCESSED_VAL_PATH):
    delete(processed_csv, nr_thread)

# 2) Merge the nr_thread hashed part files into a single file per data set.
for hashed_csv in hashed_csvs:
    cat(hashed_csv, nr_thread)

# 3) Only after merging, drop the hashed part files.
for hashed_csv in hashed_csvs:
    delete(hashed_csv, nr_thread)

In [None]:
no_thread = 1 # number of thread for training

# Build the argument vector as a list rather than splitting a formatted
# string: str.split() would break any path containing whitespace into several
# arguments.  Argument order: -r learning rate, -s threads, -t epochs, then the
# validation file followed by the training file.
train_argv = [
    '../base/mark1',
    '-r', str(LEARNING_RATE),
    '-s', str(no_thread),
    '-t', str(EPOCHS),
    HASHED_VAL_PATH,
    HASHED_TRAIN_PATH,
]

# Human-readable form of the command; same text the original `cmd` held.
cmd = ' '.join(train_argv)

# Last expression of the cell, so the trainer's exit code is shown as the
# cell output (non-zero means the run failed).
subprocess.call(train_argv)