In [1]:
from nltk.tokenize import RegexpTokenizer
import cPickle as pickle
import string
import os

%matplotlib inline
import numpy as np
import random
import time

import logging
from logging import info

from multiprocessing import Pool as ThreadPool
import itertools

In [2]:
root = logging.getLogger()
for handler in root.handlers[:]:
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHanlder
#root.addHandler(logging.StreamHandler())

In [3]:
SAMPLE_RATIO = 0.01

In [4]:
root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"

#training_file = root_location + 'docs_output_training_validation_documents_' + str(SAMPLE_RATIO)
training_file = root_location + 'docs_output.json'

doc_classifications_map_file = exports_location + "doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
# training_docs_list_file = exports_location + "training_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
# validation_docs_list_file = exports_location + "validation_documents_" + str(SAMPLE_RATIO) + "_sample.pkl"
training_docs_list_file = exports_location + "training_docs_list.pkl"
validation_docs_list_file = exports_location + "validation_docs_list.pkl"
test_docs_list_file = exports_location + "test_docs_list.pkl"

In [5]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))

CPU times: user 29.3 s, sys: 6.14 s, total: 35.4 s
Wall time: 37.5 s


In [6]:
len(training_docs_list)

1286325

In [7]:
len(training_docs_list) + len(validation_docs_list)

1607798

In [8]:
len(validation_docs_list)

321473

In [9]:
len(test_docs_list)

401877

In [15]:
MIN_SIZE = 0
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"

In [16]:
def stemtokenizer(text):
    """ MAIN FUNCTION to get clean stems out of a text. A list of clean stems are returned """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    tokens = tokenizer.tokenize(text)
    stems = []  # result
    for token in tokens:
        stem = token.lower()
        stem = stem.strip(string.punctuation)
        if stem:
            if is_number(stem):
                stem = NUMBER_INDICATOR
            elif is_currency(stem):
                stem = CURRENCY_INDICATOR
            elif is_chemical(stem):
                stem = CHEMICAL_INDICATOR
            else:
                stem = stem.strip(string.punctuation)
            if stem and len(stem) >= MIN_SIZE:
                # extract uni-grams
                stems.append(stem)
    del tokens
    return stems

def is_number(str):
    """ Returns true if given string is a number (float or int)"""
    try:
        float(str.replace(",", ""))
        return True
    except ValueError:
        return False

def is_currency(str):
    return str[0] == "$"

def is_chemical(str):
    return str.count("-") > 3

## Get Separate Training and Validation Data Files

In [11]:
BATCH_SIZE = 10000
# TRAINING_PREPROCESSED_FILES_PREFIX = "/mnt/data2/shalaby/training_docs_sample_%s_data_preprocessed-" % str(SAMPLE_RATIO)
# TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX = "/mnt/data2/shalaby/training_docs_sample_%s_docids_preprocessed-" % str(SAMPLE_RATIO)
# VALIDATION_PREPROCESSED_FILES_PREFIX = "/mnt/data2/shalaby/validation_docs_sample_%s_data_preprocessed-" % str(SAMPLE_RATIO)
# VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX = "/mnt/data2/shalaby/validation_docs_sample_%s_docids_preprocessed-" % str(SAMPLE_RATIO)

TRAINING_PREPROCESSED_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/training_docs_data_preprocessed-"
TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/training_docs_docids_preprocessed-"
VALIDATION_PREPROCESSED_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/validation_docs_data_preprocessed-"
VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/validation_docs_docids_preprocessed-"
TEST_PREPROCESSED_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/test_docs_data_preprocessed-"
TEST_PREPROCESSED_DOCIDS_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/test_docs_docids_preprocessed-"


TRAINING_MERGED_PREPROCESSED_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/training_docs_merged_data_preprocessed-"
TRAINING_MERGED_PREPROCESSED_DOCIDS_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/training_docs_merged_docids_preprocessed-"
VALIDATION_MERGED_PREPROCESSED_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/validation_docs_merged_data_preprocessed-"
VALIDATION_MERGED_PREPROCESSED_DOCIDS_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/validation_docs_merged_docids_preprocessed-"
TEST_MERGED_PREPROCESSED_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/test_docs_merged_data_preprocessed-"
TEST_MERGED_PREPROCESSED_DOCIDS_FILES_PREFIX = "/big/s/shalaby/preprocessed_data/test_docs_merged_docids_preprocessed-"

In [12]:
def write_batch(file_prefix, doc_files_prefix, batch_lines, doc_ids, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        with open(file_prefix + str(batch_start), 'w') as batch_file:
            for line in batch_lines:
                batch_file.write((u" ".join(line) + "\n").encode('utf-8'))
        pickle.dump(doc_ids, open(doc_files_prefix + str(batch_start), 'w'))


Load Line Positions

In [13]:
line_positions = pickle.load(open("/big/s/shalaby/exported_data/line_positions.pkl", "r"))

Compute Line Positions

In [9]:
%%time
line_positions = dict()
with open(training_file) as f:
    
    i = 0
    line_positions[i] = f.tell()
    line = f.readline()
    while line:
        i+=1
        if not line.strip(): continue
        line_positions[i] = f.tell()
        line = f.readline()
    del line_positions[i]

CPU times: user 3min 54s, sys: 30.6 s, total: 4min 25s
Wall time: 4min 53s


In [12]:
pickle.dump(line_positions, open("/big/s/shalaby/exported_data/line_positions.pkl", "w"))

In [14]:
len(line_positions)

2009750

# Training

This will create uneven batches of documents, we need to run the following step to make it even

In [32]:
def multithreaded_batch_creation(start_index):

    file_prefix = TRAINING_PREPROCESSED_FILES_PREFIX
    doc_file_prefix = TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX
    
    if os.path.exists(file_prefix + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    with open(training_file) as file_obj:
        file_obj.seek(line_positions[start_index])
        token_lines, doc_ids = [], []
        start_time = time.time()
        for i, line in enumerate(file_obj):
            (doc_id, text) = eval(line)
            if doc_id in training_docs_list:
                token_lines.append(stemtokenizer(text))
                doc_ids.append(doc_id)
                if len(token_lines) % 1000 == 0: info(len(token_lines))
            if i >= BATCH_SIZE:
                break
    duration = time.time() - start_time
    info("Finished batch of {:d} in {:.0f}m {:.0f}s".format(BATCH_SIZE, *divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, start_index)
    del token_lines
    del doc_ids

In [None]:
pool.close()
pool.terminate()

In [35]:
(divmod(len(line_positions), BATCH_SIZE)[0] +1) * BATCH_SIZE

2010000

In [36]:
pool = ThreadPool(10)
# +1 since range is end-exclusive
batches = range(0, (divmod(len(line_positions), BATCH_SIZE)[0] + 1) * BATCH_SIZE, BATCH_SIZE )
indices = pool.map(multithreaded_batch_creation, batches)

2017-01-03 20:05:40,516 : INFO : Batch 180000 already exists, skipping..
2017-01-03 20:05:40,517 : INFO : Batch 240000 already exists, skipping..
2017-01-03 20:05:40,517 : INFO : Batch 120000 already exists, skipping..
2017-01-03 20:05:40,517 : INFO : Batch 420000 already exists, skipping..
2017-01-03 20:05:40,517 : INFO : Batch 480000 already exists, skipping..
2017-01-03 20:05:40,521 : INFO : Batch 190000 already exists, skipping..
2017-01-03 20:05:40,522 : INFO : Batch 130000 already exists, skipping..
2017-01-03 20:05:40,522 : INFO : Batch 430000 already exists, skipping..
2017-01-03 20:05:40,517 : INFO : Batch 0 already exists, skipping..
2017-01-03 20:05:40,521 : INFO : Batch 60000 already exists, skipping..
2017-01-03 20:05:40,516 : INFO : Batch 540000 already exists, skipping..
2017-01-03 20:05:40,519 : INFO : Batch 360000 already exists, skipping..
2017-01-03 20:05:40,522 : INFO : Batch 490000 already exists, skipping..
2017-01-03 20:05:40,523 : INFO : Batch 200000 already exi

writing batch 2000000


### Join up the files we wrote to form BATCH_SIZE block files

In [50]:
curr_index = 0
curr_read_file_index = 0
curr_write_file_index = 0

read_file = None
write_file = None

def read_line():
    global read_file, curr_read_file_index
    while True:
        if read_file is None:
            if os.path.exists(TRAINING_PREPROCESSED_FILES_PREFIX + str(curr_read_file_index)):
                read_file = open(TRAINING_PREPROCESSED_FILES_PREFIX + str(curr_read_file_index), "r")
            else: 
                raise StopIteration()
        for line in read_file:
            yield line
            
        curr_read_file_index += BATCH_SIZE
        info("Reading new file for batch {}".format(curr_read_file_index))
        read_file.close()
        read_file = None

def write_line(line):
    global write_file, curr_write_file_index, curr_index
    if write_file is None:
        write_file = open(TRAINING_MERGED_PREPROCESSED_FILES_PREFIX + str(curr_write_file_index), "w")
    write_file.write(line)
    curr_index += 1
    if curr_index % BATCH_SIZE == 0:
        curr_write_file_index += BATCH_SIZE
        info("Writing to a new file for batch {}".format(curr_write_file_index))
        write_file.close()
        write_file = None
    

for line in read_line():
    write_line(line)

2017-01-04 02:25:14,695 : INFO : Reading new file for batch 10000
2017-01-04 02:25:16,752 : INFO : Writing to a new file for batch 10000
2017-01-04 02:25:27,313 : INFO : Reading new file for batch 20000
2017-01-04 02:25:30,973 : INFO : Reading new file for batch 30000
2017-01-04 02:25:31,467 : INFO : Writing to a new file for batch 20000
2017-01-04 02:25:44,787 : INFO : Reading new file for batch 40000
2017-01-04 02:25:48,578 : INFO : Writing to a new file for batch 30000
2017-01-04 02:25:59,947 : INFO : Reading new file for batch 50000
2017-01-04 02:26:04,004 : INFO : Reading new file for batch 60000
2017-01-04 02:26:05,102 : INFO : Writing to a new file for batch 40000
2017-01-04 02:26:17,433 : INFO : Reading new file for batch 70000
2017-01-04 02:26:20,341 : INFO : Writing to a new file for batch 50000
2017-01-04 02:26:31,509 : INFO : Reading new file for batch 80000
2017-01-04 02:26:36,285 : INFO : Reading new file for batch 90000
2017-01-04 02:26:37,758 : INFO : Writing to a new f

#### Rearranging Docids files

In [60]:
all_doc_ids = []

for i in range(0, 100000000, BATCH_SIZE):
    if os.path.exists(TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i)):
        doc_ids = pickle.load(open(TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i)))
        all_doc_ids.extend(doc_ids)
    else:
        break
        

for i in range(0, 100000000, BATCH_SIZE):
    if i < len(all_doc_ids):
        pickle.dump(all_doc_ids[i: i + BATCH_SIZE], open(TRAINING_MERGED_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i), "w"))
    else:
        break

In [59]:
len(all_doc_ids)

1286458

# Validation

In [63]:
def multithreaded_batch_creation(start_index):

    file_prefix = VALIDATION_PREPROCESSED_FILES_PREFIX
    doc_file_prefix = VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX
    
    if os.path.exists(file_prefix + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    with open(training_file) as file_obj:
        file_obj.seek(line_positions[start_index])
        token_lines, doc_ids = [], []
        start_time = time.time()
        for i, line in enumerate(file_obj):
            (doc_id, text) = eval(line)
            if doc_id in validation_docs_list:
                token_lines.append(stemtokenizer(text))
                doc_ids.append(doc_id)
                if len(token_lines) % 1000 == 0: info(len(token_lines))
            if i >= BATCH_SIZE:
                break
    duration = time.time() - start_time
    info("Finished batch of {:d} in {:.0f}m {:.0f}s".format(BATCH_SIZE, *divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, start_index)
    del token_lines
    del doc_ids

In [64]:
pool = ThreadPool(10)
# +1 since range is end-exclusive
batches = range(0, (divmod(len(line_positions), BATCH_SIZE)[0] + 1) * BATCH_SIZE, BATCH_SIZE )
indices = pool.map(multithreaded_batch_creation, batches)

2017-01-04 04:12:22,405 : INFO : Batch creation working on 0

2017-01-04 04:12:22,419 : INFO : Batch creation working on 300000

2017-01-04 04:12:22,406 : INFO : Batch creation working on 180000

2017-01-04 04:12:22,431 : INFO : Batch creation working on 60000

2017-01-04 04:12:22,499 : INFO : Batch creation working on 120000

2017-01-04 04:12:22,503 : INFO : Batch creation working on 420000

2017-01-04 04:12:22,507 : INFO : Batch creation working on 480000

2017-01-04 04:12:22,499 : INFO : Batch creation working on 240000

2017-01-04 04:12:22,510 : INFO : Batch creation working on 540000

2017-01-04 04:12:22,499 : INFO : Batch creation working on 360000

2017-01-04 04:19:00,955 : INFO : 1000
2017-01-04 04:19:04,784 : INFO : 1000
2017-01-04 04:19:05,780 : INFO : 1000
2017-01-04 04:19:17,581 : INFO : 1000
2017-01-04 04:19:22,408 : INFO : 1000
2017-01-04 04:19:22,974 : INFO : 1000
2017-01-04 04:19:26,641 : INFO : 1000
2017-01-04 04:19:33,416 : INFO : 1000
2017-01-04 04:21:05,734 : INFO :

writing batch 540000


2017-01-04 04:23:16,265 : INFO : Finished batch of 10000 in 10m 54s
2017-01-04 04:23:16,273 : INFO : For index 180000, the actual number of lines written is: 1556


writing batch 180000


2017-01-04 04:23:17,067 : INFO : Finished batch of 10000 in 10m 55s
2017-01-04 04:23:17,070 : INFO : For index 360000, the actual number of lines written is: 1637


writing batch 360000


2017-01-04 04:23:17,203 : INFO : Finished batch of 10000 in 10m 55s
2017-01-04 04:23:17,206 : INFO : For index 480000, the actual number of lines written is: 1634


writing batch 480000


2017-01-04 04:23:20,495 : INFO : Batch creation working on 550000

2017-01-04 04:23:21,771 : INFO : Finished batch of 10000 in 10m 59s
2017-01-04 04:23:21,773 : INFO : For index 240000, the actual number of lines written is: 1586


writing batch 240000


2017-01-04 04:23:22,104 : INFO : Batch creation working on 190000

2017-01-04 04:23:22,640 : INFO : Finished batch of 10000 in 11m 0s
2017-01-04 04:23:22,643 : INFO : For index 300000, the actual number of lines written is: 1540


writing batch 300000


2017-01-04 04:23:23,854 : INFO : Batch creation working on 490000

2017-01-04 04:23:24,300 : INFO : Batch creation working on 370000

2017-01-04 04:23:27,516 : INFO : Finished batch of 10000 in 11m 5s
2017-01-04 04:23:27,519 : INFO : For index 420000, the actual number of lines written is: 1618


writing batch 420000


2017-01-04 04:23:27,662 : INFO : Batch creation working on 250000

2017-01-04 04:23:29,120 : INFO : Batch creation working on 310000

2017-01-04 04:23:30,027 : INFO : Finished batch of 10000 in 11m 8s
2017-01-04 04:23:30,029 : INFO : For index 0, the actual number of lines written is: 1611


writing batch 0


2017-01-04 04:23:33,354 : INFO : Batch creation working on 430000

2017-01-04 04:23:35,394 : INFO : Batch creation working on 10000

2017-01-04 04:25:03,365 : INFO : Finished batch of 10000 in 12m 41s
2017-01-04 04:25:03,368 : INFO : For index 120000, the actual number of lines written is: 1597


writing batch 120000


2017-01-04 04:25:07,316 : INFO : Batch creation working on 130000

2017-01-04 04:25:14,336 : INFO : Finished batch of 10000 in 12m 52s
2017-01-04 04:25:14,347 : INFO : For index 60000, the actual number of lines written is: 1624


writing batch 60000


2017-01-04 04:25:19,098 : INFO : Batch creation working on 70000

2017-01-04 04:28:34,772 : INFO : 1000
2017-01-04 04:28:35,756 : INFO : 1000
2017-01-04 04:28:43,858 : INFO : 1000
2017-01-04 04:28:47,491 : INFO : 1000
2017-01-04 04:28:49,938 : INFO : 1000
2017-01-04 04:28:52,128 : INFO : 1000
2017-01-04 04:29:01,897 : INFO : 1000
2017-01-04 04:29:02,225 : INFO : 1000
2017-01-04 04:30:10,355 : INFO : 1000
2017-01-04 04:30:43,232 : INFO : 1000
2017-01-04 04:31:41,401 : INFO : Finished batch of 10000 in 8m 21s
2017-01-04 04:31:41,404 : INFO : For index 550000, the actual number of lines written is: 1572


writing batch 550000


2017-01-04 04:31:44,375 : INFO : Finished batch of 10000 in 8m 21s
2017-01-04 04:31:44,377 : INFO : For index 490000, the actual number of lines written is: 1549


writing batch 490000


2017-01-04 04:31:46,653 : INFO : Batch creation working on 560000

2017-01-04 04:31:48,973 : INFO : Batch creation working on 500000

2017-01-04 04:31:50,761 : INFO : Finished batch of 10000 in 8m 26s
2017-01-04 04:31:50,764 : INFO : For index 370000, the actual number of lines written is: 1681


writing batch 370000


2017-01-04 04:31:53,082 : INFO : Finished batch of 10000 in 8m 31s
2017-01-04 04:31:53,085 : INFO : For index 190000, the actual number of lines written is: 1599


writing batch 190000


2017-01-04 04:31:53,115 : INFO : Finished batch of 10000 in 8m 20s
2017-01-04 04:31:53,118 : INFO : For index 430000, the actual number of lines written is: 1619


writing batch 430000


2017-01-04 04:31:55,994 : INFO : Batch creation working on 380000

2017-01-04 04:31:58,605 : INFO : Finished batch of 10000 in 8m 29s
2017-01-04 04:31:58,610 : INFO : For index 310000, the actual number of lines written is: 1625


writing batch 310000


2017-01-04 04:31:59,268 : INFO : Batch creation working on 440000

2017-01-04 04:31:59,775 : INFO : Batch creation working on 200000

2017-01-04 04:32:04,492 : INFO : Batch creation working on 320000

2017-01-04 04:32:09,492 : INFO : Finished batch of 10000 in 8m 34s
2017-01-04 04:32:09,496 : INFO : For index 10000, the actual number of lines written is: 1575


writing batch 10000


2017-01-04 04:32:09,886 : INFO : Finished batch of 10000 in 8m 42s
2017-01-04 04:32:09,889 : INFO : For index 250000, the actual number of lines written is: 1641


writing batch 250000


2017-01-04 04:32:16,355 : INFO : Batch creation working on 20000

2017-01-04 04:32:16,527 : INFO : Batch creation working on 260000

2017-01-04 04:33:29,944 : INFO : Finished batch of 10000 in 8m 23s
2017-01-04 04:33:29,947 : INFO : For index 130000, the actual number of lines written is: 1616


writing batch 130000


2017-01-04 04:33:34,850 : INFO : Batch creation working on 140000

2017-01-04 04:33:38,516 : INFO : Finished batch of 10000 in 8m 19s
2017-01-04 04:33:38,525 : INFO : For index 70000, the actual number of lines written is: 1528


writing batch 70000


2017-01-04 04:33:42,999 : INFO : Batch creation working on 80000

2017-01-04 04:36:39,516 : INFO : 1000
2017-01-04 04:36:56,523 : INFO : 1000
2017-01-04 04:36:57,355 : INFO : 1000
2017-01-04 04:37:02,454 : INFO : 1000
2017-01-04 04:37:05,895 : INFO : 1000
2017-01-04 04:37:12,249 : INFO : 1000
2017-01-04 04:37:18,777 : INFO : 1000
2017-01-04 04:37:35,161 : INFO : 1000
2017-01-04 04:38:51,479 : INFO : 1000
2017-01-04 04:38:53,787 : INFO : 1000
2017-01-04 04:39:58,035 : INFO : Finished batch of 10000 in 8m 2s
2017-01-04 04:39:58,038 : INFO : For index 380000, the actual number of lines written is: 1629


writing batch 380000


2017-01-04 04:40:02,467 : INFO : Finished batch of 10000 in 8m 16s
2017-01-04 04:40:02,470 : INFO : For index 560000, the actual number of lines written is: 1650


writing batch 560000


2017-01-04 04:40:02,856 : INFO : Batch creation working on 390000

2017-01-04 04:40:06,260 : INFO : Finished batch of 10000 in 8m 17s
2017-01-04 04:40:06,270 : INFO : For index 500000, the actual number of lines written is: 1620


writing batch 500000


2017-01-04 04:40:07,466 : INFO : Batch creation working on 570000

2017-01-04 04:40:10,114 : INFO : Finished batch of 10000 in 8m 11s
2017-01-04 04:40:10,117 : INFO : For index 440000, the actual number of lines written is: 1594


writing batch 440000


2017-01-04 04:40:12,097 : INFO : Batch creation working on 510000

2017-01-04 04:40:14,638 : INFO : Batch creation working on 450000

2017-01-04 04:40:16,010 : INFO : Finished batch of 10000 in 8m 16s
2017-01-04 04:40:16,012 : INFO : For index 200000, the actual number of lines written is: 1650


writing batch 200000


2017-01-04 04:40:20,086 : INFO : Finished batch of 10000 in 8m 16s
2017-01-04 04:40:20,089 : INFO : For index 320000, the actual number of lines written is: 1625


writing batch 320000


2017-01-04 04:40:21,120 : INFO : Batch creation working on 210000

2017-01-04 04:40:24,998 : INFO : Batch creation working on 330000

2017-01-04 04:40:28,811 : INFO : Finished batch of 10000 in 8m 12s
2017-01-04 04:40:28,814 : INFO : For index 260000, the actual number of lines written is: 1635


writing batch 260000


2017-01-04 04:40:33,538 : INFO : Batch creation working on 270000

2017-01-04 04:40:38,425 : INFO : Finished batch of 10000 in 8m 22s
2017-01-04 04:40:38,431 : INFO : For index 20000, the actual number of lines written is: 1557


writing batch 20000


2017-01-04 04:40:43,655 : INFO : Batch creation working on 30000

2017-01-04 04:41:41,541 : INFO : Finished batch of 10000 in 8m 7s
2017-01-04 04:41:41,544 : INFO : For index 140000, the actual number of lines written is: 1537


writing batch 140000


2017-01-04 04:41:45,909 : INFO : Batch creation working on 150000

2017-01-04 04:41:59,100 : INFO : Finished batch of 10000 in 8m 16s
2017-01-04 04:41:59,102 : INFO : For index 80000, the actual number of lines written is: 1594


writing batch 80000


2017-01-04 04:42:04,261 : INFO : Batch creation working on 90000

2017-01-04 04:45:13,432 : INFO : 1000
2017-01-04 04:45:13,857 : INFO : 1000
2017-01-04 04:45:14,499 : INFO : 1000
2017-01-04 04:45:20,418 : INFO : 1000
2017-01-04 04:45:21,012 : INFO : 1000
2017-01-04 04:45:29,178 : INFO : 1000
2017-01-04 04:45:42,601 : INFO : 1000
2017-01-04 04:45:46,203 : INFO : 1000
2017-01-04 04:46:48,640 : INFO : 1000
2017-01-04 04:47:22,390 : INFO : 1000
2017-01-04 04:48:04,269 : INFO : Finished batch of 10000 in 8m 1s
2017-01-04 04:48:04,272 : INFO : For index 390000, the actual number of lines written is: 1587


writing batch 390000


2017-01-04 04:48:09,139 : INFO : Batch creation working on 400000

2017-01-04 04:48:19,249 : INFO : Finished batch of 10000 in 8m 12s
2017-01-04 04:48:19,252 : INFO : For index 570000, the actual number of lines written is: 1594


writing batch 570000


2017-01-04 04:48:23,716 : INFO : Batch creation working on 580000

2017-01-04 04:48:26,255 : INFO : Finished batch of 10000 in 8m 12s
2017-01-04 04:48:26,258 : INFO : For index 450000, the actual number of lines written is: 1609


writing batch 450000


2017-01-04 04:48:30,199 : INFO : Finished batch of 10000 in 8m 18s
2017-01-04 04:48:30,202 : INFO : For index 510000, the actual number of lines written is: 1624


writing batch 510000


2017-01-04 04:48:31,028 : INFO : Batch creation working on 460000

2017-01-04 04:48:35,093 : INFO : Finished batch of 10000 in 8m 14s
2017-01-04 04:48:35,096 : INFO : For index 210000, the actual number of lines written is: 1557


writing batch 210000


2017-01-04 04:48:35,489 : INFO : Batch creation working on 520000

2017-01-04 04:48:37,644 : INFO : Finished batch of 10000 in 8m 13s
2017-01-04 04:48:37,647 : INFO : For index 330000, the actual number of lines written is: 1660


writing batch 330000


2017-01-04 04:48:39,847 : INFO : Batch creation working on 220000

2017-01-04 04:48:41,812 : INFO : Finished batch of 10000 in 8m 8s
2017-01-04 04:48:41,814 : INFO : For index 270000, the actual number of lines written is: 1619


writing batch 270000


2017-01-04 04:48:42,354 : INFO : Batch creation working on 340000

2017-01-04 04:48:42,434 : INFO : Finished batch of 10000 in 7m 59s
2017-01-04 04:48:42,437 : INFO : For index 30000, the actual number of lines written is: 1573


writing batch 30000


2017-01-04 04:48:47,344 : INFO : Batch creation working on 280000

2017-01-04 04:48:47,393 : INFO : Batch creation working on 40000

2017-01-04 04:50:03,613 : INFO : Finished batch of 10000 in 8m 18s
2017-01-04 04:50:03,624 : INFO : For index 150000, the actual number of lines written is: 1589


writing batch 150000


2017-01-04 04:50:08,400 : INFO : Batch creation working on 160000

2017-01-04 04:50:24,855 : INFO : Finished batch of 10000 in 8m 21s
2017-01-04 04:50:24,858 : INFO : For index 90000, the actual number of lines written is: 1582


writing batch 90000


2017-01-04 04:50:30,159 : INFO : Batch creation working on 100000

2017-01-04 04:53:10,705 : INFO : 1000
2017-01-04 04:53:28,255 : INFO : 1000
2017-01-04 04:53:33,180 : INFO : 1000
2017-01-04 04:53:37,549 : INFO : 1000
2017-01-04 04:53:48,245 : INFO : 1000
2017-01-04 04:53:51,285 : INFO : 1000
2017-01-04 04:54:02,709 : INFO : 1000
2017-01-04 04:54:22,978 : INFO : 1000
2017-01-04 04:55:00,261 : INFO : 1000
2017-01-04 04:55:36,167 : INFO : 1000
2017-01-04 04:56:18,202 : INFO : Finished batch of 10000 in 8m 9s
2017-01-04 04:56:18,204 : INFO : For index 400000, the actual number of lines written is: 1620


writing batch 400000


2017-01-04 04:56:23,226 : INFO : Batch creation working on 410000

2017-01-04 04:56:37,090 : INFO : Finished batch of 10000 in 8m 6s
2017-01-04 04:56:37,101 : INFO : For index 460000, the actual number of lines written is: 1541


writing batch 460000


2017-01-04 04:56:39,502 : INFO : Finished batch of 10000 in 8m 4s
2017-01-04 04:56:39,505 : INFO : For index 520000, the actual number of lines written is: 1617


writing batch 520000


2017-01-04 04:56:39,938 : INFO : Finished batch of 10000 in 8m 0s
2017-01-04 04:56:39,941 : INFO : For index 220000, the actual number of lines written is: 1509


writing batch 220000


2017-01-04 04:56:40,407 : INFO : Finished batch of 10000 in 8m 17s
2017-01-04 04:56:40,409 : INFO : For index 580000, the actual number of lines written is: 1600


writing batch 580000


2017-01-04 04:56:41,781 : INFO : Batch creation working on 470000

2017-01-04 04:56:44,037 : INFO : Batch creation working on 530000

2017-01-04 04:56:45,864 : INFO : Batch creation working on 230000

2017-01-04 04:56:46,329 : INFO : Batch creation working on 590000

2017-01-04 04:56:51,995 : INFO : Finished batch of 10000 in 8m 10s
2017-01-04 04:56:51,997 : INFO : For index 340000, the actual number of lines written is: 1634


writing batch 340000


2017-01-04 04:56:53,798 : INFO : Finished batch of 10000 in 8m 6s
2017-01-04 04:56:54,136 : INFO : For index 40000, the actual number of lines written is: 1465


writing batch 40000


2017-01-04 04:56:56,427 : INFO : Batch creation working on 350000

2017-01-04 04:56:58,471 : INFO : Batch creation working on 50000

2017-01-04 04:57:00,429 : INFO : Finished batch of 10000 in 8m 13s
2017-01-04 04:57:00,432 : INFO : For index 280000, the actual number of lines written is: 1616


writing batch 280000


2017-01-04 04:57:05,484 : INFO : Batch creation working on 290000

2017-01-04 04:58:11,523 : INFO : Finished batch of 10000 in 8m 3s
2017-01-04 04:58:11,535 : INFO : For index 160000, the actual number of lines written is: 1591


writing batch 160000


2017-01-04 04:58:16,703 : INFO : Batch creation working on 170000

2017-01-04 04:58:46,571 : INFO : Finished batch of 10000 in 8m 16s
2017-01-04 04:58:46,574 : INFO : For index 100000, the actual number of lines written is: 1586


writing batch 100000


2017-01-04 04:58:51,299 : INFO : Batch creation working on 110000

2017-01-04 05:01:25,890 : INFO : 1000
2017-01-04 05:01:36,199 : INFO : 1000
2017-01-04 05:01:39,396 : INFO : 1000
2017-01-04 05:01:41,790 : INFO : 1000
2017-01-04 05:01:47,126 : INFO : 1000
2017-01-04 05:02:12,160 : INFO : 1000
2017-01-04 05:02:28,572 : INFO : 1000
2017-01-04 05:02:54,030 : INFO : 1000
2017-01-04 05:03:36,299 : INFO : 1000
2017-01-04 05:03:48,997 : INFO : 1000
2017-01-04 05:04:31,868 : INFO : Finished batch of 10000 in 8m 9s
2017-01-04 05:04:31,870 : INFO : For index 410000, the actual number of lines written is: 1589


writing batch 410000


2017-01-04 05:04:36,651 : INFO : Batch creation working on 600000

2017-01-04 05:04:43,441 : INFO : Finished batch of 10000 in 7m 59s
2017-01-04 05:04:43,452 : INFO : For index 530000, the actual number of lines written is: 1575


writing batch 530000


2017-01-04 05:04:48,069 : INFO : Finished batch of 10000 in 8m 6s
2017-01-04 05:04:48,071 : INFO : For index 470000, the actual number of lines written is: 1605


writing batch 470000


2017-01-04 05:04:48,247 : INFO : Batch creation working on 660000

2017-01-04 05:04:52,204 : INFO : Finished batch of 10000 in 8m 6s
2017-01-04 05:04:52,207 : INFO : For index 230000, the actual number of lines written is: 1623


writing batch 230000


2017-01-04 05:04:52,742 : INFO : Batch creation working on 720000

2017-01-04 05:04:56,961 : INFO : Batch creation working on 780000

2017-01-04 05:05:08,990 : INFO : Finished batch of 10000 in 8m 23s
2017-01-04 05:05:08,992 : INFO : For index 590000, the actual number of lines written is: 1633


writing batch 590000


2017-01-04 05:05:14,343 : INFO : Batch creation working on 840000

2017-01-04 05:05:18,185 : INFO : Finished batch of 10000 in 8m 20s
2017-01-04 05:05:18,187 : INFO : For index 50000, the actual number of lines written is: 1523


writing batch 50000


2017-01-04 05:05:18,403 : INFO : Finished batch of 10000 in 8m 22s
2017-01-04 05:05:18,412 : INFO : For index 350000, the actual number of lines written is: 1557


writing batch 350000


2017-01-04 05:05:23,775 : INFO : Batch creation working on 900000

2017-01-04 05:05:23,842 : INFO : Batch creation working on 960000

2017-01-04 05:05:40,667 : INFO : Finished batch of 10000 in 8m 35s
2017-01-04 05:05:40,669 : INFO : For index 290000, the actual number of lines written is: 1492


writing batch 290000


2017-01-04 05:05:45,655 : INFO : Batch creation working on 1020000

2017-01-04 05:06:27,510 : INFO : Finished batch of 10000 in 8m 11s
2017-01-04 05:06:27,514 : INFO : For index 170000, the actual number of lines written is: 1587


writing batch 170000


2017-01-04 05:06:31,768 : INFO : Batch creation working on 1080000

2017-01-04 05:06:56,293 : INFO : Finished batch of 10000 in 8m 5s
2017-01-04 05:06:56,295 : INFO : For index 110000, the actual number of lines written is: 1652


writing batch 110000


2017-01-04 05:07:00,544 : INFO : Batch creation working on 1140000

2017-01-04 05:09:28,512 : INFO : 1000
2017-01-04 05:09:37,678 : INFO : 1000
2017-01-04 05:09:54,596 : INFO : 1000
2017-01-04 05:09:59,075 : INFO : 1000
2017-01-04 05:10:14,906 : INFO : 1000
2017-01-04 05:10:20,343 : INFO : 1000
2017-01-04 05:10:40,554 : INFO : 1000
2017-01-04 05:10:47,697 : INFO : 1000
2017-01-04 05:11:44,277 : INFO : 1000
2017-01-04 05:11:59,188 : INFO : 1000
2017-01-04 05:12:41,262 : INFO : Finished batch of 10000 in 8m 5s
2017-01-04 05:12:41,265 : INFO : For index 600000, the actual number of lines written is: 1623


writing batch 600000


2017-01-04 05:12:46,139 : INFO : Batch creation working on 610000

2017-01-04 05:12:54,764 : INFO : Finished batch of 10000 in 8m 6s
2017-01-04 05:12:54,775 : INFO : For index 660000, the actual number of lines written is: 1556


writing batch 660000


2017-01-04 05:12:59,352 : INFO : Batch creation working on 670000

2017-01-04 05:13:03,954 : INFO : Finished batch of 10000 in 8m 7s
2017-01-04 05:13:03,956 : INFO : For index 780000, the actual number of lines written is: 1595


writing batch 780000


2017-01-04 05:13:08,740 : INFO : Batch creation working on 790000

2017-01-04 05:13:14,744 : INFO : Finished batch of 10000 in 8m 22s
2017-01-04 05:13:14,747 : INFO : For index 720000, the actual number of lines written is: 1680


writing batch 720000


2017-01-04 05:13:19,879 : INFO : Batch creation working on 730000

2017-01-04 05:13:26,249 : INFO : Finished batch of 10000 in 8m 12s
2017-01-04 05:13:26,252 : INFO : For index 840000, the actual number of lines written is: 1618


writing batch 840000


2017-01-04 05:13:29,081 : INFO : Finished batch of 10000 in 8m 5s
2017-01-04 05:13:29,083 : INFO : For index 960000, the actual number of lines written is: 1548


writing batch 960000


2017-01-04 05:13:31,074 : INFO : Batch creation working on 850000

2017-01-04 05:13:33,684 : INFO : Batch creation working on 970000

2017-01-04 05:13:38,624 : INFO : Finished batch of 10000 in 8m 15s
2017-01-04 05:13:38,636 : INFO : For index 900000, the actual number of lines written is: 1668


writing batch 900000


2017-01-04 05:13:43,826 : INFO : Batch creation working on 910000

2017-01-04 05:13:49,556 : INFO : Finished batch of 10000 in 8m 4s
2017-01-04 05:13:49,559 : INFO : For index 1020000, the actual number of lines written is: 1633


writing batch 1020000


2017-01-04 05:13:54,003 : INFO : Batch creation working on 1030000

2017-01-04 05:14:37,771 : INFO : Finished batch of 10000 in 8m 6s
2017-01-04 05:14:37,773 : INFO : For index 1080000, the actual number of lines written is: 1540


writing batch 1080000


2017-01-04 05:14:42,390 : INFO : Batch creation working on 1090000

2017-01-04 05:15:01,177 : INFO : Finished batch of 10000 in 8m 1s
2017-01-04 05:15:01,180 : INFO : For index 1140000, the actual number of lines written is: 1590


writing batch 1140000


2017-01-04 05:15:06,214 : INFO : Batch creation working on 1150000

2017-01-04 05:17:36,790 : INFO : 1000
2017-01-04 05:18:13,863 : INFO : 1000
2017-01-04 05:18:14,212 : INFO : 1000
2017-01-04 05:18:29,521 : INFO : 1000
2017-01-04 05:18:32,162 : INFO : 1000
2017-01-04 05:18:42,986 : INFO : 1000
2017-01-04 05:18:58,329 : INFO : 1000
2017-01-04 05:19:00,242 : INFO : 1000
2017-01-04 05:19:49,256 : INFO : 1000
2017-01-04 05:20:22,288 : INFO : 1000
2017-01-04 05:20:47,280 : INFO : Finished batch of 10000 in 8m 1s
2017-01-04 05:20:47,283 : INFO : For index 610000, the actual number of lines written is: 1623


writing batch 610000


2017-01-04 05:20:52,325 : INFO : Batch creation working on 620000

2017-01-04 05:21:14,562 : INFO : Finished batch of 10000 in 8m 15s
2017-01-04 05:21:14,565 : INFO : For index 670000, the actual number of lines written is: 1589


writing batch 670000


2017-01-04 05:21:18,670 : INFO : Finished batch of 10000 in 8m 10s
2017-01-04 05:21:18,672 : INFO : For index 790000, the actual number of lines written is: 1586


writing batch 790000


2017-01-04 05:21:19,331 : INFO : Batch creation working on 680000

2017-01-04 05:21:24,180 : INFO : Batch creation working on 800000

2017-01-04 05:21:35,872 : INFO : Finished batch of 10000 in 8m 16s
2017-01-04 05:21:35,874 : INFO : For index 730000, the actual number of lines written is: 1639


writing batch 730000


2017-01-04 05:21:36,069 : INFO : Finished batch of 10000 in 8m 5s
2017-01-04 05:21:36,071 : INFO : For index 850000, the actual number of lines written is: 1627


writing batch 850000


2017-01-04 05:21:42,125 : INFO : Batch creation working on 740000

2017-01-04 05:21:42,148 : INFO : Batch creation working on 860000

2017-01-04 05:21:54,222 : INFO : Finished batch of 10000 in 8m 10s
2017-01-04 05:21:54,225 : INFO : For index 910000, the actual number of lines written is: 1555


writing batch 910000


2017-01-04 05:21:55,971 : INFO : Finished batch of 10000 in 8m 22s
2017-01-04 05:21:55,980 : INFO : For index 970000, the actual number of lines written is: 1598


writing batch 970000


2017-01-04 05:21:58,598 : INFO : Batch creation working on 920000

2017-01-04 05:22:00,785 : INFO : Batch creation working on 980000

2017-01-04 05:22:19,384 : INFO : Finished batch of 10000 in 8m 25s
2017-01-04 05:22:19,386 : INFO : For index 1030000, the actual number of lines written is: 1674


writing batch 1030000


2017-01-04 05:22:24,432 : INFO : Batch creation working on 1040000

2017-01-04 05:22:56,175 : INFO : Finished batch of 10000 in 8m 14s
2017-01-04 05:22:56,177 : INFO : For index 1090000, the actual number of lines written is: 1570


writing batch 1090000


2017-01-04 05:23:00,673 : INFO : Batch creation working on 1100000

2017-01-04 05:23:25,479 : INFO : Finished batch of 10000 in 8m 19s
2017-01-04 05:23:25,482 : INFO : For index 1150000, the actual number of lines written is: 1605


writing batch 1150000


2017-01-04 05:23:29,774 : INFO : Batch creation working on 1160000

2017-01-04 05:25:45,403 : INFO : 1000
2017-01-04 05:26:21,426 : INFO : 1000
2017-01-04 05:26:30,517 : INFO : 1000
2017-01-04 05:26:41,592 : INFO : 1000
2017-01-04 05:26:44,069 : INFO : 1000
2017-01-04 05:27:35,135 : INFO : 1000
2017-01-04 05:27:38,057 : INFO : 1000
2017-01-04 05:28:20,306 : INFO : 1000
2017-01-04 05:29:07,477 : INFO : 1000
2017-01-04 05:30:41,489 : INFO : 1000
2017-01-04 05:30:52,746 : INFO : Finished batch of 10000 in 10m 0s
2017-01-04 05:30:52,748 : INFO : For index 620000, the actual number of lines written is: 1630


writing batch 620000


2017-01-04 05:30:57,469 : INFO : Batch creation working on 630000

2017-01-04 05:31:56,061 : INFO : Finished batch of 10000 in 10m 32s
2017-01-04 05:31:56,090 : INFO : For index 800000, the actual number of lines written is: 1641


writing batch 800000


2017-01-04 05:32:02,817 : INFO : Finished batch of 10000 in 10m 43s
2017-01-04 05:32:02,886 : INFO : For index 680000, the actual number of lines written is: 1605


writing batch 680000


2017-01-04 05:32:04,052 : INFO : Batch creation working on 810000

2017-01-04 05:32:12,424 : INFO : Batch creation working on 690000

2017-01-04 05:32:28,213 : INFO : Finished batch of 10000 in 10m 46s
2017-01-04 05:32:28,217 : INFO : For index 860000, the actual number of lines written is: 1597


writing batch 860000


2017-01-04 05:32:29,300 : INFO : Finished batch of 10000 in 10m 47s
2017-01-04 05:32:29,322 : INFO : For index 740000, the actual number of lines written is: 1605


writing batch 740000


2017-01-04 05:32:36,724 : INFO : Batch creation working on 870000

2017-01-04 05:32:37,965 : INFO : Batch creation working on 750000

2017-01-04 05:33:13,971 : INFO : Finished batch of 10000 in 11m 13s
2017-01-04 05:33:13,997 : INFO : For index 980000, the actual number of lines written is: 1577


writing batch 980000


2017-01-04 05:33:23,674 : INFO : Batch creation working on 990000

2017-01-04 05:33:33,125 : INFO : Finished batch of 10000 in 11m 35s
2017-01-04 05:33:33,202 : INFO : For index 920000, the actual number of lines written is: 1645


writing batch 920000


2017-01-04 05:33:44,185 : INFO : Batch creation working on 930000

2017-01-04 05:34:14,069 : INFO : Finished batch of 10000 in 11m 50s
2017-01-04 05:34:14,084 : INFO : For index 1040000, the actual number of lines written is: 1586


writing batch 1040000


2017-01-04 05:34:23,812 : INFO : Batch creation working on 1050000

2017-01-04 05:35:21,922 : INFO : Finished batch of 10000 in 12m 21s
2017-01-04 05:35:21,924 : INFO : For index 1100000, the actual number of lines written is: 1612


writing batch 1100000


2017-01-04 05:35:26,716 : INFO : Batch creation working on 1110000

2017-01-04 05:36:00,347 : INFO : Finished batch of 10000 in 12m 31s
2017-01-04 05:36:00,355 : INFO : For index 1160000, the actual number of lines written is: 1534


writing batch 1160000


2017-01-04 05:36:09,745 : INFO : Batch creation working on 1170000

2017-01-04 05:40:04,916 : INFO : 1000
2017-01-04 05:41:25,183 : INFO : 1000
2017-01-04 05:41:31,110 : INFO : 1000
2017-01-04 05:41:46,898 : INFO : 1000
2017-01-04 05:42:18,182 : INFO : 1000
2017-01-04 05:44:01,977 : INFO : 1000
2017-01-04 05:44:26,653 : INFO : 1000
2017-01-04 05:44:36,096 : INFO : 1000
2017-01-04 05:45:36,255 : INFO : 1000
2017-01-04 05:46:02,403 : INFO : Finished batch of 10000 in 15m 5s
2017-01-04 05:46:02,435 : INFO : For index 630000, the actual number of lines written is: 1623


writing batch 630000


2017-01-04 05:46:12,087 : INFO : Batch creation working on 640000

2017-01-04 05:46:42,216 : INFO : 1000
2017-01-04 05:47:37,123 : INFO : Finished batch of 10000 in 15m 33s
2017-01-04 05:47:37,134 : INFO : For index 810000, the actual number of lines written is: 1633


writing batch 810000


2017-01-04 05:47:44,459 : INFO : Finished batch of 10000 in 15m 32s
2017-01-04 05:47:44,496 : INFO : For index 690000, the actual number of lines written is: 1612


writing batch 690000


2017-01-04 05:47:46,414 : INFO : Batch creation working on 820000

2017-01-04 05:47:53,394 : INFO : Batch creation working on 700000

2017-01-04 05:48:01,943 : INFO : Finished batch of 10000 in 15m 25s
2017-01-04 05:48:01,983 : INFO : For index 870000, the actual number of lines written is: 1654


writing batch 870000


2017-01-04 05:48:11,450 : INFO : Batch creation working on 880000

2017-01-04 05:48:18,312 : INFO : Finished batch of 10000 in 15m 40s
2017-01-04 05:48:18,338 : INFO : For index 750000, the actual number of lines written is: 1603


writing batch 750000


2017-01-04 05:48:28,290 : INFO : Batch creation working on 760000

2017-01-04 05:49:49,181 : INFO : Finished batch of 10000 in 16m 25s
2017-01-04 05:49:49,210 : INFO : For index 990000, the actual number of lines written is: 1558


writing batch 990000


2017-01-04 05:49:58,197 : INFO : Batch creation working on 1000000

2017-01-04 05:50:26,892 : INFO : Finished batch of 10000 in 16m 43s
2017-01-04 05:50:26,921 : INFO : For index 930000, the actual number of lines written is: 1592


writing batch 930000


2017-01-04 05:50:37,320 : INFO : Batch creation working on 940000

2017-01-04 05:51:09,722 : INFO : Finished batch of 10000 in 16m 46s
2017-01-04 05:51:09,730 : INFO : For index 1050000, the actual number of lines written is: 1559


writing batch 1050000


2017-01-04 05:51:20,268 : INFO : Batch creation working on 1060000

2017-01-04 05:52:25,699 : INFO : Finished batch of 10000 in 16m 59s
2017-01-04 05:52:25,740 : INFO : For index 1110000, the actual number of lines written is: 1595


writing batch 1110000


2017-01-04 05:52:35,131 : INFO : Batch creation working on 1120000

2017-01-04 05:53:35,242 : INFO : Finished batch of 10000 in 17m 25s
2017-01-04 05:53:35,250 : INFO : For index 1170000, the actual number of lines written is: 1604


writing batch 1170000


2017-01-04 05:53:44,727 : INFO : Batch creation working on 1180000

2017-01-04 05:55:36,718 : INFO : 1000
2017-01-04 05:57:59,168 : INFO : 1000
2017-01-04 05:58:06,167 : INFO : 1000
2017-01-04 05:58:10,040 : INFO : 1000
2017-01-04 05:58:19,315 : INFO : 1000
2017-01-04 06:01:17,794 : INFO : 1000
2017-01-04 06:01:30,355 : INFO : Finished batch of 10000 in 15m 18s
2017-01-04 06:01:30,392 : INFO : For index 640000, the actual number of lines written is: 1610


writing batch 640000


2017-01-04 06:01:38,341 : INFO : Batch creation working on 650000

2017-01-04 06:01:56,806 : INFO : 1000
2017-01-04 06:02:25,698 : INFO : 1000
2017-01-04 06:03:26,304 : INFO : Finished batch of 10000 in 15m 40s
2017-01-04 06:03:26,307 : INFO : For index 820000, the actual number of lines written is: 1585


writing batch 820000


2017-01-04 06:03:26,762 : INFO : Finished batch of 10000 in 15m 33s
2017-01-04 06:03:26,765 : INFO : For index 700000, the actual number of lines written is: 1580


writing batch 700000


2017-01-04 06:03:30,121 : INFO : 1000
2017-01-04 06:03:32,084 : INFO : Batch creation working on 710000

2017-01-04 06:03:32,091 : INFO : Batch creation working on 830000

2017-01-04 06:03:38,817 : INFO : Finished batch of 10000 in 15m 27s
2017-01-04 06:03:38,820 : INFO : For index 880000, the actual number of lines written is: 1600


writing batch 880000


2017-01-04 06:03:47,068 : INFO : Batch creation working on 890000

2017-01-04 06:04:03,387 : INFO : Finished batch of 10000 in 15m 35s
2017-01-04 06:04:03,438 : INFO : For index 760000, the actual number of lines written is: 1639


writing batch 760000


2017-01-04 06:04:12,493 : INFO : Batch creation working on 770000

2017-01-04 06:04:13,666 : INFO : 1000
2017-01-04 06:07:01,902 : INFO : Finished batch of 10000 in 17m 4s
2017-01-04 06:07:01,905 : INFO : For index 1000000, the actual number of lines written is: 1533


writing batch 1000000


2017-01-04 06:07:07,858 : INFO : Batch creation working on 1010000

2017-01-04 06:07:16,857 : INFO : Finished batch of 10000 in 16m 39s
2017-01-04 06:07:16,860 : INFO : For index 940000, the actual number of lines written is: 1565


writing batch 940000


2017-01-04 06:07:26,198 : INFO : Batch creation working on 950000

2017-01-04 06:08:01,396 : INFO : Finished batch of 10000 in 16m 41s
2017-01-04 06:08:01,415 : INFO : For index 1060000, the actual number of lines written is: 1553


writing batch 1060000


2017-01-04 06:08:09,677 : INFO : Batch creation working on 1070000

2017-01-04 06:09:12,228 : INFO : Finished batch of 10000 in 16m 37s
2017-01-04 06:09:12,258 : INFO : For index 1120000, the actual number of lines written is: 1576


writing batch 1120000


2017-01-04 06:09:21,544 : INFO : Batch creation working on 1130000

2017-01-04 06:10:20,945 : INFO : 1000
2017-01-04 06:10:29,818 : INFO : Finished batch of 10000 in 16m 45s
2017-01-04 06:10:29,874 : INFO : For index 1180000, the actual number of lines written is: 1588


writing batch 1180000


2017-01-04 06:10:39,835 : INFO : Batch creation working on 1190000

2017-01-04 06:12:28,578 : INFO : 1000
2017-01-04 06:12:44,482 : INFO : 1000
2017-01-04 06:12:46,661 : INFO : 1000
2017-01-04 06:13:59,034 : INFO : 1000
2017-01-04 06:15:47,679 : INFO : Finished batch of 10000 in 14m 9s
2017-01-04 06:15:47,697 : INFO : For index 650000, the actual number of lines written is: 1580


writing batch 650000


2017-01-04 06:15:56,808 : INFO : Batch creation working on 1200000

2017-01-04 06:17:04,712 : INFO : 1000
2017-01-04 06:17:33,087 : INFO : 1000
2017-01-04 06:17:58,418 : INFO : 1000
2017-01-04 06:18:24,825 : INFO : Finished batch of 10000 in 14m 38s
2017-01-04 06:18:24,831 : INFO : For index 890000, the actual number of lines written is: 1596


writing batch 890000


2017-01-04 06:18:29,569 : INFO : Finished batch of 10000 in 14m 17s
2017-01-04 06:18:29,612 : INFO : For index 770000, the actual number of lines written is: 1634


writing batch 770000


2017-01-04 06:18:34,656 : INFO : Batch creation working on 1260000

2017-01-04 06:18:37,616 : INFO : Finished batch of 10000 in 15m 6s
2017-01-04 06:18:37,633 : INFO : For index 830000, the actual number of lines written is: 1661


writing batch 830000


2017-01-04 06:18:38,190 : INFO : Batch creation working on 1320000

2017-01-04 06:18:48,604 : INFO : Batch creation working on 1380000

2017-01-04 06:19:07,361 : INFO : 1000
2017-01-04 06:19:37,288 : INFO : Finished batch of 10000 in 16m 5s
2017-01-04 06:19:37,307 : INFO : For index 710000, the actual number of lines written is: 1631


writing batch 710000


2017-01-04 06:19:46,106 : INFO : Batch creation working on 1440000

2017-01-04 06:20:28,754 : INFO : 1000
2017-01-04 06:22:33,641 : INFO : Finished batch of 10000 in 15m 26s
2017-01-04 06:22:33,657 : INFO : For index 1010000, the actual number of lines written is: 1678


writing batch 1010000


2017-01-04 06:22:42,180 : INFO : Batch creation working on 1500000

2017-01-04 06:22:59,254 : INFO : Finished batch of 10000 in 15m 33s
2017-01-04 06:22:59,274 : INFO : For index 950000, the actual number of lines written is: 1549


writing batch 950000


2017-01-04 06:23:07,539 : INFO : Batch creation working on 1560000

2017-01-04 06:23:25,882 : INFO : Finished batch of 10000 in 15m 16s
2017-01-04 06:23:25,900 : INFO : For index 1070000, the actual number of lines written is: 1651


writing batch 1070000


2017-01-04 06:23:34,059 : INFO : Batch creation working on 1620000

2017-01-04 06:24:23,301 : INFO : Finished batch of 10000 in 15m 2s
2017-01-04 06:24:23,311 : INFO : For index 1130000, the actual number of lines written is: 1615


writing batch 1130000


2017-01-04 06:24:30,887 : INFO : Batch creation working on 1680000

2017-01-04 06:25:00,122 : INFO : 1000
2017-01-04 06:25:18,262 : INFO : Finished batch of 10000 in 14m 38s
2017-01-04 06:25:18,283 : INFO : For index 1190000, the actual number of lines written is: 1505


writing batch 1190000


2017-01-04 06:25:25,096 : INFO : Batch creation working on 1740000

2017-01-04 06:27:33,478 : INFO : 1000
2017-01-04 06:27:36,664 : INFO : 1000
2017-01-04 06:28:25,042 : INFO : 1000
2017-01-04 06:28:39,488 : INFO : 1000
2017-01-04 06:30:27,656 : INFO : Finished batch of 10000 in 14m 31s
2017-01-04 06:30:27,661 : INFO : For index 1200000, the actual number of lines written is: 1606


writing batch 1200000


2017-01-04 06:30:37,263 : INFO : Batch creation working on 1210000

2017-01-04 06:31:30,569 : INFO : 1000
2017-01-04 06:32:35,288 : INFO : 1000
2017-01-04 06:33:00,924 : INFO : 1000
2017-01-04 06:33:15,884 : INFO : Finished batch of 10000 in 14m 38s
2017-01-04 06:33:15,907 : INFO : For index 1320000, the actual number of lines written is: 1642


writing batch 1320000


2017-01-04 06:33:24,492 : INFO : Finished batch of 10000 in 14m 36s
2017-01-04 06:33:24,526 : INFO : For index 1380000, the actual number of lines written is: 1687


writing batch 1380000


2017-01-04 06:33:25,541 : INFO : Batch creation working on 1330000

2017-01-04 06:33:31,793 : INFO : Finished batch of 10000 in 14m 57s
2017-01-04 06:33:31,810 : INFO : For index 1260000, the actual number of lines written is: 1549


writing batch 1260000


2017-01-04 06:33:34,088 : INFO : Batch creation working on 1390000

2017-01-04 06:33:40,506 : INFO : Batch creation working on 1270000

2017-01-04 06:33:52,933 : INFO : 1000
2017-01-04 06:34:03,064 : INFO : Finished batch of 10000 in 14m 17s
2017-01-04 06:34:03,069 : INFO : For index 1440000, the actual number of lines written is: 1575


writing batch 1440000


2017-01-04 06:34:09,456 : INFO : 1000
2017-01-04 06:34:09,876 : INFO : Batch creation working on 1450000

2017-01-04 06:37:32,226 : INFO : Finished batch of 10000 in 14m 50s
2017-01-04 06:37:32,250 : INFO : For index 1500000, the actual number of lines written is: 1679


writing batch 1500000


2017-01-04 06:37:40,823 : INFO : Batch creation working on 1510000

2017-01-04 06:37:43,845 : INFO : Finished batch of 10000 in 14m 36s
2017-01-04 06:37:43,850 : INFO : For index 1560000, the actual number of lines written is: 1574


writing batch 1560000


2017-01-04 06:37:51,881 : INFO : Batch creation working on 1570000

2017-01-04 06:38:15,631 : INFO : Finished batch of 10000 in 14m 42s
2017-01-04 06:38:15,651 : INFO : For index 1620000, the actual number of lines written is: 1558


writing batch 1620000


2017-01-04 06:38:23,452 : INFO : Batch creation working on 1630000

2017-01-04 06:39:04,139 : INFO : Finished batch of 10000 in 14m 33s
2017-01-04 06:39:04,143 : INFO : For index 1680000, the actual number of lines written is: 1590


writing batch 1680000


2017-01-04 06:39:12,389 : INFO : Batch creation working on 1690000

2017-01-04 06:39:33,709 : INFO : 1000
2017-01-04 06:40:07,938 : INFO : Finished batch of 10000 in 14m 43s
2017-01-04 06:40:07,952 : INFO : For index 1740000, the actual number of lines written is: 1590


writing batch 1740000


2017-01-04 06:40:16,275 : INFO : Batch creation working on 1750000

2017-01-04 06:42:30,960 : INFO : 1000
2017-01-04 06:42:36,653 : INFO : 1000
2017-01-04 06:42:42,929 : INFO : 1000
2017-01-04 06:43:07,461 : INFO : 1000
2017-01-04 06:45:28,516 : INFO : Finished batch of 10000 in 14m 51s
2017-01-04 06:45:28,539 : INFO : For index 1210000, the actual number of lines written is: 1603


writing batch 1210000


2017-01-04 06:45:37,014 : INFO : Batch creation working on 1220000

2017-01-04 06:46:51,559 : INFO : 1000
2017-01-04 06:47:33,028 : INFO : 1000
2017-01-04 06:48:01,286 : INFO : 1000
2017-01-04 06:48:04,505 : INFO : Finished batch of 10000 in 14m 39s
2017-01-04 06:48:04,509 : INFO : For index 1330000, the actual number of lines written is: 1585


writing batch 1330000


2017-01-04 06:48:13,037 : INFO : Batch creation working on 1340000

2017-01-04 06:48:26,937 : INFO : Finished batch of 10000 in 14m 53s
2017-01-04 06:48:26,981 : INFO : For index 1390000, the actual number of lines written is: 1653


writing batch 1390000


2017-01-04 06:48:27,149 : INFO : Finished batch of 10000 in 14m 47s
2017-01-04 06:48:27,168 : INFO : For index 1270000, the actual number of lines written is: 1655


writing batch 1270000


2017-01-04 06:48:36,795 : INFO : Batch creation working on 1280000

2017-01-04 06:48:37,131 : INFO : Batch creation working on 1400000

2017-01-04 06:48:38,413 : INFO : 1000
2017-01-04 06:49:04,325 : INFO : Finished batch of 10000 in 14m 54s
2017-01-04 06:49:04,349 : INFO : For index 1450000, the actual number of lines written is: 1650


writing batch 1450000


2017-01-04 06:49:13,600 : INFO : Batch creation working on 1460000

2017-01-04 06:49:40,580 : INFO : 1000
2017-01-04 06:52:16,601 : INFO : Finished batch of 10000 in 14m 36s
2017-01-04 06:52:16,634 : INFO : For index 1510000, the actual number of lines written is: 1605


writing batch 1510000


2017-01-04 06:52:24,153 : INFO : Batch creation working on 1520000

2017-01-04 06:52:37,635 : INFO : Finished batch of 10000 in 14m 46s
2017-01-04 06:52:37,640 : INFO : For index 1570000, the actual number of lines written is: 1575


writing batch 1570000


2017-01-04 06:52:45,693 : INFO : Batch creation working on 1580000

2017-01-04 06:53:14,881 : INFO : Finished batch of 10000 in 14m 51s
2017-01-04 06:53:14,902 : INFO : For index 1630000, the actual number of lines written is: 1583


writing batch 1630000


2017-01-04 06:53:22,964 : INFO : Batch creation working on 1640000

2017-01-04 06:54:03,067 : INFO : Finished batch of 10000 in 14m 51s
2017-01-04 06:54:03,088 : INFO : For index 1690000, the actual number of lines written is: 1599


writing batch 1690000


2017-01-04 06:54:12,063 : INFO : Batch creation working on 1700000

2017-01-04 06:55:04,517 : INFO : 1000
2017-01-04 06:55:08,077 : INFO : Finished batch of 10000 in 14m 52s
2017-01-04 06:55:08,085 : INFO : For index 1750000, the actual number of lines written is: 1588


writing batch 1750000


2017-01-04 06:55:16,272 : INFO : Batch creation working on 1760000

2017-01-04 06:57:25,000 : INFO : 1000
2017-01-04 06:57:47,062 : INFO : 1000
2017-01-04 06:57:52,391 : INFO : 1000
2017-01-04 06:58:38,679 : INFO : 1000
2017-01-04 07:00:34,247 : INFO : Finished batch of 10000 in 14m 57s
2017-01-04 07:00:34,280 : INFO : For index 1220000, the actual number of lines written is: 1587


writing batch 1220000


2017-01-04 07:00:42,922 : INFO : Batch creation working on 1230000

2017-01-04 07:01:26,910 : INFO : 1000
2017-01-04 07:02:04,978 : INFO : 1000
2017-01-04 07:02:43,838 : INFO : 1000
2017-01-04 07:03:09,292 : INFO : Finished batch of 10000 in 14m 32s
2017-01-04 07:03:09,297 : INFO : For index 1400000, the actual number of lines written is: 1584


writing batch 1400000


2017-01-04 07:03:15,392 : INFO : Finished batch of 10000 in 15m 2s
2017-01-04 07:03:15,397 : INFO : For index 1340000, the actual number of lines written is: 1625


writing batch 1340000


2017-01-04 07:03:16,660 : INFO : Batch creation working on 1410000

2017-01-04 07:03:24,488 : INFO : Batch creation working on 1350000

2017-01-04 07:03:30,842 : INFO : Finished batch of 10000 in 14m 54s
2017-01-04 07:03:30,852 : INFO : For index 1280000, the actual number of lines written is: 1621


writing batch 1280000


2017-01-04 07:03:39,406 : INFO : Batch creation working on 1290000

2017-01-04 07:03:53,027 : INFO : 1000
2017-01-04 07:03:56,921 : INFO : Finished batch of 10000 in 14m 43s
2017-01-04 07:03:56,944 : INFO : For index 1460000, the actual number of lines written is: 1592


writing batch 1460000


2017-01-04 07:04:04,882 : INFO : Batch creation working on 1470000

2017-01-04 07:04:32,466 : INFO : 1000
2017-01-04 07:07:26,954 : INFO : Finished batch of 10000 in 15m 3s
2017-01-04 07:07:26,958 : INFO : For index 1520000, the actual number of lines written is: 1680


writing batch 1520000


2017-01-04 07:07:35,551 : INFO : Batch creation working on 1530000

2017-01-04 07:07:44,834 : INFO : Finished batch of 10000 in 14m 59s
2017-01-04 07:07:44,843 : INFO : For index 1580000, the actual number of lines written is: 1606


writing batch 1580000


2017-01-04 07:07:53,144 : INFO : Batch creation working on 1590000

2017-01-04 07:08:25,701 : INFO : Finished batch of 10000 in 15m 3s
2017-01-04 07:08:25,713 : INFO : For index 1640000, the actual number of lines written is: 1647


writing batch 1640000


2017-01-04 07:08:34,270 : INFO : Batch creation working on 1650000

2017-01-04 07:09:17,787 : INFO : Finished batch of 10000 in 15m 6s
2017-01-04 07:09:17,834 : INFO : For index 1700000, the actual number of lines written is: 1584


writing batch 1700000


2017-01-04 07:09:25,898 : INFO : Batch creation working on 1710000

2017-01-04 07:10:09,264 : INFO : 1000
2017-01-04 07:10:14,803 : INFO : Finished batch of 10000 in 14m 59s
2017-01-04 07:10:14,807 : INFO : For index 1760000, the actual number of lines written is: 1620


writing batch 1760000


2017-01-04 07:10:23,587 : INFO : Batch creation working on 1770000

2017-01-04 07:12:42,653 : INFO : 1000
2017-01-04 07:13:05,797 : INFO : 1000
2017-01-04 07:13:11,261 : INFO : 1000
2017-01-04 07:13:40,451 : INFO : 1000
2017-01-04 07:15:30,573 : INFO : Finished batch of 10000 in 14m 48s
2017-01-04 07:15:30,592 : INFO : For index 1230000, the actual number of lines written is: 1563


writing batch 1230000


2017-01-04 07:15:38,142 : INFO : Batch creation working on 1240000

2017-01-04 07:16:49,262 : INFO : 1000
2017-01-04 07:17:09,220 : INFO : 1000
2017-01-04 07:17:36,754 : INFO : 1000
2017-01-04 07:18:12,598 : INFO : Finished batch of 10000 in 14m 48s
2017-01-04 07:18:12,603 : INFO : For index 1350000, the actual number of lines written is: 1538


writing batch 1350000


2017-01-04 07:18:17,584 : INFO : Finished batch of 10000 in 15m 1s
2017-01-04 07:18:17,590 : INFO : For index 1410000, the actual number of lines written is: 1599


writing batch 1410000


2017-01-04 07:18:20,960 : INFO : Batch creation working on 1360000

2017-01-04 07:18:26,645 : INFO : Batch creation working on 1420000

2017-01-04 07:18:28,096 : INFO : 1000
2017-01-04 07:18:29,541 : INFO : Finished batch of 10000 in 14m 50s
2017-01-04 07:18:29,546 : INFO : For index 1290000, the actual number of lines written is: 1581


writing batch 1290000


2017-01-04 07:18:37,988 : INFO : Batch creation working on 1300000

2017-01-04 07:18:51,814 : INFO : Finished batch of 10000 in 14m 47s
2017-01-04 07:18:51,823 : INFO : For index 1470000, the actual number of lines written is: 1565


writing batch 1470000


2017-01-04 07:18:59,971 : INFO : Batch creation working on 1480000

2017-01-04 07:21:39,350 : INFO : 1000
2017-01-04 07:22:28,014 : INFO : Finished batch of 10000 in 14m 52s
2017-01-04 07:22:28,018 : INFO : For index 1530000, the actual number of lines written is: 1647


writing batch 1530000


2017-01-04 07:22:36,473 : INFO : Batch creation working on 1540000

2017-01-04 07:23:04,221 : INFO : Finished batch of 10000 in 15m 11s
2017-01-04 07:23:04,246 : INFO : For index 1590000, the actual number of lines written is: 1642


writing batch 1590000


2017-01-04 07:23:13,348 : INFO : Batch creation working on 1600000

2017-01-04 07:23:17,556 : INFO : Finished batch of 10000 in 14m 43s
2017-01-04 07:23:17,561 : INFO : For index 1650000, the actual number of lines written is: 1621


writing batch 1650000


2017-01-04 07:23:25,388 : INFO : Batch creation working on 1660000

2017-01-04 07:24:27,639 : INFO : Finished batch of 10000 in 15m 2s
2017-01-04 07:24:27,643 : INFO : For index 1710000, the actual number of lines written is: 1610


writing batch 1710000


2017-01-04 07:24:35,375 : INFO : Batch creation working on 1720000

2017-01-04 07:24:44,352 : INFO : 1000
2017-01-04 07:27:23,796 : INFO : Finished batch of 10000 in 17m 0s
2017-01-04 07:27:23,811 : INFO : For index 1770000, the actual number of lines written is: 1592


writing batch 1770000


2017-01-04 07:27:36,025 : INFO : 1000
2017-01-04 07:27:42,140 : INFO : Batch creation working on 1780000

2017-01-04 07:27:48,792 : INFO : 1000
2017-01-04 07:27:52,082 : INFO : 1000
2017-01-04 07:28:02,730 : INFO : 1000
2017-01-04 07:30:31,777 : INFO : Finished batch of 10000 in 14m 54s
2017-01-04 07:30:31,809 : INFO : For index 1240000, the actual number of lines written is: 1597


writing batch 1240000


2017-01-04 07:30:39,599 : INFO : Batch creation working on 1250000

2017-01-04 07:32:16,203 : INFO : 1000
2017-01-04 07:32:42,998 : INFO : 1000
2017-01-04 07:32:52,798 : INFO : 1000
2017-01-04 07:33:24,169 : INFO : Finished batch of 10000 in 15m 3s
2017-01-04 07:33:24,207 : INFO : For index 1360000, the actual number of lines written is: 1587


writing batch 1360000


2017-01-04 07:33:29,999 : INFO : Finished batch of 10000 in 14m 52s
2017-01-04 07:33:30,008 : INFO : For index 1300000, the actual number of lines written is: 1600


writing batch 1300000


2017-01-04 07:33:32,490 : INFO : Batch creation working on 1370000

2017-01-04 07:33:38,198 : INFO : Finished batch of 10000 in 15m 12s
2017-01-04 07:33:38,204 : INFO : For index 1420000, the actual number of lines written is: 1593


writing batch 1420000


2017-01-04 07:33:38,242 : INFO : Batch creation working on 1310000

2017-01-04 07:33:47,505 : INFO : Batch creation working on 1430000

2017-01-04 07:34:11,537 : INFO : Finished batch of 10000 in 15m 12s
2017-01-04 07:34:11,541 : INFO : For index 1480000, the actual number of lines written is: 1685


writing batch 1480000


2017-01-04 07:34:19,531 : INFO : Batch creation working on 1490000

2017-01-04 07:34:23,265 : INFO : 1000
2017-01-04 07:37:07,829 : INFO : 1000
2017-01-04 07:37:45,863 : INFO : Finished batch of 10000 in 15m 9s
2017-01-04 07:37:45,897 : INFO : For index 1540000, the actual number of lines written is: 1537


writing batch 1540000


2017-01-04 07:37:53,784 : INFO : Batch creation working on 1550000

2017-01-04 07:38:13,459 : INFO : Finished batch of 10000 in 15m 0s
2017-01-04 07:38:13,466 : INFO : For index 1600000, the actual number of lines written is: 1603


writing batch 1600000


2017-01-04 07:38:21,911 : INFO : Batch creation working on 1610000

2017-01-04 07:38:40,827 : INFO : Finished batch of 10000 in 15m 15s
2017-01-04 07:38:40,846 : INFO : For index 1660000, the actual number of lines written is: 1611


writing batch 1660000


2017-01-04 07:38:49,770 : INFO : Batch creation working on 1670000

2017-01-04 07:39:48,700 : INFO : 1000
2017-01-04 07:39:49,582 : INFO : Finished batch of 10000 in 15m 14s
2017-01-04 07:39:49,610 : INFO : For index 1720000, the actual number of lines written is: 1555


writing batch 1720000


2017-01-04 07:39:58,350 : INFO : Batch creation working on 1730000

2017-01-04 07:42:44,505 : INFO : Finished batch of 10000 in 15m 2s
2017-01-04 07:42:44,511 : INFO : For index 1780000, the actual number of lines written is: 1618


writing batch 1780000


2017-01-04 07:42:46,509 : INFO : 1000
2017-01-04 07:42:52,605 : INFO : Batch creation working on 1790000

2017-01-04 07:42:56,129 : INFO : 1000
2017-01-04 07:43:33,020 : INFO : 1000
2017-01-04 07:44:00,075 : INFO : 1000
2017-01-04 07:45:55,916 : INFO : Finished batch of 10000 in 15m 16s
2017-01-04 07:45:55,939 : INFO : For index 1250000, the actual number of lines written is: 1655


writing batch 1250000


2017-01-04 07:46:04,972 : INFO : Batch creation working on 1800000

2017-01-04 07:47:01,735 : INFO : 1000
2017-01-04 07:47:36,540 : INFO : 1000
2017-01-04 07:47:51,663 : INFO : 1000
2017-01-04 07:48:36,435 : INFO : Finished batch of 10000 in 15m 4s
2017-01-04 07:48:36,444 : INFO : For index 1370000, the actual number of lines written is: 1607


writing batch 1370000


2017-01-04 07:48:43,834 : INFO : Finished batch of 10000 in 15m 6s
2017-01-04 07:48:43,838 : INFO : For index 1310000, the actual number of lines written is: 1618


writing batch 1310000


2017-01-04 07:48:44,823 : INFO : Batch creation working on 1860000

2017-01-04 07:48:51,305 : INFO : Batch creation working on 1920000

2017-01-04 07:48:58,218 : INFO : Finished batch of 10000 in 15m 11s
2017-01-04 07:48:58,222 : INFO : For index 1430000, the actual number of lines written is: 1592


writing batch 1430000


2017-01-04 07:49:06,366 : INFO : Batch creation working on 1980000

2017-01-04 07:49:25,410 : INFO : Finished batch of 10000 in 15m 6s
2017-01-04 07:49:25,413 : INFO : For index 1490000, the actual number of lines written is: 1603


writing batch 1490000


2017-01-04 07:49:32,077 : INFO : 1000
2017-01-04 07:52:05,891 : INFO : 1000
2017-01-04 07:52:47,552 : INFO : Finished batch of 10000 in 14m 54s
2017-01-04 07:52:47,566 : INFO : For index 1550000, the actual number of lines written is: 1644


writing batch 1550000


2017-01-04 07:53:06,239 : INFO : Finished batch of 10000 in 14m 44s
2017-01-04 07:53:06,244 : INFO : For index 1610000, the actual number of lines written is: 1635


writing batch 1610000


2017-01-04 07:53:30,443 : INFO : Finished batch of 10000 in 14m 41s
2017-01-04 07:53:30,450 : INFO : For index 1670000, the actual number of lines written is: 1642


writing batch 1670000


2017-01-04 07:54:14,243 : INFO : Finished batch of 10000 in 14m 16s
2017-01-04 07:54:14,248 : INFO : For index 1730000, the actual number of lines written is: 1574


writing batch 1730000


2017-01-04 07:54:49,259 : INFO : 1000
2017-01-04 07:56:28,310 : INFO : Finished batch of 10000 in 13m 36s
2017-01-04 07:56:28,314 : INFO : For index 1790000, the actual number of lines written is: 1550


writing batch 1790000


2017-01-04 07:56:40,028 : INFO : 1000
2017-01-04 07:56:52,272 : INFO : 1000
2017-01-04 07:57:12,525 : INFO : 1000
2017-01-04 07:58:56,025 : INFO : Finished batch of 10000 in 12m 51s
2017-01-04 07:58:56,029 : INFO : For index 1800000, the actual number of lines written is: 1596


writing batch 1800000


2017-01-04 07:59:02,207 : INFO : Batch creation working on 1810000

2017-01-04 08:00:41,843 : INFO : Finished batch of 10000 in 11m 57s
2017-01-04 08:00:41,847 : INFO : For index 1860000, the actual number of lines written is: 1557


writing batch 1860000


2017-01-04 08:00:48,116 : INFO : Batch creation working on 1870000

2017-01-04 08:00:58,508 : INFO : Finished batch of 10000 in 12m 7s
2017-01-04 08:00:58,513 : INFO : For index 1920000, the actual number of lines written is: 1679


writing batch 1920000


2017-01-04 08:01:05,424 : INFO : Batch creation working on 1930000

2017-01-04 08:01:18,160 : INFO : Finished batch of 10000 in 12m 12s
2017-01-04 08:01:18,189 : INFO : For index 1980000, the actual number of lines written is: 1566


writing batch 1980000


2017-01-04 08:01:24,981 : INFO : Batch creation working on 1990000

2017-01-04 08:06:13,397 : INFO : 1000
2017-01-04 08:08:12,086 : INFO : 1000
2017-01-04 08:08:17,238 : INFO : 1000
2017-01-04 08:08:40,641 : INFO : 1000
2017-01-04 08:10:36,903 : INFO : Finished batch of 10000 in 11m 35s
2017-01-04 08:10:36,910 : INFO : For index 1810000, the actual number of lines written is: 1589


writing batch 1810000


2017-01-04 08:10:43,218 : INFO : Batch creation working on 1820000

2017-01-04 08:12:13,918 : INFO : Finished batch of 10000 in 11m 26s
2017-01-04 08:12:13,922 : INFO : For index 1870000, the actual number of lines written is: 1576


writing batch 1870000


2017-01-04 08:12:20,809 : INFO : Batch creation working on 1880000

2017-01-04 08:12:47,745 : INFO : Finished batch of 10000 in 11m 42s
2017-01-04 08:12:47,750 : INFO : For index 1930000, the actual number of lines written is: 1614


writing batch 1930000


2017-01-04 08:12:55,179 : INFO : Batch creation working on 1940000

2017-01-04 08:12:56,175 : INFO : Finished batch of 10000 in 11m 31s
2017-01-04 08:12:56,178 : INFO : For index 1990000, the actual number of lines written is: 1651


writing batch 1990000


2017-01-04 08:13:02,738 : INFO : Batch creation working on 2000000

2017-01-04 08:18:30,624 : INFO : 1000
2017-01-04 08:20:01,341 : INFO : 1000
2017-01-04 08:20:54,175 : INFO : 1000
2017-01-04 08:20:54,359 : INFO : 1000
2017-01-04 08:22:44,181 : INFO : Finished batch of 10000 in 12m 1s
2017-01-04 08:22:44,186 : INFO : For index 1820000, the actual number of lines written is: 1573


writing batch 1820000


2017-01-04 08:22:50,938 : INFO : Batch creation working on 1830000

2017-01-04 08:24:14,433 : INFO : Finished batch of 10000 in 11m 54s
2017-01-04 08:24:14,452 : INFO : For index 1880000, the actual number of lines written is: 1570


writing batch 1880000


2017-01-04 08:24:20,687 : INFO : Batch creation working on 1890000

2017-01-04 08:24:30,270 : INFO : Finished batch of 10000 in 11m 28s
2017-01-04 08:24:30,287 : INFO : For index 2000000, the actual number of lines written is: 1539


writing batch 2000000


2017-01-04 08:24:44,450 : INFO : Finished batch of 10000 in 11m 49s
2017-01-04 08:24:44,453 : INFO : For index 1940000, the actual number of lines written is: 1539


writing batch 1940000


2017-01-04 08:24:50,782 : INFO : Batch creation working on 1950000

2017-01-04 08:30:05,931 : INFO : 1000
2017-01-04 08:32:05,758 : INFO : 1000
2017-01-04 08:32:18,029 : INFO : 1000
2017-01-04 08:34:51,482 : INFO : Finished batch of 10000 in 12m 1s
2017-01-04 08:34:51,485 : INFO : For index 1830000, the actual number of lines written is: 1613


writing batch 1830000


2017-01-04 08:34:58,851 : INFO : Batch creation working on 1840000

2017-01-04 08:36:36,619 : INFO : Finished batch of 10000 in 12m 16s
2017-01-04 08:36:36,621 : INFO : For index 1890000, the actual number of lines written is: 1532


writing batch 1890000


2017-01-04 08:36:40,491 : INFO : Batch creation working on 1900000

2017-01-04 08:37:31,770 : INFO : Finished batch of 10000 in 12m 41s
2017-01-04 08:37:31,776 : INFO : For index 1950000, the actual number of lines written is: 1585


writing batch 1950000


2017-01-04 08:37:38,998 : INFO : Batch creation working on 1960000

2017-01-04 08:43:33,201 : INFO : 1000
2017-01-04 08:46:10,054 : INFO : 1000
2017-01-04 08:46:35,658 : INFO : 1000
2017-01-04 08:49:15,163 : INFO : Finished batch of 10000 in 14m 16s
2017-01-04 08:49:15,165 : INFO : For index 1840000, the actual number of lines written is: 1587


writing batch 1840000


2017-01-04 08:49:19,707 : INFO : Batch creation working on 1850000

2017-01-04 08:51:01,782 : INFO : Finished batch of 10000 in 14m 21s
2017-01-04 08:51:01,786 : INFO : For index 1900000, the actual number of lines written is: 1532


writing batch 1900000


2017-01-04 08:51:09,347 : INFO : Batch creation working on 1910000

2017-01-04 08:52:24,218 : INFO : Finished batch of 10000 in 14m 45s
2017-01-04 08:52:24,221 : INFO : For index 1960000, the actual number of lines written is: 1644


writing batch 1960000


2017-01-04 08:52:28,649 : INFO : Batch creation working on 1970000

2017-01-04 08:58:29,945 : INFO : 1000
2017-01-04 08:59:56,880 : INFO : 1000
2017-01-04 09:01:21,172 : INFO : 1000
2017-01-04 09:03:48,143 : INFO : Finished batch of 10000 in 14m 28s
2017-01-04 09:03:48,147 : INFO : For index 1850000, the actual number of lines written is: 1510


writing batch 1850000


2017-01-04 09:05:09,293 : INFO : Finished batch of 10000 in 13m 60s
2017-01-04 09:05:09,298 : INFO : For index 1910000, the actual number of lines written is: 1606


writing batch 1910000


2017-01-04 09:06:38,757 : INFO : Finished batch of 10000 in 14m 10s
2017-01-04 09:06:38,762 : INFO : For index 1970000, the actual number of lines written is: 1635


writing batch 1970000


#### Join up the files we wrote to form BATCH_SIZE block files

In [65]:
curr_index = 0
curr_read_file_index = 0
curr_write_file_index = 0

read_file = None
write_file = None

def read_line():
    global read_file, curr_read_file_index
    while True:
        if read_file is None:
            if os.path.exists(VALIDATION_PREPROCESSED_FILES_PREFIX + str(curr_read_file_index)):
                read_file = open(VALIDATION_PREPROCESSED_FILES_PREFIX + str(curr_read_file_index), "r")
            else: 
                raise StopIteration()
        for line in read_file:
            yield line
            
        curr_read_file_index += BATCH_SIZE
        info("Reading new file for batch {}".format(curr_read_file_index))
        read_file.close()
        read_file = None

def write_line(line):
    global write_file, curr_write_file_index, curr_index
    if write_file is None:
        write_file = open(VALIDATION_MERGED_PREPROCESSED_FILES_PREFIX + str(curr_write_file_index), "w")
    write_file.write(line)
    curr_index += 1
    if curr_index % BATCH_SIZE == 0:
        curr_write_file_index += BATCH_SIZE
        info("Writing to a new file for batch {}".format(curr_write_file_index))
        write_file.close()
        write_file = None
    

for line in read_line():
    write_line(line)

2017-01-04 09:06:46,449 : INFO : Reading new file for batch 10000
2017-01-04 09:06:47,245 : INFO : Reading new file for batch 20000
2017-01-04 09:06:48,032 : INFO : Reading new file for batch 30000
2017-01-04 09:06:48,855 : INFO : Reading new file for batch 40000
2017-01-04 09:06:49,657 : INFO : Reading new file for batch 50000
2017-01-04 09:06:50,486 : INFO : Reading new file for batch 60000
2017-01-04 09:06:50,819 : INFO : Writing to a new file for batch 10000
2017-01-04 09:06:59,380 : INFO : Reading new file for batch 70000
2017-01-04 09:07:00,196 : INFO : Reading new file for batch 80000
2017-01-04 09:07:01,050 : INFO : Reading new file for batch 90000
2017-01-04 09:07:02,035 : INFO : Reading new file for batch 100000
2017-01-04 09:07:02,900 : INFO : Reading new file for batch 110000
2017-01-04 09:07:03,702 : INFO : Reading new file for batch 120000
2017-01-04 09:07:04,183 : INFO : Writing to a new file for batch 20000
2017-01-04 09:07:12,708 : INFO : Reading new file for batch 130

#### Rearranging Docids files

In [66]:
all_doc_ids = []

for i in range(0, 100000000, BATCH_SIZE):
    if os.path.exists(VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i)):
        doc_ids = pickle.load(open(VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i)))
        all_doc_ids.extend(doc_ids)
    else:
        break
        

for i in range(0, 100000000, BATCH_SIZE):
    if i < len(all_doc_ids):
        pickle.dump(all_doc_ids[i: i + BATCH_SIZE], open(VALIDATION_MERGED_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i), "w"))
    else:
        break

# Test Data

In [18]:
def multithreaded_batch_creation(start_index):

    file_prefix = TEST_PREPROCESSED_FILES_PREFIX
    doc_file_prefix = TEST_PREPROCESSED_DOCIDS_FILES_PREFIX
    
    if os.path.exists(file_prefix + str(start_index)):
        info("Batch {} already exists, skipping..".format(start_index))
        return
    
    info("Batch creation working on {}\n".format(start_index))
    with open(training_file) as file_obj:
        file_obj.seek(line_positions[start_index])
        token_lines, doc_ids = [], []
        start_time = time.time()
        for i, line in enumerate(file_obj):
            (doc_id, text) = eval(line)
            if doc_id in test_docs_list:
                token_lines.append(stemtokenizer(text))
                doc_ids.append(doc_id)
                if len(token_lines) % 1000 == 0: info(len(token_lines))
            if i >= BATCH_SIZE:
                break
    duration = time.time() - start_time
    info("Finished batch of {:d} in {:.0f}m {:.0f}s".format(BATCH_SIZE, *divmod(duration, 60)))
    info("For index {}, the actual number of lines written is: {}".format(start_index, len(token_lines)))
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, start_index)
    del token_lines
    del doc_ids

In [17]:
(divmod(len(line_positions), BATCH_SIZE)[0] +1) * BATCH_SIZE

2010000

In [19]:
pool = ThreadPool(16)
# +1 since range is end-exclusive
batches = range(0, (divmod(len(line_positions), BATCH_SIZE)[0] + 1) * BATCH_SIZE, BATCH_SIZE )
indices = pool.map(multithreaded_batch_creation, batches)

2017-01-30 18:34:43,473 : INFO : Batch creation working on 40000

2017-01-30 18:34:43,473 : INFO : Batch creation working on 400000

2017-01-30 18:34:43,474 : INFO : Batch creation working on 240000

2017-01-30 18:34:43,473 : INFO : Batch creation working on 0

2017-01-30 18:34:43,473 : INFO : Batch creation working on 200000

2017-01-30 18:34:43,474 : INFO : Batch creation working on 440000

2017-01-30 18:34:43,474 : INFO : Batch creation working on 360000

2017-01-30 18:34:43,477 : INFO : Batch creation working on 120000

2017-01-30 18:34:43,477 : INFO : Batch creation working on 80000

2017-01-30 18:34:43,477 : INFO : Batch creation working on 480000

2017-01-30 18:34:43,478 : INFO : Batch creation working on 280000

2017-01-30 18:34:43,479 : INFO : Batch creation working on 320000

2017-01-30 18:34:43,479 : INFO : Batch creation working on 160000

2017-01-30 18:34:43,479 : INFO : Batch creation working on 520000

2017-01-30 18:34:43,482 : INFO : Batch creation working on 560000

20

writing batch 0


2017-01-30 18:48:07,244 : INFO : Batch creation working on 10000

2017-01-30 18:48:18,067 : INFO : 2000
2017-01-30 18:48:19,040 : INFO : 2000
2017-01-30 18:48:26,435 : INFO : 2000
2017-01-30 18:48:41,425 : INFO : Finished batch of 10000 in 13m 58s
2017-01-30 18:48:41,430 : INFO : For index 40000, the actual number of lines written is: 2029


writing batch 40000


2017-01-30 18:48:43,648 : INFO : Finished batch of 10000 in 13m 60s
2017-01-30 18:48:43,651 : INFO : For index 200000, the actual number of lines written is: 1977


writing batch 200000


2017-01-30 18:48:47,621 : INFO : Batch creation working on 50000

2017-01-30 18:48:49,688 : INFO : Batch creation working on 210000

2017-01-30 18:48:50,889 : INFO : Finished batch of 10000 in 14m 7s
2017-01-30 18:48:50,892 : INFO : For index 440000, the actual number of lines written is: 1990


writing batch 440000


2017-01-30 18:48:51,857 : INFO : Finished batch of 10000 in 14m 8s
2017-01-30 18:48:51,860 : INFO : For index 400000, the actual number of lines written is: 2083


writing batch 400000


2017-01-30 18:48:58,996 : INFO : Batch creation working on 450000

2017-01-30 18:48:59,303 : INFO : Finished batch of 10000 in 14m 16s
2017-01-30 18:48:59,306 : INFO : For index 280000, the actual number of lines written is: 2092


writing batch 280000


2017-01-30 18:48:59,499 : INFO : Batch creation working on 410000

2017-01-30 18:49:02,258 : INFO : Finished batch of 10000 in 14m 18s
2017-01-30 18:49:02,261 : INFO : For index 600000, the actual number of lines written is: 1992


writing batch 600000


2017-01-30 18:49:06,531 : INFO : Batch creation working on 290000

2017-01-30 18:49:08,779 : INFO : Batch creation working on 610000

2017-01-30 18:51:52,214 : INFO : 2000
2017-01-30 18:51:52,661 : INFO : 2000
2017-01-30 18:51:57,505 : INFO : 2000
2017-01-30 18:52:05,077 : INFO : Finished batch of 10000 in 17m 21s
2017-01-30 18:52:05,080 : INFO : For index 120000, the actual number of lines written is: 2012


writing batch 120000


2017-01-30 18:52:05,669 : INFO : Finished batch of 10000 in 17m 22s
2017-01-30 18:52:05,675 : INFO : For index 520000, the actual number of lines written is: 2019


writing batch 520000


2017-01-30 18:52:09,372 : INFO : Finished batch of 10000 in 17m 26s
2017-01-30 18:52:09,376 : INFO : For index 160000, the actual number of lines written is: 2029


writing batch 160000


2017-01-30 18:52:14,195 : INFO : Batch creation working on 130000

2017-01-30 18:52:14,366 : INFO : Batch creation working on 530000

2017-01-30 18:52:16,357 : INFO : Finished batch of 10000 in 17m 33s
2017-01-30 18:52:16,361 : INFO : For index 360000, the actual number of lines written is: 1967


writing batch 360000


2017-01-30 18:52:17,599 : INFO : 2000
2017-01-30 18:52:17,882 : INFO : Finished batch of 10000 in 17m 34s
2017-01-30 18:52:17,885 : INFO : For index 480000, the actual number of lines written is: 2000


writing batch 480000


2017-01-30 18:52:18,218 : INFO : Batch creation working on 170000

2017-01-30 18:52:19,083 : INFO : Finished batch of 10000 in 17m 35s
2017-01-30 18:52:19,088 : INFO : For index 320000, the actual number of lines written is: 1988


writing batch 320000


2017-01-30 18:52:21,031 : INFO : Finished batch of 10000 in 17m 37s
2017-01-30 18:52:21,037 : INFO : For index 240000, the actual number of lines written is: 1949


writing batch 240000


2017-01-30 18:52:21,213 : INFO : 2000
2017-01-30 18:52:24,951 : INFO : Finished batch of 10000 in 17m 41s
2017-01-30 18:52:24,954 : INFO : For index 560000, the actual number of lines written is: 2006


writing batch 560000


2017-01-30 18:52:26,161 : INFO : Batch creation working on 370000

2017-01-30 18:52:26,430 : INFO : Finished batch of 10000 in 17m 43s
2017-01-30 18:52:26,436 : INFO : For index 80000, the actual number of lines written is: 1966


writing batch 80000


2017-01-30 18:52:27,957 : INFO : Batch creation working on 490000

2017-01-30 18:52:28,979 : INFO : Batch creation working on 330000

2017-01-30 18:52:30,281 : INFO : Batch creation working on 250000

2017-01-30 18:52:35,834 : INFO : Batch creation working on 570000

2017-01-30 18:52:36,573 : INFO : Batch creation working on 90000

2017-01-30 18:54:44,315 : INFO : 1000
2017-01-30 18:55:24,954 : INFO : 1000
2017-01-30 18:55:40,813 : INFO : 1000
2017-01-30 18:55:41,182 : INFO : 1000
2017-01-30 18:55:55,220 : INFO : 1000
2017-01-30 18:56:06,476 : INFO : 1000
2017-01-30 18:56:15,295 : INFO : 1000
2017-01-30 19:00:38,745 : INFO : 1000
2017-01-30 19:00:52,225 : INFO : 1000
2017-01-30 19:00:55,223 : INFO : 1000
2017-01-30 19:00:55,422 : INFO : 1000
2017-01-30 19:00:57,606 : INFO : 1000
2017-01-30 19:01:06,683 : INFO : 1000
2017-01-30 19:01:14,538 : INFO : 2000
2017-01-30 19:01:20,931 : INFO : 1000
2017-01-30 19:01:24,569 : INFO : Finished batch of 10000 in 13m 17s
2017-01-30 19:01:24,571 : IN

writing batch 10000


2017-01-30 19:01:30,773 : INFO : 1000
2017-01-30 19:01:30,976 : INFO : Batch creation working on 20000

2017-01-30 19:01:33,367 : INFO : 1000
2017-01-30 19:02:27,729 : INFO : 2000
2017-01-30 19:02:31,057 : INFO : 2000
2017-01-30 19:02:46,215 : INFO : Finished batch of 10000 in 13m 57s
2017-01-30 19:02:46,218 : INFO : For index 210000, the actual number of lines written is: 2038


writing batch 210000


2017-01-30 19:02:46,585 : INFO : Finished batch of 10000 in 13m 59s
2017-01-30 19:02:46,588 : INFO : For index 50000, the actual number of lines written is: 2040


writing batch 50000


2017-01-30 19:02:52,364 : INFO : Finished batch of 10000 in 13m 53s
2017-01-30 19:02:52,369 : INFO : For index 410000, the actual number of lines written is: 1883


writing batch 410000


2017-01-30 19:02:54,173 : INFO : Finished batch of 10000 in 13m 55s
2017-01-30 19:02:54,177 : INFO : For index 450000, the actual number of lines written is: 1983


writing batch 450000


2017-01-30 19:02:54,282 : INFO : Batch creation working on 60000

2017-01-30 19:02:54,523 : INFO : Batch creation working on 220000

2017-01-30 19:02:54,947 : INFO : 2000
2017-01-30 19:02:58,282 : INFO : Batch creation working on 420000

2017-01-30 19:03:00,102 : INFO : Batch creation working on 460000

2017-01-30 19:03:00,148 : INFO : 2000
2017-01-30 19:03:05,481 : INFO : Finished batch of 10000 in 13m 57s
2017-01-30 19:03:05,485 : INFO : For index 610000, the actual number of lines written is: 2009


writing batch 610000


2017-01-30 19:03:11,977 : INFO : Batch creation working on 620000

2017-01-30 19:03:18,596 : INFO : Finished batch of 10000 in 14m 12s
2017-01-30 19:03:18,598 : INFO : For index 290000, the actual number of lines written is: 2058


writing batch 290000


2017-01-30 19:03:25,307 : INFO : Batch creation working on 300000

2017-01-30 19:08:00,251 : INFO : 1000
2017-01-30 19:09:00,213 : INFO : 2000
2017-01-30 19:09:19,789 : INFO : Finished batch of 10000 in 17m 5s
2017-01-30 19:09:19,796 : INFO : For index 530000, the actual number of lines written is: 1987


writing batch 530000


2017-01-30 19:09:22,996 : INFO : Finished batch of 10000 in 17m 5s
2017-01-30 19:09:23,000 : INFO : For index 170000, the actual number of lines written is: 2045


writing batch 170000


2017-01-30 19:09:27,480 : INFO : Batch creation working on 540000

2017-01-30 19:09:29,931 : INFO : Finished batch of 10000 in 17m 1s
2017-01-30 19:09:29,934 : INFO : For index 330000, the actual number of lines written is: 1989


writing batch 330000


2017-01-30 19:09:30,800 : INFO : Batch creation working on 180000

2017-01-30 19:09:34,081 : INFO : Finished batch of 10000 in 17m 20s
2017-01-30 19:09:34,086 : INFO : For index 130000, the actual number of lines written is: 1982


writing batch 130000


2017-01-30 19:09:37,812 : INFO : Batch creation working on 340000

2017-01-30 19:09:41,011 : INFO : 1000
2017-01-30 19:09:42,129 : INFO : Batch creation working on 140000

2017-01-30 19:09:43,923 : INFO : Finished batch of 10000 in 17m 18s
2017-01-30 19:09:43,926 : INFO : For index 370000, the actual number of lines written is: 1980


writing batch 370000


2017-01-30 19:09:47,242 : INFO : 2000
2017-01-30 19:09:49,029 : INFO : Finished batch of 10000 in 17m 13s
2017-01-30 19:09:49,032 : INFO : For index 570000, the actual number of lines written is: 2004


writing batch 570000


2017-01-30 19:09:49,354 : INFO : 2000
2017-01-30 19:09:49,795 : INFO : 2000
2017-01-30 19:09:51,417 : INFO : 2000
2017-01-30 19:09:52,078 : INFO : Finished batch of 10000 in 17m 24s
2017-01-30 19:09:52,093 : INFO : For index 490000, the actual number of lines written is: 2003


writing batch 490000


2017-01-30 19:09:52,354 : INFO : Batch creation working on 380000

2017-01-30 19:09:56,110 : INFO : 1000
2017-01-30 19:09:56,866 : INFO : Batch creation working on 580000

2017-01-30 19:09:57,369 : INFO : Finished batch of 10000 in 17m 27s
2017-01-30 19:09:57,372 : INFO : For index 250000, the actual number of lines written is: 2021


writing batch 250000


2017-01-30 19:09:58,004 : INFO : 1000
2017-01-30 19:10:00,195 : INFO : Batch creation working on 500000

2017-01-30 19:10:01,418 : INFO : Finished batch of 10000 in 17m 25s
2017-01-30 19:10:01,423 : INFO : For index 90000, the actual number of lines written is: 2011


writing batch 90000


2017-01-30 19:10:05,367 : INFO : Batch creation working on 260000

2017-01-30 19:10:09,056 : INFO : 1000
2017-01-30 19:10:09,137 : INFO : Batch creation working on 100000

2017-01-30 19:10:18,695 : INFO : 1000
2017-01-30 19:10:41,536 : INFO : 1000
2017-01-30 19:14:33,556 : INFO : 2000
2017-01-30 19:14:46,196 : INFO : Finished batch of 10000 in 13m 15s
2017-01-30 19:14:46,199 : INFO : For index 20000, the actual number of lines written is: 2035


writing batch 20000


2017-01-30 19:14:52,595 : INFO : Batch creation working on 30000

2017-01-30 19:16:32,531 : INFO : 2000
2017-01-30 19:16:44,957 : INFO : Finished batch of 10000 in 13m 45s
2017-01-30 19:16:44,960 : INFO : For index 460000, the actual number of lines written is: 1978


writing batch 460000


2017-01-30 19:16:50,746 : INFO : Batch creation working on 470000

2017-01-30 19:16:53,046 : INFO : 2000
2017-01-30 19:16:53,725 : INFO : Finished batch of 10000 in 13m 59s
2017-01-30 19:16:53,727 : INFO : For index 60000, the actual number of lines written is: 2003


writing batch 60000


2017-01-30 19:16:56,421 : INFO : Finished batch of 10000 in 14m 2s
2017-01-30 19:16:56,426 : INFO : For index 220000, the actual number of lines written is: 2051


writing batch 220000


2017-01-30 19:17:00,258 : INFO : 2000
2017-01-30 19:17:00,301 : INFO : Batch creation working on 70000

2017-01-30 19:17:03,160 : INFO : Batch creation working on 230000

2017-01-30 19:17:07,782 : INFO : Finished batch of 10000 in 14m 9s
2017-01-30 19:17:07,785 : INFO : For index 420000, the actual number of lines written is: 1975


writing batch 420000


2017-01-30 19:17:14,237 : INFO : Batch creation working on 430000

2017-01-30 19:17:15,806 : INFO : Finished batch of 10000 in 14m 4s
2017-01-30 19:17:15,809 : INFO : For index 620000, the actual number of lines written is: 2040


writing batch 620000


2017-01-30 19:17:22,046 : INFO : Batch creation working on 630000

2017-01-30 19:17:28,868 : INFO : 2000
2017-01-30 19:17:31,403 : INFO : Finished batch of 10000 in 14m 6s
2017-01-30 19:17:31,406 : INFO : For index 300000, the actual number of lines written is: 2007


writing batch 300000


2017-01-30 19:17:37,686 : INFO : Batch creation working on 310000

2017-01-30 19:17:58,360 : INFO : 1000
2017-01-30 19:18:10,536 : INFO : 1000
2017-01-30 19:18:12,058 : INFO : 1000
2017-01-30 19:18:22,864 : INFO : 1000
2017-01-30 19:18:33,073 : INFO : 1000
2017-01-30 19:18:43,805 : INFO : 1000
2017-01-30 19:18:47,324 : INFO : 1000
2017-01-30 19:18:50,111 : INFO : 1000
2017-01-30 19:18:55,603 : INFO : 1000
2017-01-30 19:21:28,654 : INFO : 1000
2017-01-30 19:23:40,201 : INFO : 1000
2017-01-30 19:24:10,810 : INFO : 1000
2017-01-30 19:24:14,757 : INFO : 1000
2017-01-30 19:24:16,330 : INFO : 1000
2017-01-30 19:24:28,059 : INFO : 1000
2017-01-30 19:24:44,761 : INFO : 1000
2017-01-30 19:26:12,043 : INFO : 2000
2017-01-30 19:26:33,355 : INFO : Finished batch of 10000 in 17m 6s
2017-01-30 19:26:33,380 : INFO : For index 540000, the actual number of lines written is: 1990


writing batch 540000


2017-01-30 19:26:36,308 : INFO : Finished batch of 10000 in 17m 6s
2017-01-30 19:26:36,312 : INFO : For index 180000, the actual number of lines written is: 1994


writing batch 180000


2017-01-30 19:26:40,833 : INFO : Batch creation working on 550000

2017-01-30 19:26:43,631 : INFO : Batch creation working on 190000

2017-01-30 19:26:52,114 : INFO : Finished batch of 10000 in 17m 14s
2017-01-30 19:26:52,118 : INFO : For index 340000, the actual number of lines written is: 2089


writing batch 340000


2017-01-30 19:26:52,572 : INFO : Finished batch of 10000 in 17m 0s
2017-01-30 19:26:52,576 : INFO : For index 380000, the actual number of lines written is: 1951


writing batch 380000


2017-01-30 19:27:00,085 : INFO : Finished batch of 10000 in 17m 18s
2017-01-30 19:27:00,090 : INFO : For index 140000, the actual number of lines written is: 1983


writing batch 140000


2017-01-30 19:27:01,253 : INFO : Batch creation working on 390000

2017-01-30 19:27:02,785 : INFO : Finished batch of 10000 in 16m 57s
2017-01-30 19:27:02,791 : INFO : For index 260000, the actual number of lines written is: 1951


writing batch 260000


2017-01-30 19:27:03,242 : INFO : Batch creation working on 350000

2017-01-30 19:27:08,244 : INFO : Batch creation working on 150000

2017-01-30 19:27:08,988 : INFO : Finished batch of 10000 in 17m 9s
2017-01-30 19:27:08,991 : INFO : For index 500000, the actual number of lines written is: 1997


writing batch 500000


2017-01-30 19:27:09,594 : INFO : Finished batch of 10000 in 17m 13s
2017-01-30 19:27:09,598 : INFO : For index 580000, the actual number of lines written is: 1975


writing batch 580000


2017-01-30 19:27:10,706 : INFO : Batch creation working on 270000

2017-01-30 19:27:18,683 : INFO : Batch creation working on 510000

2017-01-30 19:27:18,941 : INFO : Batch creation working on 590000

2017-01-30 19:27:18,943 : INFO : Finished batch of 10000 in 17m 10s
2017-01-30 19:27:18,962 : INFO : For index 100000, the actual number of lines written is: 1986


writing batch 100000


2017-01-30 19:27:27,118 : INFO : Batch creation working on 110000

2017-01-30 19:28:10,296 : INFO : Finished batch of 10000 in 13m 18s
2017-01-30 19:28:10,299 : INFO : For index 30000, the actual number of lines written is: 1960


writing batch 30000


2017-01-30 19:28:16,232 : INFO : Batch creation working on 640000

2017-01-30 19:31:04,075 : INFO : 2000
2017-01-30 19:31:05,599 : INFO : Finished batch of 10000 in 14m 15s
2017-01-30 19:31:05,602 : INFO : For index 470000, the actual number of lines written is: 1986


writing batch 470000


2017-01-30 19:31:10,584 : INFO : Finished batch of 10000 in 14m 7s
2017-01-30 19:31:10,587 : INFO : For index 230000, the actual number of lines written is: 1951


writing batch 230000


2017-01-30 19:31:12,439 : INFO : Batch creation working on 680000

2017-01-30 19:31:14,053 : INFO : Finished batch of 10000 in 14m 14s
2017-01-30 19:31:14,056 : INFO : For index 70000, the actual number of lines written is: 2018


writing batch 70000


2017-01-30 19:31:16,105 : INFO : 2000
2017-01-30 19:31:17,076 : INFO : Finished batch of 10000 in 14m 3s
2017-01-30 19:31:17,080 : INFO : For index 430000, the actual number of lines written is: 2001


writing batch 430000


2017-01-30 19:31:17,480 : INFO : Batch creation working on 720000

2017-01-30 19:31:18,661 : INFO : 2000
2017-01-30 19:31:21,245 : INFO : Batch creation working on 760000

2017-01-30 19:31:23,404 : INFO : Batch creation working on 800000

2017-01-30 19:31:28,721 : INFO : Finished batch of 10000 in 14m 7s
2017-01-30 19:31:28,724 : INFO : For index 630000, the actual number of lines written is: 2028


writing batch 630000


2017-01-30 19:31:34,741 : INFO : Batch creation working on 840000

2017-01-30 19:31:38,806 : INFO : Finished batch of 10000 in 14m 1s
2017-01-30 19:31:38,809 : INFO : For index 310000, the actual number of lines written is: 1981


writing batch 310000


2017-01-30 19:31:44,677 : INFO : Batch creation working on 880000

2017-01-30 19:35:02,170 : INFO : 1000
2017-01-30 19:35:08,455 : INFO : 1000
2017-01-30 19:35:09,629 : INFO : 1000
2017-01-30 19:35:38,605 : INFO : 1000
2017-01-30 19:35:42,587 : INFO : 1000
2017-01-30 19:35:54,670 : INFO : 1000
2017-01-30 19:35:57,679 : INFO : 1000
2017-01-30 19:35:58,130 : INFO : 1000
2017-01-30 19:36:08,760 : INFO : 1000
2017-01-30 19:36:09,494 : INFO : 1000
2017-01-30 19:37:48,498 : INFO : 1000
2017-01-30 19:38:28,656 : INFO : 1000
2017-01-30 19:38:31,231 : INFO : 1000
2017-01-30 19:38:33,624 : INFO : 1000
2017-01-30 19:38:41,360 : INFO : 1000
2017-01-30 19:38:50,709 : INFO : 1000
2017-01-30 19:41:23,676 : INFO : Finished batch of 10000 in 13m 7s
2017-01-30 19:41:23,682 : INFO : For index 640000, the actual number of lines written is: 1967


writing batch 640000


2017-01-30 19:41:29,455 : INFO : Batch creation working on 650000

2017-01-30 19:43:32,519 : INFO : 2000
2017-01-30 19:43:50,289 : INFO : 2000
2017-01-30 19:44:02,767 : INFO : 2000
2017-01-30 19:44:02,848 : INFO : Finished batch of 10000 in 17m 22s
2017-01-30 19:44:02,851 : INFO : For index 550000, the actual number of lines written is: 2024


writing batch 550000


2017-01-30 19:44:06,623 : INFO : Finished batch of 10000 in 17m 5s
2017-01-30 19:44:06,628 : INFO : For index 390000, the actual number of lines written is: 1944


writing batch 390000


2017-01-30 19:44:10,318 : INFO : 2000
2017-01-30 19:44:11,276 : INFO : Batch creation working on 920000

2017-01-30 19:44:13,857 : INFO : Batch creation working on 960000

2017-01-30 19:44:13,915 : INFO : Finished batch of 10000 in 17m 30s
2017-01-30 19:44:13,919 : INFO : For index 190000, the actual number of lines written is: 2085


writing batch 190000


2017-01-30 19:44:15,743 : INFO : 2000
2017-01-30 19:44:19,492 : INFO : Finished batch of 10000 in 17m 9s
2017-01-30 19:44:19,495 : INFO : For index 270000, the actual number of lines written is: 1956


writing batch 270000


2017-01-30 19:44:24,130 : INFO : Batch creation working on 1000000

2017-01-30 19:44:26,789 : INFO : Finished batch of 10000 in 17m 24s
2017-01-30 19:44:26,793 : INFO : For index 350000, the actual number of lines written is: 1952


writing batch 350000


2017-01-30 19:44:27,142 : INFO : Finished batch of 10000 in 17m 19s
2017-01-30 19:44:27,147 : INFO : For index 150000, the actual number of lines written is: 2044


writing batch 150000


2017-01-30 19:44:28,255 : INFO : Batch creation working on 1040000

2017-01-30 19:44:34,317 : INFO : 2000
2017-01-30 19:44:36,454 : INFO : Batch creation working on 1080000

2017-01-30 19:44:36,625 : INFO : Finished batch of 10000 in 17m 10s
2017-01-30 19:44:36,629 : INFO : For index 110000, the actual number of lines written is: 2047


writing batch 110000


2017-01-30 19:44:36,848 : INFO : Batch creation working on 1120000

2017-01-30 19:44:36,986 : INFO : Finished batch of 10000 in 17m 18s
2017-01-30 19:44:36,990 : INFO : For index 590000, the actual number of lines written is: 2038


writing batch 590000


2017-01-30 19:44:41,898 : INFO : Finished batch of 10000 in 17m 23s
2017-01-30 19:44:41,901 : INFO : For index 510000, the actual number of lines written is: 1985


writing batch 510000


2017-01-30 19:44:46,245 : INFO : Batch creation working on 1160000

2017-01-30 19:44:46,684 : INFO : Batch creation working on 1200000

2017-01-30 19:44:51,452 : INFO : Batch creation working on 1240000

2017-01-30 19:45:14,483 : INFO : Finished batch of 10000 in 13m 57s
2017-01-30 19:45:14,486 : INFO : For index 720000, the actual number of lines written is: 1950


writing batch 720000


2017-01-30 19:45:15,131 : INFO : Finished batch of 10000 in 13m 52s
2017-01-30 19:45:15,139 : INFO : For index 800000, the actual number of lines written is: 1991


writing batch 800000


2017-01-30 19:45:15,865 : INFO : Finished batch of 10000 in 14m 3s
2017-01-30 19:45:15,868 : INFO : For index 680000, the actual number of lines written is: 2116


writing batch 680000


2017-01-30 19:45:21,066 : INFO : Batch creation working on 730000

2017-01-30 19:45:23,546 : INFO : Batch creation working on 810000

2017-01-30 19:45:23,727 : INFO : Batch creation working on 690000

2017-01-30 19:45:25,339 : INFO : Finished batch of 10000 in 14m 4s
2017-01-30 19:45:25,343 : INFO : For index 760000, the actual number of lines written is: 1982


writing batch 760000


2017-01-30 19:45:28,104 : INFO : Finished batch of 10000 in 13m 53s
2017-01-30 19:45:28,109 : INFO : For index 840000, the actual number of lines written is: 1999


writing batch 840000


2017-01-30 19:45:32,396 : INFO : Batch creation working on 770000

2017-01-30 19:45:34,718 : INFO : Batch creation working on 850000

2017-01-30 19:45:41,032 : INFO : 2000
2017-01-30 19:45:49,155 : INFO : Finished batch of 10000 in 14m 4s
2017-01-30 19:45:49,158 : INFO : For index 880000, the actual number of lines written is: 2016


writing batch 880000


2017-01-30 19:45:56,713 : INFO : Batch creation working on 890000

2017-01-30 19:48:24,708 : INFO : 1000
2017-01-30 19:52:35,272 : INFO : 1000
2017-01-30 19:52:36,866 : INFO : 1000
2017-01-30 19:52:37,504 : INFO : 1000
2017-01-30 19:52:37,837 : INFO : 1000
2017-01-30 19:52:45,255 : INFO : 1000
2017-01-30 19:52:50,725 : INFO : 1000
2017-01-30 19:52:52,273 : INFO : 1000
2017-01-30 19:52:52,834 : INFO : 1000
2017-01-30 19:52:54,259 : INFO : 1000
2017-01-30 19:52:56,439 : INFO : 1000
2017-01-30 19:52:58,590 : INFO : 1000
2017-01-30 19:53:00,227 : INFO : 1000
2017-01-30 19:53:08,338 : INFO : 1000
2017-01-30 19:53:25,035 : INFO : 1000
2017-01-30 19:53:43,212 : INFO : 1000
2017-01-30 19:54:51,926 : INFO : 2000
2017-01-30 19:54:57,023 : INFO : Finished batch of 10000 in 13m 28s
2017-01-30 19:54:57,025 : INFO : For index 650000, the actual number of lines written is: 2008


writing batch 650000


2017-01-30 19:55:03,198 : INFO : Batch creation working on 660000

2017-01-30 19:59:27,335 : INFO : Finished batch of 10000 in 14m 6s
2017-01-30 19:59:27,343 : INFO : For index 730000, the actual number of lines written is: 1985


writing batch 730000


2017-01-30 19:59:29,750 : INFO : Finished batch of 10000 in 13m 55s
2017-01-30 19:59:29,754 : INFO : For index 850000, the actual number of lines written is: 1984


writing batch 850000


2017-01-30 19:59:31,053 : INFO : Finished batch of 10000 in 14m 7s
2017-01-30 19:59:31,056 : INFO : For index 690000, the actual number of lines written is: 1995


writing batch 690000


2017-01-30 19:59:33,727 : INFO : Batch creation working on 740000

2017-01-30 19:59:35,447 : INFO : Batch creation working on 860000

2017-01-30 19:59:36,366 : INFO : Finished batch of 10000 in 14m 13s
2017-01-30 19:59:36,370 : INFO : For index 810000, the actual number of lines written is: 1961


writing batch 810000


2017-01-30 19:59:37,222 : INFO : Batch creation working on 700000

2017-01-30 19:59:40,158 : INFO : Finished batch of 10000 in 14m 8s
2017-01-30 19:59:40,162 : INFO : For index 770000, the actual number of lines written is: 1974


writing batch 770000


2017-01-30 19:59:42,831 : INFO : Batch creation working on 820000

2017-01-30 19:59:46,339 : INFO : Batch creation working on 780000

2017-01-30 20:00:08,569 : INFO : 2000
2017-01-30 20:00:12,346 : INFO : Finished batch of 10000 in 14m 16s
2017-01-30 20:00:12,348 : INFO : For index 890000, the actual number of lines written is: 2004


writing batch 890000


2017-01-30 20:00:18,826 : INFO : Batch creation working on 900000

2017-01-30 20:01:04,472 : INFO : 2000
2017-01-30 20:01:07,447 : INFO : Finished batch of 10000 in 16m 54s
2017-01-30 20:01:07,456 : INFO : For index 960000, the actual number of lines written is: 1991


writing batch 960000


2017-01-30 20:01:15,074 : INFO : Batch creation working on 970000

2017-01-30 20:01:19,054 : INFO : Finished batch of 10000 in 17m 8s
2017-01-30 20:01:19,062 : INFO : For index 920000, the actual number of lines written is: 1961


writing batch 920000


2017-01-30 20:01:23,126 : INFO : 2000
2017-01-30 20:01:26,763 : INFO : Batch creation working on 930000

2017-01-30 20:01:30,800 : INFO : 2000
2017-01-30 20:01:30,820 : INFO : Finished batch of 10000 in 16m 54s
2017-01-30 20:01:30,823 : INFO : For index 1120000, the actual number of lines written is: 1971


writing batch 1120000


2017-01-30 20:01:35,777 : INFO : Finished batch of 10000 in 17m 7s
2017-01-30 20:01:35,779 : INFO : For index 1040000, the actual number of lines written is: 2055


writing batch 1040000


2017-01-30 20:01:36,474 : INFO : Finished batch of 10000 in 16m 60s
2017-01-30 20:01:36,478 : INFO : For index 1080000, the actual number of lines written is: 1999


writing batch 1080000


2017-01-30 20:01:39,339 : INFO : Batch creation working on 1130000

2017-01-30 20:01:40,685 : INFO : 2000
2017-01-30 20:01:42,226 : INFO : Finished batch of 10000 in 17m 18s
2017-01-30 20:01:42,228 : INFO : For index 1000000, the actual number of lines written is: 2006


writing batch 1000000


2017-01-30 20:01:45,113 : INFO : Batch creation working on 1050000

2017-01-30 20:01:45,307 : INFO : Batch creation working on 1090000

2017-01-30 20:01:48,632 : INFO : 2000
2017-01-30 20:01:50,578 : INFO : Finished batch of 10000 in 17m 4s
2017-01-30 20:01:50,582 : INFO : For index 1200000, the actual number of lines written is: 2031


writing batch 1200000


2017-01-30 20:01:50,853 : INFO : Finished batch of 10000 in 16m 59s
2017-01-30 20:01:50,868 : INFO : For index 1240000, the actual number of lines written is: 2049


writing batch 1240000


2017-01-30 20:01:50,892 : INFO : Batch creation working on 1010000

2017-01-30 20:01:59,733 : INFO : Batch creation working on 1210000

2017-01-30 20:02:00,386 : INFO : Batch creation working on 1250000

2017-01-30 20:02:01,205 : INFO : 1000
2017-01-30 20:02:05,372 : INFO : Finished batch of 10000 in 17m 19s
2017-01-30 20:02:05,376 : INFO : For index 1160000, the actual number of lines written is: 2033


writing batch 1160000


2017-01-30 20:02:14,455 : INFO : Batch creation working on 1170000

2017-01-30 20:06:37,429 : INFO : 1000
2017-01-30 20:06:41,498 : INFO : 1000
2017-01-30 20:06:42,810 : INFO : 1000
2017-01-30 20:06:45,887 : INFO : 1000
2017-01-30 20:06:49,784 : INFO : 1000
2017-01-30 20:07:27,734 : INFO : 1000
2017-01-30 20:08:12,969 : INFO : Finished batch of 10000 in 13m 10s
2017-01-30 20:08:12,973 : INFO : For index 660000, the actual number of lines written is: 1978


writing batch 660000


2017-01-30 20:08:18,788 : INFO : Batch creation working on 670000

2017-01-30 20:09:47,869 : INFO : 1000
2017-01-30 20:10:17,130 : INFO : 1000
2017-01-30 20:10:19,987 : INFO : 1000
2017-01-30 20:10:27,523 : INFO : 1000
2017-01-30 20:10:32,680 : INFO : 1000
2017-01-30 20:10:33,078 : INFO : 1000
2017-01-30 20:10:39,339 : INFO : 1000
2017-01-30 20:10:53,351 : INFO : 1000
2017-01-30 20:10:56,108 : INFO : 1000
2017-01-30 20:13:33,952 : INFO : Finished batch of 10000 in 14m 0s
2017-01-30 20:13:33,955 : INFO : For index 740000, the actual number of lines written is: 1992


writing batch 740000


2017-01-30 20:13:35,782 : INFO : Finished batch of 10000 in 13m 59s
2017-01-30 20:13:35,791 : INFO : For index 700000, the actual number of lines written is: 1956


writing batch 700000


2017-01-30 20:13:40,310 : INFO : Batch creation working on 750000

2017-01-30 20:13:41,185 : INFO : 2000
2017-01-30 20:13:41,675 : INFO : Batch creation working on 710000

2017-01-30 20:13:44,450 : INFO : Finished batch of 10000 in 14m 9s
2017-01-30 20:13:44,453 : INFO : For index 860000, the actual number of lines written is: 1997


writing batch 860000


2017-01-30 20:13:49,070 : INFO : Finished batch of 10000 in 14m 3s
2017-01-30 20:13:49,072 : INFO : For index 780000, the actual number of lines written is: 2018


writing batch 780000


2017-01-30 20:13:50,748 : INFO : Batch creation working on 870000

2017-01-30 20:13:51,958 : INFO : Finished batch of 10000 in 14m 9s
2017-01-30 20:13:51,962 : INFO : For index 820000, the actual number of lines written is: 1957


writing batch 820000


2017-01-30 20:13:55,447 : INFO : Batch creation working on 790000

2017-01-30 20:13:58,412 : INFO : Batch creation working on 830000

2017-01-30 20:14:19,980 : INFO : Finished batch of 10000 in 14m 1s
2017-01-30 20:14:19,983 : INFO : For index 900000, the actual number of lines written is: 1959


writing batch 900000


2017-01-30 20:14:26,448 : INFO : Batch creation working on 910000

2017-01-30 20:14:51,632 : INFO : 1000
2017-01-30 20:18:00,110 : INFO : 2000
2017-01-30 20:18:22,965 : INFO : Finished batch of 10000 in 17m 8s
2017-01-30 20:18:22,972 : INFO : For index 970000, the actual number of lines written is: 2048


writing batch 970000


2017-01-30 20:18:27,725 : INFO : 2000
2017-01-30 20:18:28,614 : INFO : Finished batch of 10000 in 17m 2s
2017-01-30 20:18:28,616 : INFO : For index 930000, the actual number of lines written is: 2003


writing batch 930000


2017-01-30 20:18:29,144 : INFO : Finished batch of 10000 in 16m 50s
2017-01-30 20:18:29,146 : INFO : For index 1130000, the actual number of lines written is: 1989


writing batch 1130000


2017-01-30 20:18:31,450 : INFO : Batch creation working on 980000

2017-01-30 20:18:37,080 : INFO : Batch creation working on 940000

2017-01-30 20:18:37,108 : INFO : Batch creation working on 1140000

2017-01-30 20:18:45,724 : INFO : 2000
2017-01-30 20:18:46,020 : INFO : 2000
2017-01-30 20:18:49,853 : INFO : Finished batch of 10000 in 17m 5s
2017-01-30 20:18:49,855 : INFO : For index 1090000, the actual number of lines written is: 2008


writing batch 1090000


2017-01-30 20:18:52,440 : INFO : 2000
2017-01-30 20:18:54,241 : INFO : Finished batch of 10000 in 17m 3s
2017-01-30 20:18:54,244 : INFO : For index 1010000, the actual number of lines written is: 2005


writing batch 1010000


2017-01-30 20:18:55,119 : INFO : Finished batch of 10000 in 17m 10s
2017-01-30 20:18:55,123 : INFO : For index 1050000, the actual number of lines written is: 2024


writing batch 1050000


2017-01-30 20:18:58,400 : INFO : Batch creation working on 1100000

2017-01-30 20:19:01,680 : INFO : Batch creation working on 1020000

2017-01-30 20:19:02,974 : INFO : Batch creation working on 1060000

2017-01-30 20:19:04,774 : INFO : Finished batch of 10000 in 17m 5s
2017-01-30 20:19:04,797 : INFO : For index 1210000, the actual number of lines written is: 1965


writing batch 1210000


2017-01-30 20:19:08,987 : INFO : Finished batch of 10000 in 17m 9s
2017-01-30 20:19:08,992 : INFO : For index 1250000, the actual number of lines written is: 1985


writing batch 1250000


2017-01-30 20:19:13,118 : INFO : Batch creation working on 1220000

2017-01-30 20:19:16,898 : INFO : Batch creation working on 1260000

2017-01-30 20:19:21,005 : INFO : 2000
2017-01-30 20:19:23,710 : INFO : Finished batch of 10000 in 17m 9s
2017-01-30 20:19:23,712 : INFO : For index 1170000, the actual number of lines written is: 2008


writing batch 1170000


2017-01-30 20:19:31,852 : INFO : Batch creation working on 1180000

2017-01-30 20:20:48,331 : INFO : 1000
2017-01-30 20:21:12,778 : INFO : 1000
2017-01-30 20:21:18,344 : INFO : 1000
2017-01-30 20:21:23,784 : INFO : 2000
2017-01-30 20:21:27,676 : INFO : 1000
2017-01-30 20:21:28,145 : INFO : 1000
2017-01-30 20:21:30,328 : INFO : 1000
2017-01-30 20:21:55,579 : INFO : Finished batch of 10000 in 13m 37s
2017-01-30 20:21:55,582 : INFO : For index 670000, the actual number of lines written is: 2072


writing batch 670000


2017-01-30 20:22:02,736 : INFO : Batch creation working on 1280000

2017-01-30 20:27:08,877 : INFO : 1000
2017-01-30 20:27:09,684 : INFO : 1000
2017-01-30 20:27:15,652 : INFO : 1000
2017-01-30 20:27:21,878 : INFO : 1000
2017-01-30 20:27:29,734 : INFO : 2000
2017-01-30 20:27:33,429 : INFO : 1000
2017-01-30 20:27:43,086 : INFO : 1000
2017-01-30 20:27:44,189 : INFO : 1000
2017-01-30 20:27:47,176 : INFO : Finished batch of 10000 in 14m 5s
2017-01-30 20:27:47,179 : INFO : For index 710000, the actual number of lines written is: 1907


writing batch 710000


2017-01-30 20:27:54,072 : INFO : Batch creation working on 1320000

2017-01-30 20:27:55,889 : INFO : Finished batch of 10000 in 14m 16s
2017-01-30 20:27:55,892 : INFO : For index 750000, the actual number of lines written is: 2066


writing batch 750000


2017-01-30 20:27:58,761 : INFO : 1000
2017-01-30 20:27:59,914 : INFO : Finished batch of 10000 in 14m 9s
2017-01-30 20:27:59,918 : INFO : For index 870000, the actual number of lines written is: 1945


writing batch 870000


2017-01-30 20:28:03,285 : INFO : Batch creation working on 1360000

2017-01-30 20:28:07,052 : INFO : Batch creation working on 1400000

2017-01-30 20:28:11,042 : INFO : Finished batch of 10000 in 14m 16s
2017-01-30 20:28:11,057 : INFO : For index 790000, the actual number of lines written is: 1961


writing batch 790000


2017-01-30 20:28:11,835 : INFO : 1000
2017-01-30 20:28:14,133 : INFO : Finished batch of 10000 in 14m 16s
2017-01-30 20:28:14,137 : INFO : For index 830000, the actual number of lines written is: 1929


writing batch 830000


2017-01-30 20:28:18,071 : INFO : Batch creation working on 1440000

2017-01-30 20:28:20,934 : INFO : Batch creation working on 1480000

2017-01-30 20:28:36,328 : INFO : Finished batch of 10000 in 14m 10s
2017-01-30 20:28:36,332 : INFO : For index 910000, the actual number of lines written is: 1957


writing batch 910000


2017-01-30 20:28:43,873 : INFO : Batch creation working on 1520000

2017-01-30 20:28:56,326 : INFO : 1000
2017-01-30 20:35:21,702 : INFO : 1000
2017-01-30 20:35:21,843 : INFO : 1000
2017-01-30 20:35:34,610 : INFO : 1000
2017-01-30 20:35:45,932 : INFO : 2000
2017-01-30 20:36:00,063 : INFO : 2000
2017-01-30 20:36:00,206 : INFO : 2000
2017-01-30 20:36:00,806 : INFO : Finished batch of 10000 in 17m 24s
2017-01-30 20:36:00,809 : INFO : For index 1140000, the actual number of lines written is: 1992


writing batch 1140000


2017-01-30 20:36:01,927 : INFO : Finished batch of 10000 in 17m 30s
2017-01-30 20:36:01,930 : INFO : For index 980000, the actual number of lines written is: 2041


writing batch 980000


2017-01-30 20:36:03,786 : INFO : Finished batch of 10000 in 14m 1s
2017-01-30 20:36:03,795 : INFO : For index 1280000, the actual number of lines written is: 1970


writing batch 1280000


2017-01-30 20:36:03,874 : INFO : Finished batch of 10000 in 17m 27s
2017-01-30 20:36:03,883 : INFO : For index 940000, the actual number of lines written is: 2011


writing batch 940000


2017-01-30 20:36:04,754 : INFO : 1000
2017-01-30 20:36:08,906 : INFO : Batch creation working on 1150000

2017-01-30 20:36:10,555 : INFO : Batch creation working on 990000

2017-01-30 20:36:11,387 : INFO : Batch creation working on 1290000

2017-01-30 20:36:11,408 : INFO : 1000
2017-01-30 20:36:11,533 : INFO : Batch creation working on 950000

2017-01-30 20:36:12,450 : INFO : 1000
2017-01-30 20:36:16,889 : INFO : Finished batch of 10000 in 17m 14s
2017-01-30 20:36:16,892 : INFO : For index 1060000, the actual number of lines written is: 2047


writing batch 1060000


2017-01-30 20:36:23,414 : INFO : Batch creation working on 1070000

2017-01-30 20:36:28,886 : INFO : Finished batch of 10000 in 17m 27s
2017-01-30 20:36:28,890 : INFO : For index 1020000, the actual number of lines written is: 1975


writing batch 1020000


2017-01-30 20:36:34,357 : INFO : 2000
2017-01-30 20:36:36,376 : INFO : Batch creation working on 1030000

2017-01-30 20:36:38,787 : INFO : 2000
2017-01-30 20:36:39,375 : INFO : Finished batch of 10000 in 17m 26s
2017-01-30 20:36:39,381 : INFO : For index 1220000, the actual number of lines written is: 2002


writing batch 1220000


2017-01-30 20:36:39,609 : INFO : Finished batch of 10000 in 17m 41s
2017-01-30 20:36:39,616 : INFO : For index 1100000, the actual number of lines written is: 2013


writing batch 1100000


2017-01-30 20:36:44,853 : INFO : Finished batch of 10000 in 17m 28s
2017-01-30 20:36:44,855 : INFO : For index 1260000, the actual number of lines written is: 1962


writing batch 1260000


2017-01-30 20:36:46,231 : INFO : 2000
2017-01-30 20:36:49,029 : INFO : Batch creation working on 1230000

2017-01-30 20:36:49,156 : INFO : Batch creation working on 1110000

2017-01-30 20:36:51,924 : INFO : Batch creation working on 1270000

2017-01-30 20:36:54,341 : INFO : Finished batch of 10000 in 17m 22s
2017-01-30 20:36:54,344 : INFO : For index 1180000, the actual number of lines written is: 2018


writing batch 1180000


2017-01-30 20:37:01,926 : INFO : Batch creation working on 1190000

2017-01-30 20:42:37,740 : INFO : 2000
2017-01-30 20:42:43,338 : INFO : Finished batch of 10000 in 14m 36s
2017-01-30 20:42:43,349 : INFO : For index 1400000, the actual number of lines written is: 1974


writing batch 1400000


2017-01-30 20:42:43,487 : INFO : Finished batch of 10000 in 14m 49s
2017-01-30 20:42:43,492 : INFO : For index 1320000, the actual number of lines written is: 1933


writing batch 1320000


2017-01-30 20:42:50,377 : INFO : Batch creation working on 1410000

2017-01-30 20:42:50,615 : INFO : Batch creation working on 1330000

2017-01-30 20:43:05,377 : INFO : Finished batch of 10000 in 15m 2s
2017-01-30 20:43:05,380 : INFO : For index 1360000, the actual number of lines written is: 2058


writing batch 1360000


2017-01-30 20:43:05,436 : INFO : Finished batch of 10000 in 14m 47s
2017-01-30 20:43:05,440 : INFO : For index 1440000, the actual number of lines written is: 1928


writing batch 1440000


2017-01-30 20:43:12,937 : INFO : Batch creation working on 1450000

2017-01-30 20:43:13,656 : INFO : Batch creation working on 1370000

2017-01-30 20:43:17,760 : INFO : Finished batch of 10000 in 14m 57s
2017-01-30 20:43:17,764 : INFO : For index 1480000, the actual number of lines written is: 1974


writing batch 1480000


2017-01-30 20:43:25,243 : INFO : Batch creation working on 1490000

2017-01-30 20:43:26,913 : INFO : 2000
2017-01-30 20:43:39,596 : INFO : Finished batch of 10000 in 14m 56s
2017-01-30 20:43:39,612 : INFO : For index 1520000, the actual number of lines written is: 2021


writing batch 1520000


2017-01-30 20:43:47,422 : INFO : Batch creation working on 1530000

2017-01-30 20:43:51,117 : INFO : 1000
2017-01-30 20:44:36,277 : INFO : 1000
2017-01-30 20:45:01,692 : INFO : 1000
2017-01-30 20:45:05,774 : INFO : 1000
2017-01-30 20:45:09,636 : INFO : 1000
2017-01-30 20:45:16,780 : INFO : 1000
2017-01-30 20:45:19,933 : INFO : 1000
2017-01-30 20:45:26,112 : INFO : 1000
2017-01-30 20:45:36,672 : INFO : 1000
2017-01-30 20:45:43,086 : INFO : 1000
2017-01-30 20:50:42,181 : INFO : Finished batch of 10000 in 14m 31s
2017-01-30 20:50:42,194 : INFO : For index 1290000, the actual number of lines written is: 1898


writing batch 1290000


2017-01-30 20:50:42,630 : INFO : 1000
2017-01-30 20:50:43,538 : INFO : 1000
2017-01-30 20:50:47,492 : INFO : 1000
2017-01-30 20:50:48,067 : INFO : 1000
2017-01-30 20:50:49,975 : INFO : Batch creation working on 1300000

2017-01-30 20:50:55,281 : INFO : 1000
2017-01-30 20:51:36,824 : INFO : 1000
2017-01-30 20:52:51,172 : INFO : Finished batch of 10000 in 16m 41s
2017-01-30 20:52:51,181 : INFO : For index 990000, the actual number of lines written is: 1993


writing batch 990000


2017-01-30 20:52:54,395 : INFO : 2000
2017-01-30 20:52:55,685 : INFO : 2000
2017-01-30 20:52:57,134 : INFO : Finished batch of 10000 in 16m 48s
2017-01-30 20:52:57,149 : INFO : For index 1150000, the actual number of lines written is: 1982


writing batch 1150000


2017-01-30 20:52:58,786 : INFO : Batch creation working on 1560000

2017-01-30 20:53:04,808 : INFO : Finished batch of 10000 in 16m 41s
2017-01-30 20:53:04,810 : INFO : For index 1070000, the actual number of lines written is: 2021


writing batch 1070000


2017-01-30 20:53:05,295 : INFO : Batch creation working on 1600000

2017-01-30 20:53:07,465 : INFO : 2000
2017-01-30 20:53:08,675 : INFO : Finished batch of 10000 in 16m 57s
2017-01-30 20:53:08,678 : INFO : For index 950000, the actual number of lines written is: 2021


writing batch 950000


2017-01-30 20:53:09,842 : INFO : 2000
2017-01-30 20:53:12,190 : INFO : Batch creation working on 1640000

2017-01-30 20:53:17,050 : INFO : Batch creation working on 1680000

2017-01-30 20:53:22,497 : INFO : Finished batch of 10000 in 16m 46s
2017-01-30 20:53:22,507 : INFO : For index 1030000, the actual number of lines written is: 2028


writing batch 1030000


2017-01-30 20:53:28,133 : INFO : Finished batch of 10000 in 16m 36s
2017-01-30 20:53:28,136 : INFO : For index 1270000, the actual number of lines written is: 1975


writing batch 1270000


2017-01-30 20:53:29,311 : INFO : 2000
2017-01-30 20:53:29,807 : INFO : Batch creation working on 1720000

2017-01-30 20:53:30,045 : INFO : Finished batch of 10000 in 16m 41s
2017-01-30 20:53:30,047 : INFO : For index 1110000, the actual number of lines written is: 2037


writing batch 1110000


2017-01-30 20:53:36,889 : INFO : Batch creation working on 1760000

2017-01-30 20:53:37,429 : INFO : Finished batch of 10000 in 16m 48s
2017-01-30 20:53:37,433 : INFO : For index 1230000, the actual number of lines written is: 1992


writing batch 1230000


2017-01-30 20:53:38,394 : INFO : Batch creation working on 1800000

2017-01-30 20:53:38,759 : INFO : Finished batch of 10000 in 16m 37s
2017-01-30 20:53:38,761 : INFO : For index 1190000, the actual number of lines written is: 2026


writing batch 1190000


2017-01-30 20:53:46,054 : INFO : Batch creation working on 1840000

2017-01-30 20:53:46,319 : INFO : Batch creation working on 1880000

2017-01-30 20:57:48,498 : INFO : Finished batch of 10000 in 14m 58s
2017-01-30 20:57:48,502 : INFO : For index 1330000, the actual number of lines written is: 1960


writing batch 1330000


2017-01-30 20:57:55,522 : INFO : Batch creation working on 1340000

2017-01-30 20:58:04,996 : INFO : 2000
2017-01-30 20:58:06,275 : INFO : 1000
2017-01-30 20:58:15,340 : INFO : Finished batch of 10000 in 15m 25s
2017-01-30 20:58:15,343 : INFO : For index 1410000, the actual number of lines written is: 1980


writing batch 1410000


2017-01-30 20:58:16,062 : INFO : Finished batch of 10000 in 15m 3s
2017-01-30 20:58:16,066 : INFO : For index 1450000, the actual number of lines written is: 2020


writing batch 1450000


2017-01-30 20:58:19,117 : INFO : Finished batch of 10000 in 15m 5s
2017-01-30 20:58:19,120 : INFO : For index 1370000, the actual number of lines written is: 1982


writing batch 1370000


2017-01-30 20:58:20,393 : INFO : 2000
2017-01-30 20:58:23,509 : INFO : Batch creation working on 1420000

2017-01-30 20:58:23,726 : INFO : Batch creation working on 1460000

2017-01-30 20:58:27,172 : INFO : Batch creation working on 1380000

2017-01-30 20:58:42,570 : INFO : Finished batch of 10000 in 15m 17s
2017-01-30 20:58:42,578 : INFO : For index 1490000, the actual number of lines written is: 2058


writing batch 1490000


2017-01-30 20:58:47,558 : INFO : Finished batch of 10000 in 15m 0s
2017-01-30 20:58:47,570 : INFO : For index 1530000, the actual number of lines written is: 1974


writing batch 1530000


2017-01-30 20:58:50,628 : INFO : Batch creation working on 1500000

2017-01-30 20:58:54,271 : INFO : Batch creation working on 1540000

2017-01-30 21:00:50,550 : INFO : 1000
2017-01-30 21:01:10,178 : INFO : 1000
2017-01-30 21:01:13,640 : INFO : 1000
2017-01-30 21:01:26,048 : INFO : 1000
2017-01-30 21:01:33,139 : INFO : 1000
2017-01-30 21:01:35,127 : INFO : 1000
2017-01-30 21:01:40,970 : INFO : 1000
2017-01-30 21:01:50,907 : INFO : 1000
2017-01-30 21:02:19,812 : INFO : 1000
2017-01-30 21:05:00,155 : INFO : 2000
2017-01-30 21:05:15,666 : INFO : Finished batch of 10000 in 14m 26s
2017-01-30 21:05:15,668 : INFO : For index 1300000, the actual number of lines written is: 2047


writing batch 1300000


2017-01-30 21:05:21,585 : INFO : Batch creation working on 1310000

2017-01-30 21:05:23,308 : INFO : 1000
2017-01-30 21:05:40,158 : INFO : 1000
2017-01-30 21:05:56,591 : INFO : 1000
2017-01-30 21:05:58,766 : INFO : 1000
2017-01-30 21:05:59,849 : INFO : 1000
2017-01-30 21:06:19,865 : INFO : 1000
2017-01-30 21:09:00,287 : INFO : 2000
2017-01-30 21:09:29,132 : INFO : Finished batch of 10000 in 16m 30s
2017-01-30 21:09:29,140 : INFO : For index 1560000, the actual number of lines written is: 2050


writing batch 1560000


2017-01-30 21:09:35,973 : INFO : 2000
2017-01-30 21:09:37,834 : INFO : Batch creation working on 1570000

2017-01-30 21:09:39,282 : INFO : Finished batch of 10000 in 16m 34s
2017-01-30 21:09:39,286 : INFO : For index 1600000, the actual number of lines written is: 1949


writing batch 1600000


2017-01-30 21:09:44,974 : INFO : 2000
2017-01-30 21:09:45,167 : INFO : Finished batch of 10000 in 16m 33s
2017-01-30 21:09:45,169 : INFO : For index 1640000, the actual number of lines written is: 2001


writing batch 1640000


2017-01-30 21:09:47,040 : INFO : Batch creation working on 1610000

2017-01-30 21:09:52,761 : INFO : Batch creation working on 1650000

2017-01-30 21:09:55,474 : INFO : Finished batch of 10000 in 16m 38s
2017-01-30 21:09:55,478 : INFO : For index 1680000, the actual number of lines written is: 2048


writing batch 1680000


2017-01-30 21:10:03,633 : INFO : Batch creation working on 1690000

2017-01-30 21:10:05,783 : INFO : Finished batch of 10000 in 16m 36s
2017-01-30 21:10:05,786 : INFO : For index 1720000, the actual number of lines written is: 1986


writing batch 1720000


2017-01-30 21:10:07,382 : INFO : Finished batch of 10000 in 16m 29s
2017-01-30 21:10:07,385 : INFO : For index 1800000, the actual number of lines written is: 1990


writing batch 1800000


2017-01-30 21:10:08,741 : INFO : 2000
2017-01-30 21:10:12,912 : INFO : Finished batch of 10000 in 16m 36s
2017-01-30 21:10:12,915 : INFO : For index 1760000, the actual number of lines written is: 2007


writing batch 1760000


2017-01-30 21:10:13,649 : INFO : Batch creation working on 1730000

2017-01-30 21:10:14,420 : INFO : Batch creation working on 1810000

2017-01-30 21:10:20,878 : INFO : Batch creation working on 1770000

2017-01-30 21:10:23,610 : INFO : Finished batch of 10000 in 16m 37s
2017-01-30 21:10:23,617 : INFO : For index 1840000, the actual number of lines written is: 1915


writing batch 1840000


2017-01-30 21:10:29,283 : INFO : 2000
2017-01-30 21:10:31,138 : INFO : Batch creation working on 1850000

2017-01-30 21:10:43,907 : INFO : Finished batch of 10000 in 16m 57s
2017-01-30 21:10:43,910 : INFO : For index 1880000, the actual number of lines written is: 2034


writing batch 1880000


2017-01-30 21:10:52,765 : INFO : Batch creation working on 1890000

2017-01-30 21:11:48,647 : INFO : 1000
2017-01-30 21:12:20,201 : INFO : Finished batch of 10000 in 14m 25s
2017-01-30 21:12:20,211 : INFO : For index 1340000, the actual number of lines written is: 1974


writing batch 1340000


2017-01-30 21:12:26,391 : INFO : Batch creation working on 1350000

2017-01-30 21:12:36,558 : INFO : 2000
2017-01-30 21:12:41,234 : INFO : Finished batch of 10000 in 14m 17s
2017-01-30 21:12:41,237 : INFO : For index 1460000, the actual number of lines written is: 1984


writing batch 1460000


2017-01-30 21:12:47,336 : INFO : Batch creation working on 1470000

2017-01-30 21:12:53,747 : INFO : Finished batch of 10000 in 14m 30s
2017-01-30 21:12:53,750 : INFO : For index 1420000, the actual number of lines written is: 2045


writing batch 1420000


2017-01-30 21:12:54,098 : INFO : Finished batch of 10000 in 14m 27s
2017-01-30 21:12:54,101 : INFO : For index 1380000, the actual number of lines written is: 1991


writing batch 1380000


2017-01-30 21:12:54,306 : INFO : Finished batch of 10000 in 14m 4s
2017-01-30 21:12:54,309 : INFO : For index 1500000, the actual number of lines written is: 1987


writing batch 1500000


2017-01-30 21:13:02,635 : INFO : Batch creation working on 1510000

2017-01-30 21:13:03,089 : INFO : Batch creation working on 1430000

2017-01-30 21:13:03,173 : INFO : Batch creation working on 1390000

2017-01-30 21:13:26,584 : INFO : Finished batch of 10000 in 14m 32s
2017-01-30 21:13:26,589 : INFO : For index 1540000, the actual number of lines written is: 1990


writing batch 1540000


2017-01-30 21:13:32,967 : INFO : Batch creation working on 1550000

2017-01-30 21:17:43,394 : INFO : 1000
2017-01-30 21:18:20,879 : INFO : 1000
2017-01-30 21:18:25,147 : INFO : 1000
2017-01-30 21:18:38,364 : INFO : 1000
2017-01-30 21:18:50,390 : INFO : 2000
2017-01-30 21:18:53,015 : INFO : 1000
2017-01-30 21:18:53,447 : INFO : 1000
2017-01-30 21:18:57,541 : INFO : Finished batch of 10000 in 13m 36s
2017-01-30 21:18:57,556 : INFO : For index 1310000, the actual number of lines written is: 2016


writing batch 1310000


2017-01-30 21:19:02,893 : INFO : 1000
2017-01-30 21:19:05,068 : INFO : Batch creation working on 1920000

2017-01-30 21:19:11,083 : INFO : 1000
2017-01-30 21:19:41,161 : INFO : 1000
2017-01-30 21:19:51,591 : INFO : 1000
2017-01-30 21:20:14,076 : INFO : 1000
2017-01-30 21:20:15,909 : INFO : 1000
2017-01-30 21:20:37,162 : INFO : 1000
2017-01-30 21:20:38,127 : INFO : 1000
2017-01-30 21:20:54,890 : INFO : 1000
2017-01-30 21:26:23,964 : INFO : 2000
2017-01-30 21:26:37,057 : INFO : 1000
2017-01-30 21:26:50,053 : INFO : Finished batch of 10000 in 16m 57s
2017-01-30 21:26:50,065 : INFO : For index 1650000, the actual number of lines written is: 1946


writing batch 1650000


2017-01-30 21:26:50,293 : INFO : Finished batch of 10000 in 17m 12s
2017-01-30 21:26:50,305 : INFO : For index 1570000, the actual number of lines written is: 2049


writing batch 1570000


2017-01-30 21:26:57,766 : INFO : 2000
2017-01-30 21:26:59,720 : INFO : Batch creation working on 1660000

2017-01-30 21:27:00,421 : INFO : Batch creation working on 1580000

2017-01-30 21:27:04,121 : INFO : Finished batch of 10000 in 17m 17s
2017-01-30 21:27:04,145 : INFO : For index 1610000, the actual number of lines written is: 2014


writing batch 1610000


2017-01-30 21:27:12,646 : INFO : 2000
2017-01-30 21:27:13,158 : INFO : Batch creation working on 1620000

2017-01-30 21:27:17,588 : INFO : 2000
2017-01-30 21:27:26,283 : INFO : Finished batch of 10000 in 17m 23s
2017-01-30 21:27:26,288 : INFO : For index 1690000, the actual number of lines written is: 2016


writing batch 1690000


2017-01-30 21:27:34,242 : INFO : Batch creation working on 1700000

2017-01-30 21:27:34,340 : INFO : Finished batch of 10000 in 17m 21s
2017-01-30 21:27:34,344 : INFO : For index 1730000, the actual number of lines written is: 1982


writing batch 1730000


2017-01-30 21:27:39,056 : INFO : Finished batch of 10000 in 17m 18s
2017-01-30 21:27:39,061 : INFO : For index 1770000, the actual number of lines written is: 1975


writing batch 1770000


2017-01-30 21:27:39,920 : INFO : Finished batch of 10000 in 17m 25s
2017-01-30 21:27:39,923 : INFO : For index 1810000, the actual number of lines written is: 2045


writing batch 1810000


2017-01-30 21:27:42,533 : INFO : Batch creation working on 1740000

2017-01-30 21:27:43,365 : INFO : 2000
2017-01-30 21:27:43,877 : INFO : 2000
2017-01-30 21:27:46,567 : INFO : Finished batch of 10000 in 15m 20s
2017-01-30 21:27:46,569 : INFO : For index 1350000, the actual number of lines written is: 2005


writing batch 1350000


2017-01-30 21:27:48,091 : INFO : Batch creation working on 1780000

2017-01-30 21:27:48,961 : INFO : Finished batch of 10000 in 15m 2s
2017-01-30 21:27:48,964 : INFO : For index 1470000, the actual number of lines written is: 1971


writing batch 1470000


2017-01-30 21:27:49,671 : INFO : Batch creation working on 1820000

2017-01-30 21:27:54,382 : INFO : Batch creation working on 1960000

2017-01-30 21:27:55,297 : INFO : Batch creation working on 2000000

2017-01-30 21:27:57,938 : INFO : Finished batch of 10000 in 17m 27s
2017-01-30 21:27:57,943 : INFO : For index 1850000, the actual number of lines written is: 2035


writing batch 1850000


2017-01-30 21:28:02,658 : INFO : Finished batch of 10000 in 15m 0s
2017-01-30 21:28:02,661 : INFO : For index 1510000, the actual number of lines written is: 1996


writing batch 1510000


2017-01-30 21:28:06,249 : INFO : Batch creation working on 1860000

2017-01-30 21:28:08,571 : INFO : Finished batch of 10000 in 15m 5s
2017-01-30 21:28:08,575 : INFO : For index 1390000, the actual number of lines written is: 1967


writing batch 1390000


2017-01-30 21:28:09,777 : INFO : 2000
2017-01-30 21:28:15,399 : INFO : Finished batch of 10000 in 17m 23s
2017-01-30 21:28:15,402 : INFO : For index 1890000, the actual number of lines written is: 1964


writing batch 1890000


2017-01-30 21:28:22,301 : INFO : Finished batch of 10000 in 15m 19s
2017-01-30 21:28:22,305 : INFO : For index 1430000, the actual number of lines written is: 1965


writing batch 1430000


2017-01-30 21:28:23,922 : INFO : Batch creation working on 1900000

2017-01-30 21:28:45,371 : INFO : Finished batch of 10000 in 15m 12s
2017-01-30 21:28:45,375 : INFO : For index 1550000, the actual number of lines written is: 2095


writing batch 1550000


2017-01-30 21:32:45,820 : INFO : Finished batch of 10000 in 13m 40s
2017-01-30 21:32:45,822 : INFO : For index 1920000, the actual number of lines written is: 1972


writing batch 1920000


2017-01-30 21:32:50,873 : INFO : Batch creation working on 1930000

2017-01-30 21:34:16,164 : INFO : 1000
2017-01-30 21:34:23,871 : INFO : 1000
2017-01-30 21:35:14,868 : INFO : 1000
2017-01-30 21:35:31,023 : INFO : 1000
2017-01-30 21:35:51,486 : INFO : 1000
2017-01-30 21:36:16,565 : INFO : 1000
2017-01-30 21:36:18,791 : INFO : 1000
2017-01-30 21:36:24,424 : INFO : 1000
2017-01-30 21:36:35,440 : INFO : 1000
2017-01-30 21:36:38,619 : INFO : 1000
2017-01-30 21:36:39,348 : INFO : 1000
2017-01-30 21:38:51,598 : INFO : 1000
2017-01-30 21:40:09,589 : INFO : Finished batch of 10000 in 12m 14s
2017-01-30 21:40:09,595 : INFO : For index 2000000, the actual number of lines written is: 1931


writing batch 2000000


2017-01-30 21:40:32,704 : INFO : 2000
2017-01-30 21:40:42,770 : INFO : Finished batch of 10000 in 12m 48s
2017-01-30 21:40:42,771 : INFO : For index 1960000, the actual number of lines written is: 2032


writing batch 1960000


2017-01-30 21:40:48,045 : INFO : Batch creation working on 1970000

2017-01-30 21:43:11,084 : INFO : 2000
2017-01-30 21:43:25,409 : INFO : 2000
2017-01-30 21:43:48,736 : INFO : Finished batch of 10000 in 16m 49s
2017-01-30 21:43:48,739 : INFO : For index 1660000, the actual number of lines written is: 2047


writing batch 1660000


2017-01-30 21:43:57,440 : INFO : Batch creation working on 1670000

2017-01-30 21:44:00,750 : INFO : Finished batch of 10000 in 17m 0s
2017-01-30 21:44:00,754 : INFO : For index 1580000, the actual number of lines written is: 2100


writing batch 1580000


2017-01-30 21:44:02,166 : INFO : 2000
2017-01-30 21:44:08,608 : INFO : Finished batch of 10000 in 16m 55s
2017-01-30 21:44:08,611 : INFO : For index 1620000, the actual number of lines written is: 2015


writing batch 1620000


2017-01-30 21:44:09,881 : INFO : Batch creation working on 1590000

2017-01-30 21:44:16,733 : INFO : Batch creation working on 1630000

2017-01-30 21:44:27,837 : INFO : Finished batch of 10000 in 11m 37s
2017-01-30 21:44:27,840 : INFO : For index 1930000, the actual number of lines written is: 1951


writing batch 1930000


2017-01-30 21:44:32,293 : INFO : Finished batch of 10000 in 16m 58s
2017-01-30 21:44:32,297 : INFO : For index 1700000, the actual number of lines written is: 1982


writing batch 1700000


2017-01-30 21:44:32,796 : INFO : Batch creation working on 1940000

2017-01-30 21:44:33,237 : INFO : 2000
2017-01-30 21:44:39,706 : INFO : 2000
2017-01-30 21:44:40,364 : INFO : Batch creation working on 1710000

2017-01-30 21:44:41,842 : INFO : Finished batch of 10000 in 16m 59s
2017-01-30 21:44:41,847 : INFO : For index 1740000, the actual number of lines written is: 1991


writing batch 1740000


2017-01-30 21:44:43,789 : INFO : Finished batch of 10000 in 16m 54s
2017-01-30 21:44:43,797 : INFO : For index 1820000, the actual number of lines written is: 2008


writing batch 1820000


2017-01-30 21:44:45,200 : INFO : Finished batch of 10000 in 16m 57s
2017-01-30 21:44:45,203 : INFO : For index 1780000, the actual number of lines written is: 1936


writing batch 1780000


2017-01-30 21:44:50,640 : INFO : Finished batch of 10000 in 16m 44s
2017-01-30 21:44:50,643 : INFO : For index 1860000, the actual number of lines written is: 2035


writing batch 1860000


2017-01-30 21:44:52,004 : INFO : Batch creation working on 1830000

2017-01-30 21:44:52,011 : INFO : Batch creation working on 1750000

2017-01-30 21:44:52,887 : INFO : Batch creation working on 1790000

2017-01-30 21:44:58,399 : INFO : 2000
2017-01-30 21:44:58,747 : INFO : Batch creation working on 1870000

2017-01-30 21:45:16,134 : INFO : Finished batch of 10000 in 16m 52s
2017-01-30 21:45:16,137 : INFO : For index 1900000, the actual number of lines written is: 2031


writing batch 1900000


2017-01-30 21:45:24,531 : INFO : Batch creation working on 1910000

2017-01-30 21:47:01,262 : INFO : 1000
2017-01-30 21:50:07,181 : INFO : 1000
2017-01-30 21:52:16,027 : INFO : 1000
2017-01-30 21:52:16,061 : INFO : 1000
2017-01-30 21:52:35,144 : INFO : 1000
2017-01-30 21:52:35,852 : INFO : 1000
2017-01-30 21:52:38,312 : INFO : 1000
2017-01-30 21:52:45,493 : INFO : 1000
2017-01-30 21:52:52,993 : INFO : 1000
2017-01-30 21:52:54,657 : INFO : 1000
2017-01-30 21:52:57,532 : INFO : Finished batch of 10000 in 12m 9s
2017-01-30 21:52:57,536 : INFO : For index 1970000, the actual number of lines written is: 1921


writing batch 1970000


2017-01-30 21:53:03,664 : INFO : Batch creation working on 1980000

2017-01-30 21:53:43,151 : INFO : 1000
2017-01-30 21:55:53,072 : INFO : 2000
2017-01-30 21:55:54,233 : INFO : Finished batch of 10000 in 11m 21s
2017-01-30 21:55:54,238 : INFO : For index 1940000, the actual number of lines written is: 2002


writing batch 1940000


2017-01-30 21:56:00,229 : INFO : Batch creation working on 1950000

2017-01-30 21:59:16,176 : INFO : 1000
2017-01-30 22:00:09,115 : INFO : Finished batch of 10000 in 16m 12s
2017-01-30 22:00:09,118 : INFO : For index 1670000, the actual number of lines written is: 1879


writing batch 1670000


2017-01-30 22:00:16,184 : INFO : 2000
2017-01-30 22:00:22,608 : INFO : 2000
2017-01-30 22:00:32,337 : INFO : Finished batch of 10000 in 16m 22s
2017-01-30 22:00:32,339 : INFO : For index 1590000, the actual number of lines written is: 2042


writing batch 1590000


2017-01-30 22:00:35,187 : INFO : 2000
2017-01-30 22:00:35,367 : INFO : Finished batch of 10000 in 16m 19s
2017-01-30 22:00:35,370 : INFO : For index 1630000, the actual number of lines written is: 2021


writing batch 1630000


2017-01-30 22:00:35,393 : INFO : 2000
2017-01-30 22:00:36,691 : INFO : 2000
2017-01-30 22:00:44,919 : INFO : Finished batch of 10000 in 16m 5s
2017-01-30 22:00:44,922 : INFO : For index 1710000, the actual number of lines written is: 2018


writing batch 1710000


2017-01-30 22:00:45,975 : INFO : Finished batch of 10000 in 15m 54s
2017-01-30 22:00:45,977 : INFO : For index 1830000, the actual number of lines written is: 2021


writing batch 1830000


2017-01-30 22:00:52,057 : INFO : Finished batch of 10000 in 15m 59s
2017-01-30 22:00:52,059 : INFO : For index 1790000, the actual number of lines written is: 1987


writing batch 1790000


2017-01-30 22:00:57,822 : INFO : Finished batch of 10000 in 16m 6s
2017-01-30 22:00:57,824 : INFO : For index 1750000, the actual number of lines written is: 1998


writing batch 1750000


2017-01-30 22:01:07,934 : INFO : Finished batch of 10000 in 16m 9s
2017-01-30 22:01:07,936 : INFO : For index 1870000, the actual number of lines written is: 2093


writing batch 1870000


2017-01-30 22:01:13,617 : INFO : Finished batch of 10000 in 15m 49s
2017-01-30 22:01:13,619 : INFO : For index 1910000, the actual number of lines written is: 1990


writing batch 1910000


2017-01-30 22:01:33,677 : INFO : 1000
2017-01-30 22:05:01,098 : INFO : 2000
2017-01-30 22:05:06,218 : INFO : Finished batch of 10000 in 12m 3s
2017-01-30 22:05:06,220 : INFO : For index 1980000, the actual number of lines written is: 2015


writing batch 1980000


2017-01-30 22:05:12,400 : INFO : Batch creation working on 1990000

2017-01-30 22:07:15,335 : INFO : 2000
2017-01-30 22:07:35,957 : INFO : Finished batch of 10000 in 11m 36s
2017-01-30 22:07:35,959 : INFO : For index 1950000, the actual number of lines written is: 2054


writing batch 1950000


2017-01-30 22:10:05,701 : INFO : 1000
2017-01-30 22:15:04,340 : INFO : 2000
2017-01-30 22:15:11,871 : INFO : Finished batch of 10000 in 9m 59s
2017-01-30 22:15:11,872 : INFO : For index 1990000, the actual number of lines written is: 2022


writing batch 1990000


In [22]:
pool.close()
pool.terminate()

### Join up the files we wrote to form BATCH_SIZE block files

In [None]:
curr_index = 0
curr_read_file_index = 0
curr_write_file_index = 0

read_file = None
write_file = None

def read_line():
    global read_file, curr_read_file_index
    while True:
        if read_file is None:
            if os.path.exists(TEST_PREPROCESSED_FILES_PREFIX + str(curr_read_file_index)):
                read_file = open(TEST_PREPROCESSED_FILES_PREFIX + str(curr_read_file_index), "r")
            else: 
                raise StopIteration()
        for line in read_file:
            yield line
            
        curr_read_file_index += BATCH_SIZE
        info("Reading new file for batch {}".format(curr_read_file_index))
        read_file.close()
        read_file = None

def write_line(line):
    global write_file, curr_write_file_index, curr_index
    if write_file is None:
        write_file = open(TEST_MERGED_PREPROCESSED_FILES_PREFIX + str(curr_write_file_index), "w")
    write_file.write(line)
    curr_index += 1
    if curr_index % BATCH_SIZE == 0:
        curr_write_file_index += BATCH_SIZE
        info("Writing to a new file for batch {}".format(curr_write_file_index))
        write_file.close()
        write_file = None
    

for line in read_line():
    write_line(line)

2017-01-30 22:28:10,633 : INFO : Reading new file for batch 10000
2017-01-30 22:28:51,119 : INFO : Reading new file for batch 20000
2017-01-30 22:28:59,217 : INFO : Reading new file for batch 30000
2017-01-30 22:29:04,416 : INFO : Reading new file for batch 40000
2017-01-30 22:29:07,508 : INFO : Writing to a new file for batch 10000
2017-01-30 22:29:16,125 : INFO : Reading new file for batch 50000
2017-01-30 22:29:18,380 : INFO : Reading new file for batch 60000
2017-01-30 22:29:20,447 : INFO : Reading new file for batch 70000
2017-01-30 22:29:22,347 : INFO : Reading new file for batch 80000
2017-01-30 22:29:23,949 : INFO : Reading new file for batch 90000
2017-01-30 22:29:25,618 : INFO : Writing to a new file for batch 20000


#### Rearranging the Docids

In [21]:
all_doc_ids = []

for i in range(0, 100000000, BATCH_SIZE):
    if os.path.exists(TEST_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i)):
        doc_ids = pickle.load(open(TEST_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i)))
        all_doc_ids.extend(doc_ids)
    else:
        break
        

for i in range(0, 100000000, BATCH_SIZE):
    if i < len(all_doc_ids):
        pickle.dump(all_doc_ids[i: i + BATCH_SIZE], open(TEST_MERGED_PREPROCESSED_DOCIDS_FILES_PREFIX + str(i), "w"))
    else:
        break

## Concatenating 10,000 batch files to 100,000 batch files

In [70]:
%%time
write_file = open("/big/s/shalaby/preprocessed_data/training_large_preprocessed-0", "w")
for i in range(0, 100000, BATCH_SIZE):
    with open(TRAINING_MERGED_PREPROCESSED_FILES_PREFIX + str(i)) as read_file:
        %time content = read_file.readlines()
    write_file.writelines(content)
    print i
write_file.close()

CPU times: user 0 ns, sys: 680 ms, total: 680 ms
Wall time: 755 ms
0
CPU times: user 0 ns, sys: 724 ms, total: 724 ms
Wall time: 918 ms
10000
CPU times: user 0 ns, sys: 328 ms, total: 328 ms
Wall time: 593 ms
20000
CPU times: user 0 ns, sys: 480 ms, total: 480 ms
Wall time: 968 ms
30000
CPU times: user 0 ns, sys: 968 ms, total: 968 ms
Wall time: 58 s
40000
CPU times: user 0 ns, sys: 664 ms, total: 664 ms
Wall time: 9.68 s
50000
CPU times: user 0 ns, sys: 736 ms, total: 736 ms
Wall time: 23.1 s
60000
CPU times: user 0 ns, sys: 712 ms, total: 712 ms
Wall time: 9.25 s
70000
CPU times: user 0 ns, sys: 612 ms, total: 612 ms
Wall time: 8.45 s
80000
CPU times: user 0 ns, sys: 756 ms, total: 756 ms
Wall time: 28.8 s
90000
CPU times: user 0 ns, sys: 12.2 s, total: 12.2 s
Wall time: 2min 37s


## Old Single Threaded Method

In [None]:
%%time

batch_index = 13
file_prefix = TRAINING_PREPROCESSED_FILES_PREFIX
doc_file_prefix = TRAINING_PREPROCESSED_DOCIDS_FILES_PREFIX
with open(training_file) as file_obj:
    token_lines, doc_ids = [], []
    start_time = time.time()
    for index,line in enumerate(file_obj):
        if index < 130000: continue
        (doc_id, text) = eval(line)
        if doc_id in training_docs_list:
            token_lines.append(stemtokenizer(text))
            doc_ids.append(doc_id)
            if len(token_lines) % 1000 == 0: info(len(token_lines))
            if len(token_lines) % BATCH_SIZE == 0:
                duration = time.time() - start_time
                info("Finished batch of {:d} in {:.0f}m {:.0f}s".format(BATCH_SIZE, *divmod(duration, 60)))
                start_time = time.time()
                write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines, doc_ids = [], []
    duration = time.time() - start_time
    info("Finished batch of {:d} in {:d}m {:.0f}s".format(BATCH_SIZE, *divmod(duration, 60)))
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)

Validation

In [8]:
%%time

batch_index = 0
file_prefix = VALIDATION_PREPROCESSED_FILES_PREFIX
doc_file_prefix = VALIDATION_PREPROCESSED_DOCIDS_FILES_PREFIX
with open(training_file) as file_obj:
    token_lines, doc_ids = [], []
    for line in file_obj:
        (doc_id, text) = eval(line)
        if doc_id in validation_docs_list:
            token_lines.append(stemtokenizer(text))
            doc_ids.append(doc_id)
            if len(token_lines) % BATCH_SIZE == 0:
                %time write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines, doc_ids = [], []
    write_batch(file_prefix, doc_file_prefix, token_lines, doc_ids, batch_index * BATCH_SIZE)

writing batch 0
CPU times: user 42.5 s, sys: 832 ms, total: 43.4 s
Wall time: 43.5 s


In [34]:
%%time
line_tokens = []
with open(TRAINING_PREPROCESSED_FILES_PREFIX + str(0)) as preproc_file:
    line_lengths = []
    for line in preproc_file:
        line_lengths.append(len(line))
        line_tokens.append(line.split(" "))

CPU times: user 1.53 s, sys: 856 ms, total: 2.38 s
Wall time: 2.39 s


### Old Method

we used to write the doc id and the tokens as tuples, then do an eval on them in reading time, but this turned out to be very slow

Training

In [None]:
%%time
def write_batch(file_prefix, batch_lines, batch_start):
    if len(batch_lines):
        print "writing batch %d" % batch_start
        %time pickle.dump(batch_lines, open(file_prefix + str(batch_start), 'w'))
#         with open(file_prefix + str(batch_start), 'w') as batch_file:
#             for line in batch_lines:
#                 batch_file.write(str(line) + "\n")

batch_index = 0
file_prefix = TRAINING_PREPROCESSED_FILES_PREFIX
with open(training_file) as file_obj:
    token_lines = []
    for line in file_obj:
        (doc_id, text) = eval(line)
        if doc_id in training_docs_list:
            token_lines.append((doc_id, stemtokenizer(text)))
            if len(token_lines) % BATCH_SIZE == 0:
                write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines = []
    write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)

Validation

In [8]:
batch_index = 0
file_prefix = VALIDATION_PREPROCESSED_FILES_PREFIX
with open(training_file) as file_obj:
    token_lines = []
    for line in file_obj:
        (doc_id, text) = eval(line)
        if doc_id in validation_docs_list:
            token_lines.append((doc_id, stemtokenizer(text)))
            if len(token_lines) % BATCH_SIZE == 0:
                write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)
                batch_index += 1
                token_lines = []
    write_batch(file_prefix, token_lines, batch_index * BATCH_SIZE)

writing batch 0
writing batch 10000


In [22]:
%%time
line_tokens = []
with open(TRAINING_PREPROCESSED_FILES_PREFIX + str(0)) as preproc_file:
    for line in preproc_file:
        line_tokens.append(line.split(" "))

CPU times: user 5 s, sys: 1.2 s, total: 6.19 s
Wall time: 6.18 s


In [24]:
line_tokens[0][:10]

['technical',
 'field',
 'the',
 'present',
 'invention',
 'generally',
 'relates',
 'to',
 'wireless',
 'communications']