diff --git a/include/status.h b/include/status.h index 3f50071..309a9a2 100644 --- a/include/status.h +++ b/include/status.h @@ -7,16 +7,17 @@ enum e_status_update TAGDUP_START, /* start processing duplicate removal [2/3] */ CHUNK_DONE, /* a chunk is terminated */ CTASK_DONE, /* a ctask is terminated */ - FCLEAN_START, /* start processing fclean (outfile cleanout) [3/3] */ + FCLEAN_START, /* start processing fclean (outfile cleanup) [3/3] */ }; /* update status variables */ enum e_status_set { FILE_SIZE, /* total file size (g_infile) */ - FCOPY_BYTES, /* currently copied bytes */ + FCOPY_BYTES, /* bytes currently processed by FCOPY */ CHUNK_SIZE, /* chunk size */ - CLEANOUT_BYTES, /* bytes processed by cleanout_chunk() */ + TAGDUP_BYTES, /* bytes currently processed by TAGDUP */ + FCLEAN_BYTES, /* bytes currently processed by FCLEAN */ }; /* source file: status.c */ diff --git a/src/chunk.c b/src/chunk.c index b8db4b6..4d8d8d3 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -9,6 +9,7 @@ #include "const.h" #include "config.h" #include "status.h" +#include "debug.h" int count_chunks(void) @@ -70,9 +71,13 @@ bool get_next_chunk(t_chunk *chunk, struct file *file) */ void cleanout_chunk(t_chunk *chunk) { - t_line line; - long slot; + t_line line; + long slot; + char *base_ptr; + int i; + i = 0; + base_ptr = chunk->ptr; while (get_next_line(&line, chunk)) { slot = hash(&line) % g_hmap.size; @@ -86,7 +91,14 @@ void cleanout_chunk(t_chunk *chunk) /* archaic open addressing collision resolver */ slot = (slot + 1) % g_hmap.size; } + i++; + if (i == 500000) { + set_status(TAGDUP_BYTES, (size_t)(chunk->ptr - base_ptr)); + base_ptr = chunk->ptr; + i = 0; + } } + set_status(TAGDUP_BYTES, (size_t)(chunk->ptr - base_ptr)); free(chunk); update_status(CTASK_DONE); } diff --git a/src/hmap.c b/src/hmap.c index 395d4ed..028329d 100644 --- a/src/hmap.c +++ b/src/hmap.c @@ -2,6 +2,7 @@ #include "hmap.h" #include "hash.h" #include "const.h" +#include "status.h" #include "error.h" #include "debug.h" @@ -43,9 +44,14 @@ void destroy_hmap(void) void populate_hmap(t_chunk *chunk) { DLOG("populate_hmap()"); - t_line line; - long slot; - size_t has_slots; + t_line line; + long slot; + size_t has_slots; + char *base_ptr; + int i; + + i = 0; + base_ptr = chunk->ptr; #ifdef DEBUG int last_percent_filled = 0; @@ -86,7 +92,14 @@ void populate_hmap(t_chunk *chunk) } if (!has_slots) error("populate_hmap(): no space left on hashmap."); + i++; + if (i == 500000) { + set_status(TAGDUP_BYTES, (size_t)(chunk->ptr - base_ptr)); + base_ptr = chunk->ptr; + i = 0; + } } + set_status(TAGDUP_BYTES, (size_t)(chunk->ptr - base_ptr)); #ifdef DEBUG DLOG("populate_hmap(): used %ld/%ld slots (%.2f%%)", filled, g_hmap.size, (double)filled / (double)g_hmap.size * 100.0); diff --git a/src/main.c b/src/main.c index 21f2877..ea6582c 100644 --- a/src/main.c +++ b/src/main.c @@ -36,10 +36,15 @@ static void remove_duplicates(void) t_line line; size_t line_size; char *dst; + char *base_ptr; + int i; file_chunk.ptr = g_file->addr; file_chunk.endptr = g_file->addr + g_file->info.st_size; + i = 0; + base_ptr = file_chunk.ptr; + dst = file_chunk.ptr; while (get_next_line(&line, &file_chunk)) { @@ -48,10 +53,17 @@ static void remove_duplicates(void) dst += line_size; if (dst != file_chunk.endptr) *dst++ = '\n'; + i++; + if (i == 500000) { + set_status(FCLEAN_BYTES, (size_t)(file_chunk.ptr - base_ptr)); + base_ptr = file_chunk.ptr; + i = 0; + } } /* update file size */ g_file->info.st_size = dst - g_file->addr; + set_status(FCLEAN_BYTES, (size_t)(file_chunk.ptr - base_ptr)); } @@ -65,6 +77,7 @@ int main(int argc, char **argv) update_status(FCOPY_START); init_file(g_conf.infile_name, g_conf.outfile_name); config(); /* configure g_conf options */ + set_status(CHUNK_SIZE, g_conf.chunk_size); init_hmap(g_conf.hmap_size); update_status(TAGDUP_START); diff --git a/src/status.c b/src/status.c index ae6d48d..25fc24b 100644 --- a/src/status.c +++ b/src/status.c @@ -37,7 +37,8 @@ struct status size_t file_size; size_t fcopy_bytes; size_t chunk_size; - size_t cleanout_bytes; + size_t tagdup_bytes; + size_t fclean_bytes; }; static struct status g_status = { @@ -52,7 +53,8 @@ static struct status g_status = { .file_size = 0, .fcopy_bytes = 0, .chunk_size = 0, - .cleanout_bytes = 0, + .tagdup_bytes = 0, + .fclean_bytes = 0, }; pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -98,13 +100,27 @@ void set_status(enum e_status_set var, size_t val) switch (var) { case FILE_SIZE: - DLOG("set_status(FILE_SIZE) called"); + DLOG("set_status(FILE_SIZE, %lu) called", val); g_status.file_size = val; break ; case FCOPY_BYTES: - DLOG("set_status(FCOPY_BYTES) called"); + DLOG("set_status(FCOPY_BYTES, %lu) called", val); g_status.fcopy_bytes += val; break ; + case CHUNK_SIZE: + DLOG("set_status(CHUNK_SIZE, %lu) called", val); + g_status.chunk_size = val; + break ; + case TAGDUP_BYTES: + DLOG("set_status(TAGDUP_BYTES, %lu) called", val); + pthread_mutex_lock(&g_mutex); + g_status.tagdup_bytes += val; + pthread_mutex_unlock(&g_mutex); + break ; + case FCLEAN_BYTES: + DLOG("set_status(FCLEAN_BYTES, %lu) called", val); + g_status.fclean_bytes += val; + break ; } } @@ -159,65 +175,77 @@ void display_status(void) char elapsed_time_str[BUF_SIZE] = {0}; char arrival_time_str[BUF_SIZE] = {0}; char current_task_str[BUF_SIZE] = {0}; - double percent_progression = 0.0; time_t current_time = 0; time_t elapsed_time = 0; time_t arrival_time = 0; - if (!FCOPY_STARTED()) - return ; + double progress = 0.0; /* 1.0 == 100% */ + double remain_time = 0.0; + current_time = time(NULL); elapsed_time = current_time - START_TIME(); - percent_progression = 0.0; - if (g_status.fcopy_bytes > 0) - { - double fcopy_part; - fcopy_part = (double)g_status.fcopy_bytes / (double)g_status.file_size; - percent_progression = fcopy_part * 5.0; - if (elapsed_time > 0) { - arrival_time = elapsed_time * (time_t)(100.0 / percent_progression); + /* we need at least 1 sec execution to show status */ + if (elapsed_time == 0) + return ; + + /* FCLEAN [3/3] --> 94% to 100% */ + if (g_status.fclean_bytes) { + double fclean_part = + (double)g_status.fclean_bytes / (double)g_status.file_size; + progress = 0.94 + (fclean_part * 0.06); + if (progress > 0.9999) + progress = 0.9999; + + double fclean_elapsed_time = elapsed_time; + fclean_elapsed_time -= FCOPY_DURATION() + TAGDUP_DURATION(); + if (fclean_elapsed_time >= 1) { + remain_time = fclean_elapsed_time / fclean_part; + remain_time -= fclean_elapsed_time; + arrival_time = current_time + remain_time; } } - else if (!TAGDUP_TERMINATED()) - { - percent_progression = 5.0; - double tagdup_elapsed_time = elapsed_time - FCOPY_DURATION(); - if (g_status.done_ctasks > 0 && tagdup_elapsed_time > 0.9) - { - double time_per_ctask = tagdup_elapsed_time / g_status.done_ctasks; - time_t remaining_time = time_per_ctask * MISSING_CTASKS(); - /* adding FCOPY_DURATION because it's ~= FCLEAN_DURATION */ - arrival_time = current_time + remaining_time + FCOPY_DURATION(); - - double percent_per_ctask = 90.0 / g_status.total_ctasks; - percent_progression += percent_per_ctask * g_status.done_ctasks; - double cur_ctasks_seconds = current_time - g_status.last_ctask_date; - double ctask_progression = cur_ctasks_seconds / time_per_ctask; - if (ctask_progression > 1.0) - ctask_progression = 1.0; - percent_progression += percent_per_ctask * ctask_progression; + /* TAGDUP [2/3] --> 4% to 94% */ + else if (g_status.tagdup_bytes) { + double total_bytes = g_status.total_ctasks * g_status.chunk_size; + double tagdup_part = (double)g_status.tagdup_bytes / total_bytes; + progress = 0.04 + (tagdup_part * 0.90); + + double tagdup_elapsed_time = elapsed_time; + tagdup_elapsed_time -= FCOPY_DURATION(); + if (tagdup_elapsed_time >= 1) { + remain_time = tagdup_elapsed_time / tagdup_part; + remain_time -= tagdup_elapsed_time; + arrival_time = current_time + remain_time; + /* add estimation of FCLEAN duration: */ + arrival_time += (FCOPY_DURATION() * 6) / 4; } } - else - { - percent_progression = 95.0; + /* FCOPY [1/3] --> 0% to 4% */ + else if (g_status.fcopy_bytes) { + progress = (double)g_status.fcopy_bytes / (double)g_status.file_size; + progress *= 0.04; + } + else { + return; + } - double percent_per_second = 5.0 / (double) FCOPY_DURATION(); - time_t elapsed_fclean = current_time - g_status.fclean_date; - percent_progression += percent_per_second * (double)elapsed_fclean; - if (percent_progression > 99.99) - percent_progression = 99.99; + /* fallback method to display ETA */ + if (progress > 0 && arrival_time == 0) { + remain_time = (double)elapsed_time / progress; + remain_time -= elapsed_time; + arrival_time = current_time + remain_time; } + repr_elapsed_time(elapsed_time_str, elapsed_time); repr_arrival_time(arrival_time_str, arrival_time); repr_current_task(current_task_str); - fprintf(stderr, "time: %s %.2f%% (ETA: %s) %s ...\n", + fprintf(stderr, "time: %s %5.2f%% (ETA: %s) %s ...\n", elapsed_time_str, - percent_progression, + progress * 100.0, arrival_time_str, current_task_str); }