In [1]:
from tqdm import tqdm_notebook as tqdm
from collections import namedtuple
from datetime import datetime
from itertools import chain
from PIL import Image


import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import glob
import cv2
import os

In [2]:
def subtract_time(subdir_time, table_time):
        subdir_time = subdir_time.split(" ")[1]
        table_time  = table_time.split(" ")[1]
        
        if subdir_time == table_time:
                return 0
        
        subdir_time = subdir_time[:4] + "20" + subdir_time[4:]
        table_time  = table_time[:4] + "20" + table_time[4:]

        sub_convert = datetime.strptime(subdir_time  , "%m%d%Y")
        table_convert = datetime.strptime(table_time, "%m%d%Y")
        
        return (table_convert - sub_convert).days

In [3]:
def analyze_raw_data():
        raw_data = pd.DataFrame(columns=["dir_name", "subdir_num", "table_num", "time_delta"])
        all_id_dir = [ i for i in glob.iglob("Data/*") ]
        Item = namedtuple("Item", "dir_name, subdir_num, table_num, time_delta")
        for id_dir in tqdm(all_id_dir, desc="Raw Data Table"):
                sub_dir = [ i for i in glob.iglob("%s/*"     % id_dir ) if os.path.isdir(i)    ]
                table   = [ i for i in glob.iglob("%s/*"     % id_dir ) if not os.path.isdir(i) and not i.endswith(".json")]
                
                diff_time = None 
                if table:
                        sub_dir_name, table_name = sub_dir[0], table[0]
                        sub_dir_time   = sub_dir_name.split("\\")[2]
                        table_time     = table_name.split("\\")[2]
                        diff_time = subtract_time(sub_dir_time, table_time)
                
                data = Item(id_dir, len(sub_dir), len(table), diff_time)
                data = data._asdict()
                raw_data = raw_data.append(data, ignore_index=True)
        return raw_data

def check_id_consistence():
        all_id_dir = [ i for i in glob.iglob("Data/*") ]
        for id_dir in all_id_dir:
                sub_dir = [ i for i in glob.iglob("%s/*"     % id_dir ) if os.path.isdir(i)    ]
                table   = [ i for i in glob.iglob("%s/*"     % id_dir ) if not os.path.isdir(i)]
                if len(sub_dir) == 1 and len(table) == 1:
                        sub_dir_name = sub_dir[0][sub_dir[0].rfind("\\")+1:]
                        table_name = table[0][table[0].rfind("\\")+1:]

                        id_dir_name = id_dir[id_dir.rfind("\\")+1:]
                        sub_dir_name = sub_dir_name.split(" ")[0]
                        table_name   = table_name.split(" ")[0]

                        if not id_dir_name == sub_dir_name == table_name:
                                print(id_dir_name, sub_dir_name, table_name)

def resolution_analyze():
        resolution = pd.DataFrame(columns=["dir_name", "width_upper", "height_upper"])
        all_id_dir = [ i for i in glob.iglob("Data/*") ]
        Item = namedtuple("Item", "dir_name, width_upper, height_upper")
        for id_dir in tqdm(all_id_dir, desc="Resolution Data Table"):
                images  = [ Image.open(i).size for i in glob.iglob("%s/*/*"     % id_dir)  ]

                bins = np.arange(0, 3000, 100)
                digitized = np.digitize(images, bins)

                unique_w, counts_w = np.unique(digitized[:, 0], return_counts=True)
                unique_h, counts_h = np.unique(digitized[:, 1], return_counts=True)

                h_freq = np.asarray((unique_h, counts_h)).T
                w_freq = np.asarray((unique_w, counts_w)).T

                sort_h_freq = h_freq[np.argsort(h_freq[:,1])]
                sort_w_freq = w_freq[np.argsort(w_freq[:,1])]

                w_upper = max(sort_h_freq[-2:, 0])* 100
                h_upper = max(sort_w_freq[-2:, 0])* 100


                data = Item(id_dir, w_upper, h_upper)
                data = data._asdict()
                resolution = resolution.append(data, ignore_index=True)
        return resolution


In [4]:
def find_table_start_point(index_of_240):
        start_x, start_y = 0, 0

        i = 0
        max_threshold, min_threshold = 80, 40
        total_len = len(index_of_240[0])
        while i < total_len:
                sequence_len = 1
                while i + sequence_len < total_len:
                        if index_of_240[1][i] + sequence_len != index_of_240[1][i+sequence_len]:
                            break
                        sequence_len += 1

                if  max_threshold >= sequence_len >= min_threshold:
                        start_x = index_of_240[0][i]
                        # start_y = index_of_240[1][i]
                        break

                i += sequence_len

        index = index_of_240[0].tolist().index(start_x)
        while True:
                if  index_of_240[1][index] + 1 == index_of_240[1][ index + 1 ] and index_of_240[1][index] + 2 == index_of_240[1][ index + 2 ] and \
                        index_of_240[1][index] + 3 == index_of_240[1][ index + 3 ] and index_of_240[1][index] + 4 == index_of_240[1][ index + 4 ] and \
                        index_of_240[1][index] + 5 == index_of_240[1][ index + 5 ] and index_of_240[1][index] + 6 == index_of_240[1][ index + 6 ]:

                        start_y = index_of_240[1][index]
                        break
                index += 1

        return (start_x, start_y)

def get_subseq_pair(index_of_240, start_value, direction):
        start_end_pair = []
        all_index = None
        if direction == 'x':
                all_index = [ j for i, j in zip(index_of_240[0], index_of_240[1]) if i == start_value ]
        if direction == 'y':
                all_index = [ i for i, j in zip(index_of_240[0], index_of_240[1]) if j == start_value ]

        start = end = 0 # start index and end index
        total_len = len(all_index)
        i = 0
        while i < total_len:
                if ( (i+1) < total_len and (all_index[i] + 1) != all_index[i+1]) or (i+1 == total_len):
                        end = i
                        distance = all_index[end] - all_index[start]
                        if distance > 7:
                            start_end_pair.append((all_index[start], all_index[end]))
                        start = end = i+1
                i+=1

        return start_end_pair

def points_to_cell(point_row_list, point_col_list):
        row = 0
        x = [ int(i) for i in point_row_list ]
        y = [ int(i) for i in point_col_list ]

        total_row = len(point_row_list)
        table_cells = []
        while row < total_row:
                col = 0
                while col + 1 < 32:
                        position = {
                            "location": (col//2, row//2),
                            "corner": [
                                (x[row]  , y[col]  ), # upper-left  
                                (x[row]  , y[col+1]), # upper-right
                                (x[row+1], y[col]),   # lower-left
                                (x[row+1], y[col+1])  # lower-right
                            ]
                        }
                        table_cells.append(position)
                        col += 2
                row += 2
        return table_cells
    
def get_table_cell_location(image_path):
        
        image = cv2.imread(image_path, 0)
        index_of_240 = np.where(image==240)
        start_point = find_table_start_point(index_of_240)


        point_column     = get_subseq_pair(index_of_240, start_point[0], 'x') # left -> right
        point_row        = get_subseq_pair(index_of_240, start_point[1], 'y') # top -> down    
        point_column_second = get_subseq_pair(index_of_240, point_row[1][0], 'x')

        point_column_list = [ i for i in list(chain(*point_column)) if i >= start_point[1] ]
        point_row_list = [ i for i in list(chain(*point_row)) if i >= start_point[0] ]
        point_column_second_list = [ i for i in list(chain(*point_column_second)) if i >= start_point[1] ]
        table_cells = points_to_cell(point_row_list, point_column_second_list)

        image = np.where(image==240, 255, 0)
        image_color = cv2.cvtColor(image.astype(np.uint8), cv2.COLOR_GRAY2RGB)
        for cell in table_cells:
            for point in cell["corner"]:
                 image_color[point] = [0, 0, 255]
        
        return table_cells


In [5]:
def statistic_cell_size(image_path ,table_situation, cell_dict):
        
        heights, widths = [], []
        for item in cell_dict:
                width = item["corner"][1][1] - item["corner"][0][1]
                widths.append(width)

                height = item["corner"][2][0] - item["corner"][0][0]
                heights.append(height)

        width_freq = np.array(np.unique(widths, return_counts=True)).T
        height_freq = np.array(np.unique(heights, return_counts=True)).T
        cell_same_size = True if len(height_freq) == 1 else False

        item_info = {
            "dir_name": image_path[:image_path.rfind("\\")],
            "cell_width" : width_freq[0, 0],
            "cell_height": height_freq[0, 0],
            "cell_same_size": cell_same_size
        }

        table_situation = table_situation.append(item_info, ignore_index=True)
        return table_situation

In [6]:
raw_data_table = analyze_raw_data()
resolution = resolution_analyze()

HBox(children=(IntProgress(value=0, description='Raw Data Table', max=330, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Resolution Data Table', max=330, style=ProgressStyle(descript…




In [7]:
raw_data_table

Unnamed: 0,dir_name,subdir_num,table_num,time_delta
0,Data\000408,1,1,25
1,Data\000411,1,1,0
2,Data\001742,1,1,3
3,Data\002456,1,1,98
4,Data\002555,1,1,0
...,...,...,...,...
325,Data\S469585_1,1,1,33
326,Data\S469585_2,1,1,4
327,Data\S594966_1,1,1,21
328,Data\S594966_2,1,1,-4


In [8]:
resolution

Unnamed: 0,dir_name,width_upper,height_upper
0,Data\000408,800,900
1,Data\000411,800,900
2,Data\001742,800,800
3,Data\002456,800,900
4,Data\002555,800,900
...,...,...,...
325,Data\S469585_1,800,900
326,Data\S469585_2,800,900
327,Data\S594966_1,800,800
328,Data\S594966_2,800,900


In [9]:
table_situation = pd.DataFrame(columns=["dir_name", "cell_width", "cell_height", "cell_same_size"])
files = [ i for i in glob.iglob("Data/*/*.png") ]
for image_path in tqdm(files):
        table_cell = get_table_cell_location(image_path)
        table_situation = statistic_cell_size(image_path, table_situation, table_cell)
        
        
        if len(table_cell) != 512:
                print("%s don't have correct number of table cell" % files)
                raise ValueError

HBox(children=(IntProgress(value=0, max=299), HTML(value='')))




In [10]:
table_situation

Unnamed: 0,dir_name,cell_width,cell_height,cell_same_size
0,Data\000408,23,9,False
1,Data\000411,58,11,False
2,Data\001742,61,13,True
3,Data\004151,77,25,True
4,Data\004359,35,13,True
...,...,...,...,...
294,Data\S469585_1,61,13,True
295,Data\S469585_2,61,13,True
296,Data\S594966_1,61,13,True
297,Data\S594966_2,61,13,True


In [11]:
overview_table = pd.merge(left=raw_data_table, right=resolution, left_on='dir_name', right_on='dir_name', how='outer')
overview_table = pd.merge(left=overview_table, right=table_situation, left_on='dir_name', right_on='dir_name', how='outer')

In [12]:
invalid = overview_table[overview_table.isnull().any(axis=1)].index
invalid
overview_table

Unnamed: 0,dir_name,subdir_num,table_num,time_delta,width_upper,height_upper,cell_width,cell_height,cell_same_size
0,Data\000408,1,1,25,800,900,23,9,False
1,Data\000411,1,1,0,800,900,58,11,False
2,Data\001742,1,1,3,800,800,61,13,True
3,Data\002456,1,1,98,800,900,,,
4,Data\002555,1,1,0,800,900,,,
...,...,...,...,...,...,...,...,...,...
325,Data\S469585_1,1,1,33,800,900,61,13,True
326,Data\S469585_2,1,1,4,800,900,61,13,True
327,Data\S594966_1,1,1,21,800,800,61,13,True
328,Data\S594966_2,1,1,-4,800,900,61,13,True


In [13]:
is_valids = []
invalid_messages = []

for index, row in overview_table.iterrows():
        is_valid = True
        invalid_message = ""
        
        if np.isnan(row["cell_width"]) or not 50 < row["cell_width"] < 70 or not 10 < row["cell_height"] < 20 or row["height_upper"] > 900 or row["width_upper"] > 1000 :
                is_valid = False
                
                if row["table_num"] == 0:
                        invalid_message = "no table image"
                        is_valids.append(is_valid)
                        invalid_messages.append(invalid_message)
                        continue
                        
                if np.isnan(row["cell_width"]):
                        invalid_message = "table image is jpg format"
                        is_valids.append(is_valid)
                        invalid_messages.append(invalid_message)
                        continue
                
                if row["height_upper"] > 900 or row["width_upper"] > 1000:
                        invalid_message += "tooth image is high-resolution "
                        
                if not 50 < row["cell_width"] < 70 or not 10 < row["cell_height"] < 20:
                        invalid_message += "table cell is unhandled size"
                
                
                
        is_valids.append(is_valid)
        invalid_messages.append(invalid_message)

overview_table["Valid"] = is_valids
overview_table["Note"]  = invalid_messages

In [14]:
overview_table

Unnamed: 0,dir_name,subdir_num,table_num,time_delta,width_upper,height_upper,cell_width,cell_height,cell_same_size,Valid,Note
0,Data\000408,1,1,25,800,900,23,9,False,False,table cell is unhandled size
1,Data\000411,1,1,0,800,900,58,11,False,True,
2,Data\001742,1,1,3,800,800,61,13,True,True,
3,Data\002456,1,1,98,800,900,,,,False,table image is jpg format
4,Data\002555,1,1,0,800,900,,,,False,table image is jpg format
...,...,...,...,...,...,...,...,...,...,...,...
325,Data\S469585_1,1,1,33,800,900,61,13,True,True,
326,Data\S469585_2,1,1,4,800,900,61,13,True,True,
327,Data\S594966_1,1,1,21,800,800,61,13,True,True,
328,Data\S594966_2,1,1,-4,800,900,61,13,True,True,


In [15]:
overview_table.to_csv("Overview.csv", index=False, sep=",")