nladuo · nladuo · Apr 17, 2017 · Apr 17, 2017
diff --git a/weibo.cn/spliter/__init__.py b/weibo.cn/spliter/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding:utf-8 -*-
+#!/usr/bin/env python3
+
+"""
+
+"""
diff --git a/weibo.cn/spliter/__main__.py b/weibo.cn/spliter/__main__.py
@@ -0,0 +1,29 @@
+# -*- coding:utf-8 -*-
+#!/usr/bin/env python3
+
+"""
+
+"""
+import os
+#os.path.abspath(os.path.abspath(os.path.dirname(__file__)))
+
+from spliter import *
+
+
+def cli():
+    """Split every PNG captcha from the downloader into letter tiles.
+
+    Lists ``../downloader/captchas`` (relative to this file), keeps the
+    entries whose extension is ``.png`` in any letter case, prints each
+    full path and passes it to ``Spliter.split_and_save``.  The splitter is
+    created with the ``dataset`` directory next to this file as its
+    output directory.
+    """
+    here = os.path.dirname(__file__)
+    captcha_dir = os.path.abspath(
+        os.path.join(here, '..', 'downloader', 'captchas'))
+    png_names = [name for name in os.listdir(captcha_dir)
+                 if os.path.splitext(name)[1].lower() == '.png']
+
+    worker = Spliter(os.path.join(here, "dataset"))
+    for name in png_names:
+        captcha_path = os.path.join(captcha_dir, name)
+        print(captcha_path)
+        worker.split_and_save(captcha_path)
+
+
+# Run the splitter only when the package is executed, not when imported.
+if __name__ == '__main__':
+    cli()
diff --git a/weibo.cn/spliter/captcha_utils.py b/weibo.cn/spliter/captcha_utils.py
@@ -0,0 +1,102 @@
+# -*- coding:utf-8 -*-
+#!/usr/bin/env python3
+
+"""
+
+"""
+from collections import namedtuple
+
+# Immutable pixel coordinate.  ``x`` is the column index and ``y`` the row
+# index: the helpers in this module read pixels as image[point.y][point.x].
+Point = namedtuple('Point', ['x', 'y'])
+
+import numpy as np
+
+def has_tranversed_the_point(x, y, tranversed_points:list):
+    """Tell whether the coordinate (x, y) already appears in the list.
+
+    :param x: column index to look for.
+    :param y: row index to look for.
+    :param tranversed_points: objects exposing ``.x`` and ``.y``.
+    :return: ``True`` if some entry has both coordinates equal, else ``False``.
+    """
+    return any(x == seen.x and y == seen.y for seen in tranversed_points)
+
+def find_connection_area(now_point, image:np.array, area:list, tranversed_points:list):
+    """Collect the 8-connected region of zero-valued pixels around a seed.
+
+    Starting at ``now_point`` the search spreads to the eight neighbours of
+    every pixel whose value is ``0``.  Each pixel reached is appended to both
+    ``area`` and ``tranversed_points``; pixels already present in
+    ``tranversed_points`` are never visited again.
+
+    The walk uses an explicit stack instead of recursion, so a large region
+    cannot exhaust the interpreter's recursion limit.  Pixels are still
+    visited in the order the recursive version used (up, down, left, right,
+    upper-left, lower-left, upper-right, lower-right, depth first).
+
+    :param now_point: seed with ``.x`` (column) and ``.y`` (row); new points
+        are built with ``type(now_point)(x, y)``.
+    :param image: 2-D array indexed as ``image[y][x]``.
+    :param area: output list receiving the points of this region.
+    :param tranversed_points: list of points visited so far, shared between
+        calls; it is extended in place.
+    """
+    height, width = image.shape[0], image.shape[1]
+    make_point = type(now_point)
+
+    # Set mirror of the shared list: O(1) membership tests instead of a scan.
+    visited = {(p.x, p.y) for p in tranversed_points}
+
+    # Reverse of the visiting order, because the stack pops the last push first.
+    offsets = ((1, 1), (1, -1), (-1, 1), (-1, -1),
+               (1, 0), (-1, 0), (0, 1), (0, -1))
+
+    pending = [now_point]
+    while pending:
+        point = pending.pop()
+        if not (0 <= point.x < width and 0 <= point.y < height):
+            continue
+        if image[point.y][point.x] != 0:
+            continue
+        if (point.x, point.y) in visited:
+            continue
+
+        visited.add((point.x, point.y))
+        area.append(point)
+        tranversed_points.append(point)
+
+        for dx, dy in offsets:
+            pending.append(make_point(point.x + dx, point.y + dy))
+
+class CaptchaUtils:
+    """Stateless helpers for cleaning and segmenting binarised captchas.
+
+    Both helpers are classmethods and index the image as ``image[row][col]``,
+    treating the value ``0`` as ink and writing ``255`` to erase a pixel.
+    """
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def clear_peper_noise(cls, image, max_adhesion_count):
+        """Erase small isolated specks (pepper noise) from ``image`` in place.
+
+        Every 8-connected region of zero-valued pixels is collected with
+        ``find_connection_area``; regions containing at most
+        ``max_adhesion_count`` pixels are overwritten with ``255``.
+
+        :param image: 2-D array, modified in place.
+        :param max_adhesion_count: largest region size still treated as noise.
+        """
+        areas = []
+        tranversed_points = []
+        for i in range(image.shape[1]):
+            for j in range(image.shape[0]):
+                if image[j][i] == 0 and not has_tranversed_the_point(i, j, tranversed_points):
+                    area = []
+                    find_connection_area(Point(i, j), image, area, tranversed_points)
+                    areas.append(area)
+
+        # Clean the noise only after all regions are known, so erasing one
+        # region cannot influence how another one is measured.
+        for area in areas:
+            if len(area) <= max_adhesion_count:
+                for point in area:
+                    image[point.y][point.x] = 255
+
+    @classmethod
+    def vertical_project(cls, image:np.array, splits:list):
+        """Locate letter boundaries from the per-column ink histogram.
+
+        For each column the zero-valued pixels are counted, ignoring the first
+        and the last row.  Columns are then scanned from left to right and
+        every column where the count crosses the threshold (``<= 1`` to
+        ``> 1`` or back) is written into ``splits``: even slots receive the
+        start of a letter, odd slots its end.  An end candidate that lies
+        8 columns or less after the recorded start is ignored, so very
+        narrow fragments do not close a letter.
+
+        The scan stops after 8 boundaries, at column 100, or at the right
+        edge of the image, whichever comes first.  Bounding by the real
+        width avoids an ``IndexError`` on images narrower than 100 columns.
+
+        :param image: 2-D array indexed as ``image[row][col]``.
+        :param splits: list with at least 8 slots, filled in place; slots
+            for boundaries that were not found keep their previous value.
+        """
+        max_boundaries = 8
+        max_scan_column = 100
+        min_letter_gap = 8
+        threshold = 1
+
+        # Ink pixels per column, excluding the top and bottom rows.
+        project = np.count_nonzero(np.asarray(image)[1:-1] == 0, axis=0)
+
+        index = 0
+        for i in range(1, min(len(project), max_scan_column)):
+            if index >= max_boundaries:
+                break
+            crossed = (project[i] > threshold) != (project[i-1] > threshold)
+            if not crossed:
+                continue
+            closing_letter = index % 2 == 1
+            if closing_letter and (i - splits[index-1]) <= min_letter_gap:
+                continue
+            splits[index] = i
+            index += 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/weibo.cn/spliter/requirements.txt b/weibo.cn/spliter/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+opencv-python
diff --git a/weibo.cn/spliter/spliter.py b/weibo.cn/spliter/spliter.py
@@ -0,0 +1,204 @@
+# -*- coding:utf-8 -*-
+#!/usr/bin/env python3
+
+"""
+
+"""
+import uuid
+import os
+
+import cv2
+import numpy as np
+
+from captcha_utils import CaptchaUtils, Point
+
+__all__ = ["Spliter"]
+
+class Spliter:
+    """Cut captcha images into single-letter tiles and store them as PNGs.
+
+    A captcha is de-noised, binarised, divided into up to four letters along
+    the column histogram and every letter is centred on a white
+    ``WIDTH_STANDARD`` x ``HEIGHT_STANDRAD`` tile saved under ``save_dir``.
+    """
+
+    HEIGHT_STANDRAD = 32  # tile height in pixels (spelling kept for callers)
+    WIDTH_STANDARD = 32   # tile width in pixels
+
+    def __init__(self, save_dir):
+        """Remember the output directory, creating it when it is missing.
+
+        :param save_dir: directory that will receive the letter tiles.
+        """
+        self.save_dir = save_dir
+        os.makedirs(save_dir, exist_ok=True)
+
+    def split_letters(self, filename, letters:list):
+        """Load a captcha and store its letter images into ``letters``.
+
+        :param filename: path of the captcha image.
+        :param letters: list with at least 4 slots; slot ``k`` receives a copy
+            of the columns between boundary ``2k`` and ``2k + 1``.  The list is
+            left untouched when the file cannot be decoded or when the last
+            boundary falls outside the image.
+        """
+        image = cv2.imread(filename, cv2.IMREAD_COLOR)
+        if image is None:
+            # cv2.imread signals an unreadable or non-image file with None.
+            return
+
+        image = self.clear_noise(image)
+        splits = [0]*8
+        CaptchaUtils.vertical_project(image, splits)
+        if splits[7] < image.shape[1]:  # the final split must lie inside the image
+            for i in range(0, 8, 2):
+                letters[i//2] = image[:, splits[i]:splits[i+1]].copy()
+
+    def split_and_save(self, filename):
+        """Split the captcha at ``filename`` and save every letter found.
+
+        :param filename: path of the captcha image.
+        """
+        letters = [None]*4
+        self.split_letters(filename, letters)
+        for every_letter in letters:
+            if every_letter is None:
+                # Slot never filled: nothing to write for this position.
+                continue
+            self.save_image(every_letter)
+
+    def clear_noise(self, image):
+        """Remove the interference line and specks, return a binary image.
+
+        The horizontal-line tracer starts from column 0, so it is run once on
+        the image flipped around both axes and once on the restored image.
+        Near-grey pixels are then darkened with ``clear_color``, the image is
+        converted to greyscale, thresholded at 150 and regions of at most two
+        pixels are erased.
+
+        :param image: BGR image as returned by ``cv2.imread``.
+        :return: single-channel image after thresholding and speck removal.
+        """
+        image = cv2.flip(image, -1)
+        clear_horizontal_noise_line(image)
+        image = cv2.flip(image, -1)
+        clear_horizontal_noise_line(image)
+        clear_color(image)
+
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        image = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)[1]
+        CaptchaUtils.clear_peper_noise(image, 2)
+
+        return image
+
+    def save_image(self, splited_image:np.array):
+        """Centre one letter on a white tile and write it as a PNG.
+
+        Nothing is written when the argument is not an array, when it is
+        wider than ``WIDTH_STANDARD`` or when either dimension is empty.
+        The file name is a random UUID inside ``save_dir``.
+
+        :param splited_image: single-letter image indexed ``[row][col]``.
+        """
+        if not isinstance(splited_image, np.ndarray):
+            # Placeholder left by a failed split (for example 0 or None).
+            return
+        if splited_image.shape[1] > Spliter.WIDTH_STANDARD:
+            return
+        if splited_image.shape[0] <= 0 or splited_image.shape[1] <= 0:
+            return
+
+        out_width = Spliter.WIDTH_STANDARD
+        out_height = Spliter.HEIGHT_STANDRAD
+        offset_x = abs(out_width - splited_image.shape[1])/2
+        offset_y = abs(out_height - splited_image.shape[0])/2
+
+        # Pure translation that moves the letter to the middle of the tile.
+        TransMat = np.array([[1, 0, offset_x],
+                             [0, 1, offset_y]])
+
+        # warpAffine expects the output size as (width, height).
+        new_image = cv2.warpAffine(splited_image, TransMat,
+                                   (out_width, out_height),
+                                   borderValue=255)
+
+        path = os.path.join(self.save_dir, str(uuid.uuid4()))+'.png'
+        cv2.imwrite(path, new_image)
+
+def is_black(i, j, image):
+    """Tell whether the pixel in column ``i``, row ``j`` is a non-white grey.
+
+    A pixel qualifies when its third channel (red, for the BGR images this
+    module loads) is below 244 and each of its three channels lies within
+    4 of their mean.
+
+    :param i: column index.
+    :param j: row index.
+    :param image: array indexed as ``image[row, col]`` with 3 channels.
+    :return: ``True`` or ``False``.
+    """
+    pixel = image[j, i]
+    blue, green, red = pixel[0], pixel[1], pixel[2]
+    mean = (int(red) + int(green) + int(blue))/3
+    if not red < 244:
+        return False
+    near_grey = all(abs(mean - channel) < 4 for channel in (blue, green, red))
+    return bool(near_grey)
+
+def clear_color(image):
+    """Darken every pixel accepted by ``is_black`` to the value 20, in place.
+
+    The first three channels of each qualifying pixel are overwritten;
+    all other pixels are left as they are.
+
+    :param image: array indexed as ``image[row][col][channel]``.
+    """
+    row_count, col_count = image.shape[0], image.shape[1]
+    for col in range(col_count):
+        for row in range(row_count):
+            if not is_black(col, row, image):
+                continue
+            for channel in (0, 1, 2):
+                image[row][col][channel] = 20
+
+def get_horizontal_noise_line_width(image, now_height, now_width):
+    """Measure the run of near-black pixels starting at a given position.
+
+    Walks to the right from column ``now_width`` in row ``now_height`` while
+    all three channels of the current pixel are below 12, and returns how
+    many pixels were accepted.
+
+    The pixel tested is the one under the moving cursor.  The previous
+    version kept re-testing the start pixel, so any run was reported as
+    reaching the right edge of the image.
+
+    :param image: array indexed as ``image[row][col][channel]``.
+    :param now_height: row index.  A row at or past the bottom edge yields 0
+        instead of an ``IndexError``; negative values are handed to NumPy
+        unchanged, as before.
+    :param now_width: column index where the run starts.
+    :return: length of the run; 0 when the start pixel is not near-black or
+        the start column is at or past the right edge.
+    """
+    if now_height >= image.shape[0]:
+        return 0
+
+    row = image[now_height]
+    end_width = now_width
+    while end_width < image.shape[1]:
+        pixel = row[end_width]
+        if not (pixel[0] < 12 and pixel[1] < 12 and pixel[2] < 12):
+            break
+        end_width += 1
+
+    return end_width - now_width
+
+def clear_horizontal_noise_line(image):
+    """Trace a dark line that starts at the left edge and paint it white.
+
+    The function works in place on ``image`` (indexed as
+    ``image[row][col][channel]``) and returns ``None``.
+
+    1. Every row is checked for a start: the pixel in column 0 must have all
+       three channels below 12 and ``get_horizontal_noise_line_width`` must
+       report at least 2 for it.  The loop does not stop at the first hit,
+       so the last matching row is the one kept.
+    2. From there the line is followed to the right, one horizontal run at a
+       time.  Pixels of a run are set to 255 unless ``is_black`` neighbours
+       exist both in the row above and in the row below.
+       NOTE(review): this presumably spares letter strokes that cross the
+       line; confirm.
+    3. The next run is looked up in the rows directly above and below, at the
+       last column of the current run and at the column after it.  Tracing
+       ends when no candidate is at least 2 wide.
+
+    :param image: 3-channel image, modified in place.
+    """
+    first_height = 0
+    has_find = False
+    for i in range(image.shape[0]):
+        if image[i][0][0] < 12 and image[i][0][1] < 12 and image[i][0][2] < 12 \
+                and get_horizontal_noise_line_width(image, i, 0) >= 2:
+            first_height = i
+            has_find = True
+
+    if not has_find: return
+    now_width = 0
+    # NOTE(review): tracing starts two rows above the detected row; the reason
+    # for this offset is not visible here, and it is negative when
+    # first_height is 0 or 1 -- confirm.
+    now_height = first_height-2
+    while now_width < image.shape[1]:
+        #print((now_height, now_width), image.shape)
+        width = get_horizontal_noise_line_width(image, now_height, now_width)
+
+        #clear the horizontal noise line
+        # The range stops at now_width+width-1, so the last pixel of the run
+        # is not processed in this pass.
+        for i in range(now_width, now_width+width-1):
+            top_num = 0
+            bottom_num = 0
+            # NOTE(review): the neighbour lookups below are not bounds-checked;
+            # i-1 or now_height-1 can be -1 and i+1 or now_height+1 can equal
+            # the image size -- confirm this is acceptable.
+            #the upper pixel
+            if is_black(i, now_height-1, image): top_num += 1
+            #the upper left pixel
+            if is_black(i-1, now_height-1, image): top_num += 1
+            #the upper right pixel
+            if is_black(i+1, now_height-1, image): top_num += 1
+            #the lower pixel
+            if is_black(i, now_height+1, image): bottom_num += 1
+            #the left lower pixel
+            if is_black(i-1, now_height+1, image): bottom_num += 1
+            # the right lower pixel
+            if is_black(i+1, now_height+1, image): bottom_num += 1
+
+            # Keep the pixel when it has black neighbours on both sides.
+            # NOTE(review): the second comparison uses image.shape[0], while
+            # the bottom-edge test further down uses image.shape[0] - 1 --
+            # confirm which one is intended.
+            if now_height != 0 and now_height != image.shape[0]:
+                if top_num>0 and bottom_num>0: continue
+
+            image[now_height][i][0] = 255
+            image[now_height][i][1] = 255
+            image[now_height][i][2] = 255
+
+        # find the next noise pixel
+        # a / b: run width in the row above / below, measured from the last
+        # column of the current run; c / d: the same, one column further right.
+        a = get_horizontal_noise_line_width(image, now_height - 1, now_width + width -1)
+        b = get_horizontal_noise_line_width(image, now_height + 1, now_width + width -1)
+        c = get_horizontal_noise_line_width(image, now_height - 1, now_width + width)
+        d = get_horizontal_noise_line_width(image, now_height + 1, now_width + width)
+        # Candidates outside the image are discarded at the top and bottom rows.
+        if now_height == 0:
+            a=0
+            c=0
+
+        if now_height == (image.shape[0] - 1):
+            b=0
+            d=0
+
+        max_a_b = max(a, b)
+        max_c_d = max(c, d)
+        max_a_b_c_d = max(max_a_b, max_c_d)
+        # A continuation must be at least 2 pixels wide, otherwise stop.
+        if max_a_b_c_d < 2: break
+        # On ties the candidates at the last column of the run (a, b) win
+        # over those one column further (c, d), and "up" wins over "down".
+        if max_a_b == max_a_b_c_d:
+            now_width += width-1
+            if max_a_b == a:
+                now_height -= 1
+            else:
+                now_height += 1
+
+        else:
+            now_width += width
+            if max_c_d == c:
+                now_height -= 1
+            else:
+                now_height += 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+