Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add spliter python-impl #5

Merged
merged 1 commit into from Apr 17, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions weibo.cn/spliter/__init__.py
@@ -0,0 +1,6 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
Package for splitting weibo.cn captcha images into single-letter images.
"""
29 changes: 29 additions & 0 deletions weibo.cn/spliter/__main__.py
@@ -0,0 +1,29 @@
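#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
Command-line entry point: split the downloaded captcha PNG files into
single-letter images stored in the ``dataset`` directory.
"""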
import os
#os.path.abspath(os.path.abspath(os.path.dirname(__file__)))

from spliter import *


def cli():
    """Split every downloaded captcha PNG into letter images.

    File names ending in ``.png`` (compared case-insensitively) are taken
    from ``../downloader/captchas`` relative to this file.  Each one is
    printed and handed to a ``Spliter`` that writes into the ``dataset``
    directory next to this file.
    """
    here = os.path.dirname(__file__)
    captcha_dir = os.path.abspath(
        os.path.join(here, '..', 'downloader', 'captchas'))
    png_names = (
        name for name in os.listdir(captcha_dir)
        if os.path.splitext(name)[1].lower() == '.png'
    )

    ispliter = Spliter(os.path.join(here, "dataset"))
    for name in png_names:
        captcha_path = os.path.join(captcha_dir, name)
        print(captcha_path)
        ispliter.split_and_save(captcha_path)


if __name__ == '__main__':
    cli()
102 changes: 102 additions & 0 deletions weibo.cn/spliter/captcha_utils.py
@@ -0,0 +1,102 @@
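#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
Pixel-level helpers for captcha images: connected-area search, removal of
small speckles and vertical projection of the dark pixels.
"""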
from collections import namedtuple

# Pixel coordinate; images are indexed as image[point.y][point.x].
Point = namedtuple('Point', 'x y')

import numpy as np

def has_tranversed_the_point(x, y, tranversed_points:list):
    """Tell whether the pixel ``(x, y)`` is among the already visited points.

    :param x: column of the pixel to look up.
    :param y: row of the pixel to look up.
    :param tranversed_points: visited points, each exposing ``x`` and ``y``.
    :return: ``True`` when a visited point has both coordinates equal,
        ``False`` otherwise (also for an empty collection).
    """
    return any(x == seen.x and y == seen.y for seen in tranversed_points)

def find_connection_area(now_point, image:np.array, area:list, tranversed_points:list):
    """Collect the 8-connected region of zero-valued pixels containing a seed.

    Starting from ``now_point`` the search spreads to the four edge
    neighbours and the four diagonal neighbours.  Every reached pixel that
    lies inside ``image``, has the value ``0`` and is not yet listed in
    ``tranversed_points`` is appended to both ``area`` and
    ``tranversed_points`` in depth-first, first-visit order.

    The traversal uses an explicit stack instead of recursion, so regions
    deeper than the interpreter's recursion limit no longer raise
    ``RecursionError``.  Visited coordinates are mirrored in a set so that a
    membership test does not scan ``tranversed_points``.

    :param now_point: seed pixel, an object with ``x`` (column) and ``y`` (row).
    :param image: array indexed as ``image[y][x]``; ``image.shape[0]`` is the
        number of rows and ``image.shape[1]`` the number of columns.
    :param area: output list, extended in place with the pixels of the region.
    :param tranversed_points: visited pixels shared between calls, extended
        in place.
    :return: ``None``.
    """
    # Neighbour offsets (dx, dy) in visiting order: up, down, left, right,
    # upper-left, lower-left, upper-right, lower-right.
    offsets = ((0, -1), (0, 1), (-1, 0), (1, 0),
               (-1, -1), (-1, 1), (1, -1), (1, 1))
    height, width = image.shape[0], image.shape[1]
    seen = {(point.x, point.y) for point in tranversed_points}

    pending = [now_point]
    while pending:
        point = pending.pop()
        x, y = point.x, point.y
        if x < 0 or x >= width or y < 0 or y >= height:
            continue
        if image[y][x] != 0:
            continue
        if (x, y) in seen:
            continue

        seen.add((x, y))
        area.append(point)
        tranversed_points.append(point)

        # Push in reverse so the stack pops the neighbours in visiting order,
        # which reproduces the order of the recursive formulation.
        for dx, dy in reversed(offsets):
            pending.append(Point(x + dx, y + dy))

class CaptchaUtils:
    """Helpers that clean and segment single-channel captcha images.

    Both helpers are class methods, work on arrays indexed as
    ``image[row][column]`` and treat the value ``0`` as a dark pixel.
    """

    # vertical_project never scans beyond this many columns.
    _MAX_SCAN_COLUMNS = 100
    # vertical_project never fills more than this many split slots.
    _MAX_SPLITS = 8
    # A column counts as occupied when it has more dark pixels than this.
    _PROJECTION_THRESHOLD = 1
    # A segment is not closed while it is at most this many columns wide.
    _MIN_LETTER_WIDTH = 8

    def __init__(self):
        pass

    @classmethod
    def clear_peper_noise(cls, image, max_adhesion_count):
        """Erase small connected groups of dark pixels in place.

        The image is scanned column by column.  Every 8-connected region of
        zero-valued pixels is collected with ``find_connection_area``, and the
        pixels of each region containing at most ``max_adhesion_count`` pixels
        are set to ``255``.

        :param image: array indexed as ``image[row][column]``; modified in place.
        :param max_adhesion_count: largest region size that is still erased.
        :return: ``None``.
        """
        areas = []
        tranversed_points = []
        # Mirror of tranversed_points for O(1) lookups; find_connection_area
        # appends the same points to `area` and to `tranversed_points`.
        visited = set()
        for i in range(image.shape[1]):
            for j in range(image.shape[0]):
                if image[j][i] != 0 or (i, j) in visited:
                    continue
                area = []
                find_connection_area(Point(i, j), image, area, tranversed_points)
                visited.update((point.x, point.y) for point in area)
                areas.append(area)

        # clean the noises
        for area in areas:
            if len(area) <= max_adhesion_count:
                for point in area:
                    image[point.y][point.x] = 255

    @classmethod
    def vertical_project(cls, image:np.array, splits:list):
        """Record the columns where the dark-pixel projection changes state.

        For every column the zero-valued pixels are counted, ignoring the
        first and the last row.  Columns are then scanned from index 1; a
        column index is stored in the next free slot of ``splits`` whenever
        the count crosses the threshold (more than one dark pixel) relative to
        the previous column.  While an odd slot is being filled, crossings at
        most 8 columns after the value in the preceding slot are ignored.

        The scan stops after 8 slots or 100 columns, and additionally at the
        real image width and at ``len(splits)``, so narrow images or short
        lists no longer raise ``IndexError``.  Slots that are not reached keep
        their previous value.

        :param image: array indexed as ``image[row][column]``.
        :param splits: list that receives the column indices in place.
        :return: ``None``.
        """
        # Dark pixels per column, first and last row excluded.
        project = (image[1:-1] == 0).sum(axis=0)
        occupied = project > cls._PROJECTION_THRESHOLD

        column_limit = min(len(project), cls._MAX_SCAN_COLUMNS)
        slot_limit = min(len(splits), cls._MAX_SPLITS)

        index = 0
        for i in range(1, column_limit):
            if index >= slot_limit:
                break
            if occupied[i] == occupied[i - 1]:
                continue
            # Too close to the value of the preceding slot: keep scanning.
            if index % 2 == 1 and i - splits[index - 1] <= cls._MIN_LETTER_WIDTH:
                continue
            splits[index] = i
            index += 1
















2 changes: 2 additions & 0 deletions weibo.cn/spliter/requirements.txt
@@ -0,0 +1,2 @@
numpy
opencv-python
204 changes: 204 additions & 0 deletions weibo.cn/spliter/spliter.py
@@ -0,0 +1,204 @@
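#!/usr/bin/env python3
# -*- coding:utf-8 -*-

"""
Captcha splitter: removes noise from a captcha image, cuts it into letters
and saves every letter as a 32x32 PNG file.
"""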
import uuid
import os

import cv2
import numpy as np

from captcha_utils import CaptchaUtils, Point

__all__ = ["Spliter"]

class Spliter:
    """Cut captcha images into letter tiles and store them as PNG files.

    A captcha is read with OpenCV, cleaned by ``clear_noise``, segmented with
    ``CaptchaUtils.vertical_project`` and every resulting letter is written
    to ``save_dir`` as a ``WIDTH_STANDARD`` x ``HEIGHT_STANDRAD`` image under
    a random UUID file name.
    """

    # Height of the saved tiles (misspelled name kept for existing callers).
    HEIGHT_STANDRAD = 32
    # Correctly spelled alias of HEIGHT_STANDRAD.
    HEIGHT_STANDARD = HEIGHT_STANDRAD
    # Width of the saved tiles; wider letters are not saved.
    WIDTH_STANDARD = 32

    def __init__(self, save_dir):
        """Remember ``save_dir`` and create it if it does not exist yet.

        :param save_dir: directory that receives the letter images.
        """
        self.save_dir = save_dir
        # exist_ok avoids the race between checking for and creating the path.
        os.makedirs(save_dir, exist_ok=True)

    def split_letters(self, filename, letters:list):
        """Fill ``letters`` with the four letter images of a captcha file.

        ``letters`` is only written when the eighth split column found by
        ``CaptchaUtils.vertical_project`` lies inside the image; otherwise it
        is left untouched.

        :param filename: path of the captcha image.
        :param letters: list with at least four slots, filled in place.
        :raises ValueError: if OpenCV cannot read ``filename``.
        :return: ``None``.
        """
        image = cv2.imread(filename, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports failure with None instead of raising.
            raise ValueError("cannot read image file: %s" % (filename,))

        image = self.clear_noise(image)
        splits = [0]*8
        CaptchaUtils.vertical_project(image, splits)
        # assume the final split is in the range of image.
        if splits[7] < image.shape[1]:
            bounds = zip(splits[0::2], splits[1::2])
            for slot, (left, right) in enumerate(bounds):
                letters[slot] = image[0:image.shape[0], left:right].copy()

    def split_and_save(self, filename):
        """Split the captcha in ``filename`` and save each letter found.

        Slots that ``split_letters`` did not fill stay ``None`` and are
        skipped by ``save_image``.

        :param filename: path of the captcha image.
        :return: ``None``.
        """
        letters = [None]*4
        self.split_letters(filename, letters)
        for every_letter in letters:
            self.save_image(every_letter)

    def clear_noise(self, image):
        """Return a cleaned, binarised single-channel copy of a colour captcha.

        The horizontal noise line is cleared on the image flipped around both
        axes and again after flipping back.  Greyish pixels are then
        normalised by ``clear_color``, the image is converted to greyscale and
        thresholded at 150, and dark regions of at most two pixels are erased.

        :param image: colour image as returned by ``cv2.imread``.
        :return: the thresholded single-channel image.
        """
        image = cv2.flip(image, -1)
        clear_horizontal_noise_line(image)
        image = cv2.flip(image, -1)
        clear_horizontal_noise_line(image)
        clear_color(image)

        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY)[1]
        CaptchaUtils.clear_peper_noise(image, 2)

        return image

    @staticmethod
    def _is_savable(letter):
        """Tell whether ``letter`` is a non-empty array no wider than a tile."""
        if not isinstance(letter, np.ndarray) or letter.ndim < 2:
            return False
        height, width = letter.shape[0], letter.shape[1]
        return height > 0 and 0 < width <= Spliter.WIDTH_STANDARD

    def save_image(self, splited_image:np.array):
        """Write one letter into ``save_dir`` as a fixed-size PNG tile.

        Nothing is written when ``splited_image`` is not an array (for
        example an unfilled placeholder), is empty, or is wider than
        ``WIDTH_STANDARD``.  Otherwise the letter is translated by half the
        size difference on each axis into a white (255) canvas and saved under
        a random UUID name.

        :param splited_image: letter image, indexed ``[row][column]``.
        :return: ``None``.
        """
        if not self._is_savable(splited_image):
            return

        out_width = Spliter.WIDTH_STANDARD
        out_height = Spliter.HEIGHT_STANDRAD
        # NOTE(review): because of abs(), an input taller than the tile is
        # shifted down rather than centre-cropped -- confirm this is intended.
        offset_x = abs(out_width - splited_image.shape[1])/2
        offset_y = abs(out_height - splited_image.shape[0])/2

        # 2x3 affine matrix describing a pure translation.
        TransMat = np.array([[1, 0, offset_x],
                             [0, 1, offset_y]], dtype=np.float64)

        # warpAffine expects the output size as (width, height).
        new_image = cv2.warpAffine(splited_image, TransMat,
                                   (out_width, out_height),
                                   borderValue=255)

        path = os.path.join(self.save_dir, str(uuid.uuid4()))+'.png'
        cv2.imwrite(path, new_image)

def is_black(i, j, image):
    """Tell whether the pixel at column ``i``, row ``j`` is a non-white grey.

    The pixel qualifies when its third channel is below 244 and each of its
    first three channels is less than 4 away from their mean.

    :param i: column index.
    :param j: row index.
    :param image: array indexed as ``image[j, i]`` with at least 3 channels.
    :return: ``True`` or ``False``.
    """
    pixel = image[j, i]
    b, g, r = pixel[0], pixel[1], pixel[2]
    average = (int(r) + int(g) + int(b))/3
    if not r < 244:
        return False
    return all(abs(average - channel) < 4 for channel in (b, g, r))

def clear_color(image):
    """Set the first three channels of every ``is_black`` pixel to 20.

    :param image: array indexed as ``image[row][column][channel]``; modified
        in place.
    :return: ``None``.
    """
    columns, rows = image.shape[1], image.shape[0]
    for i in range(columns):
        for j in range(rows):
            if not is_black(i, j, image):
                continue
            for channel in range(3):
                image[j][i][channel] = 20

def get_horizontal_noise_line_width(image, now_height, now_width):
    """Measure the horizontal run of near-black pixels starting at a pixel.

    A pixel counts as near-black when each of its first three channels is
    below 12.  The run starts at column ``now_width`` of row ``now_height``
    and extends to the right until a pixel fails that test or the image ends.

    Each pixel of the run is now tested individually.  Previously only the
    start pixel was tested, so the result was either 0 or the whole remaining
    row width.  Positions outside the image yield 0 instead of wrapping around
    (negative index) or raising ``IndexError``.

    :param image: array indexed as ``image[row][column][channel]``.
    :param now_height: row of the start pixel.
    :param now_width: column of the start pixel.
    :return: number of consecutive near-black pixels, 0 if the start pixel is
        not near-black or lies outside the image.
    """
    rows, columns = image.shape[0], image.shape[1]
    if now_height < 0 or now_height >= rows or now_width < 0:
        return 0

    row = image[now_height]
    end_width = now_width
    while end_width < columns \
            and row[end_width][0] < 12 \
            and row[end_width][1] < 12 \
            and row[end_width][2] < 12:
        end_width += 1

    return end_width - now_width

def clear_horizontal_noise_line(image):
    """Whiten a noise line that starts at the left edge of ``image``, in place.

    A start row is searched in column 0: a row whose first pixel has all
    three channels below 12 and for which
    ``get_horizontal_noise_line_width`` reports at least 2.  From there the
    function walks to the right, whitening pixels of the current run and
    moving one row up or down according to the widths reported for the
    neighbouring rows, until no neighbouring run of width >= 2 is found or
    the column index passes the right edge.

    :param image: array indexed as ``image[row][column][channel]``.
    :return: ``None``.
    """
    first_height = 0
    has_find = False
    # NOTE(review): the loop has no break, so despite its name `first_height`
    # ends up holding the LAST matching row -- confirm which one is intended.
    for i in range(image.shape[0]):
        if image[i][0][0] < 12 and image[i][0][1] < 12 and image[i][0][2] < 12 \
            and get_horizontal_noise_line_width(image, i, 0) >= 2:
            first_height = i
            has_find = True

    # No dark run at the left edge: nothing to clear.
    if not has_find: return
    now_width = 0
    # NOTE(review): the walk starts two rows above the row found; for
    # first_height < 2 this index is negative -- confirm the intent.
    now_height = first_height-2
    while now_width < image.shape[1]:
        #print((now_height, now_width), image.shape)
        width = get_horizontal_noise_line_width(image, now_height, now_width)

        #clear the horizontal noise line
        # The range stops one pixel before the end of the run.
        for i in range(now_width, now_width+width-1):
            top_num = 0
            bottom_num = 0
            #the upper pixel
            if is_black(i, now_height-1, image): top_num += 1
            #the upper left pixel
            if is_black(i-1, now_height-1, image): top_num += 1
            #the upper right pixel
            if is_black(i+1, now_height-1, image): top_num += 1
            #the lower pixel
            if is_black(i, now_height+1, image): bottom_num += 1
            #the left lower pixel
            if is_black(i-1, now_height+1, image): bottom_num += 1
            # the right lower pixel
            if is_black(i+1, now_height+1, image): bottom_num += 1

            # Keep the pixel when is_black neighbours exist both above and
            # below it.
            # NOTE(review): the second comparison uses image.shape[0], which a
            # valid row index never equals; image.shape[0] - 1 (as used
            # further down) may have been meant -- confirm.
            if now_height != 0 and now_height != image.shape[0]:
                if top_num>0 and bottom_num>0: continue

            # Whiten the pixel.
            image[now_height][i][0] = 255
            image[now_height][i][1] = 255
            image[now_height][i][2] = 255

        # find the next noise pixel
        # a/b: runs in the row above/below, starting at the last column of the
        # current run; c/d: the same, starting one column further right.
        # NOTE(review): these calls are made before the border checks below,
        # so the rows now_height - 1 / now_height + 1 are accessed even at
        # the top and bottom rows -- confirm this cannot go out of range.
        a = get_horizontal_noise_line_width(image, now_height - 1, now_width + width -1)
        b = get_horizontal_noise_line_width(image, now_height + 1, now_width + width -1)
        c = get_horizontal_noise_line_width(image, now_height - 1, now_width + width)
        d = get_horizontal_noise_line_width(image, now_height + 1, now_width + width)
        # On the top row the candidates from the row above are discarded.
        if now_height == 0:
            a=0
            c=0

        # On the bottom row the candidates from the row below are discarded.
        if now_height == (image.shape[0] - 1):
            b=0
            d=0

        max_a_b = max(a, b)
        max_c_d = max(c, d)
        max_a_b_c_d = max(max_a_b, max_c_d)
        # Stop when no neighbouring run is at least two pixels wide.
        if max_a_b_c_d < 2: break
        if max_a_b == max_a_b_c_d:
            # Continue from the last column of the current run; go up when
            # `a` is the larger of a/b, otherwise down.
            now_width += width-1
            if max_a_b == a:
                now_height -= 1
            else:
                now_height += 1

        else:
            # Continue from the column after the current run; go up when `c`
            # is the larger of c/d, otherwise down.
            now_width += width
            if max_c_d == c:
                now_height -= 1
            else:
                now_height += 1