-
Notifications
You must be signed in to change notification settings - Fork 0
/
csvtoline.py
93 lines (92 loc) · 3.41 KB
/
csvtoline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np
import pandas as pd
from collections import defaultdict
import re
class LineReader:
    """Merge per-word boxes read from a table into text lines.

    The input table is accessed purely by column position: column 0 holds
    the text and columns 1-4 hold X1, Y1, X2, Y2 (presumably the left/top and
    right/bottom corners of each word's bounding box -- confirm with the
    producer of the CSV).
    """

    # Every character matched by this pattern is removed from the raw text.
    _DISALLOWED_CHARS = re.compile('[^a-zA-Z0-9-.?!&/":#\';()| ]')
    # Runs of spaces are collapsed to one space.
    _MULTI_SPACE = re.compile(' +')
    # A row is kept only if something remains after removing these characters.
    _NON_LINE_CONTENT = re.compile('[^a-zA-Z0-9&#]')
    # In 'word' mode a row additionally needs at least one letter or digit.
    _NON_WORD_CONTENT = re.compile('[^a-zA-Z0-9]')

    def __init__(self):
        # NOTE: defaultdict() without a factory behaves like a plain dict.
        self.line_dict = defaultdict()
        self.word_dict = defaultdict()

    @staticmethod
    def _store_segment(line_dict, x1, y1, x2, y2, words, first, stop):
        """Join words[first:stop] and store them under their bounding tuple."""
        joined = u' '.join(str(word) for word in words[first:stop])
        # An empty join (empty slice or blank text) is never recorded.
        if joined:
            key = (x1[first], y1[first], x2[stop - 1], y2[stop - 1])
            line_dict[key] = joined

    def find_lines(self, start, end, X1, Y1, X2, Y2, Text, line_dict, xbias=25):
        """Split the rows ``start:end`` into horizontally contiguous segments.

        The slice is ordered by X1 and walked left to right. A word joins the
        current segment when its X1 is at most ``xbias`` beyond the X2 of the
        previous word; otherwise the current segment is stored and a new one
        begins.

        Args:
            start, end: Half-open index range into the coordinate sequences.
            X1, Y1, X2, Y2, Text: Parallel sequences of coordinates and text.
            line_dict: Mapping that receives
                ``(x1_first, y1_first, x2_last, y2_last) -> "joined text"``.
            xbias: Largest horizontal gap that still joins two words.

        Returns:
            The same ``line_dict`` object, updated in place.
        """
        if start >= end:
            return line_dict

        # Order by numeric value (the comparisons below are numeric as well);
        # a stable sort keeps words that share an X1 in their incoming order.
        order = np.argsort(np.asarray(X1[start:end], dtype=float), kind="stable")
        x1_row = np.array(X1[start:end])[order]
        x2_row = np.array(X2[start:end])[order]
        y1_row = np.array(Y1[start:end])[order]
        y2_row = np.array(Y2[start:end])[order]
        words = np.array(Text[start:end])[order]
        if len(x1_row) == 0:
            # The requested range lies beyond the available data.
            return line_dict

        gap = float(xbias)
        seg_start = 0
        right_edge = float(x2_row[0])
        for idx in range(len(x1_row)):
            if float(x1_row[idx]) > right_edge + gap:
                # Gap too wide: close the segment that ends just before idx.
                self._store_segment(line_dict, x1_row, y1_row, x2_row, y2_row,
                                    words, seg_start, idx)
                seg_start = idx
            # The reference edge is always the X2 of the most recent word.
            right_edge = float(x2_row[idx])

        # Store the trailing segment.
        self._store_segment(line_dict, x1_row, y1_row, x2_row, y2_row,
                            words, seg_start, len(x1_row))
        return line_dict

    def csv_reader(self, df, type='line', ybias=7, xbias=25):
        """Clean the text column of ``df`` and group its rows.

        Args:
            df: Table indexed positionally via ``iloc``; column 0 is the text,
                columns 1, 2, 3, 4 are X1, Y1, X2, Y2.
            type: ``'word'`` returns one entry per row; any other value
                (default ``'line'``) merges rows into lines.
            ybias: Largest Y1 difference between consecutive rows (ordered by
                Y1) that keeps them in the same row group.
            xbias: Forwarded to :meth:`find_lines`.

        Returns:
            In ``'word'`` mode, ``self.word_dict`` mapping
            ``(x1, y1, x2, y2) -> text`` (entries accumulate across calls).
            Otherwise a tuple ``(line_dict, Text)`` where ``Text`` is the
            array of cleaned texts ordered by Y1.
        """
        print(len(df))

        cleaned = [
            self._MULTI_SPACE.sub(' ', self._DISALLOWED_CHARS.sub('', str(raw)))
            for raw in df.iloc[:, 0]
        ]
        # Single-pass filter replacing the former pop-in-a-loop removal:
        # drop rows whose text has no letter, digit, '&' or '#'.
        kept = [
            (text, x1, y1, x2, y2)
            for text, x1, y1, x2, y2 in zip(
                cleaned, df.iloc[:, 1], df.iloc[:, 2], df.iloc[:, 3], df.iloc[:, 4]
            )
            if self._NON_LINE_CONTENT.sub('', text)
        ]

        if type == 'word':
            for text, x1, y1, x2, y2 in kept:
                if self._NON_WORD_CONTENT.sub('', text):
                    self.word_dict[(x1, y1, x2, y2)] = text
            return self.word_dict

        line_dict = defaultdict(str)
        if not kept:
            # Nothing survived the filter; previously this raised IndexError.
            return line_dict, np.array([], dtype=str)

        Text = [row[0] for row in kept]
        X1 = [row[1] for row in kept]
        Y1 = [row[2] for row in kept]
        X2 = [row[3] for row in kept]
        Y2 = [row[4] for row in kept]

        # Numeric, stable ordering from top to bottom.
        order = np.argsort(np.asarray(Y1, dtype=float), kind="stable")
        X1 = np.array(X1)[order]
        Y1 = np.array(Y1)[order]
        X2 = np.array(X2)[order]
        Y2 = np.array(Y2)[order]
        Text = np.array(Text)[order]

        tolerance = float(ybias)
        start = 0
        end = 0
        curr = float(Y1[0])
        for i in range(len(X1)):
            y_top = float(Y1[i])
            if y_top <= curr + tolerance:
                end = i + 1
            else:
                # NOTE(review): this guard inspects the word that OPENS the
                # new row group, not the group being flushed; when it fails
                # the finished group is skipped. Kept as found -- confirm
                # whether that is intended.
                if float(X2[i]) > float(X1[i]):
                    line_dict = self.find_lines(start, end, X1, Y1, X2, Y2,
                                                Text, line_dict, xbias)
                start = i
                end = i + 1
            curr = y_top

        # Flush the last row group; without this the final line of the input
        # (or the only line, for single-row input) was silently dropped.
        line_dict = self.find_lines(start, end, X1, Y1, X2, Y2,
                                    Text, line_dict, xbias)
        return line_dict, Text