-
Notifications
You must be signed in to change notification settings - Fork 56
/
svtr_tiny.yaml
187 lines (175 loc) · 4.2 KB
/
svtr_tiny.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Run-wide settings: execution mode, AMP levels, seed, logging cadence and
# checkpoint retention.
system:
  mode: 0  # 0 for graph mode, 1 for pynative mode in MindSpore
  distribute: True
  amp_level: O2
  amp_level_infer: O2  # running inference in O2 mode
  seed: 42
  log_interval: 100
  val_while_train: True
  drop_overflow_update: True
  # NOTE(review): presumably only the 5 most recent checkpoints are kept;
  # confirm against the trainer's handling of `latest_k`.
  ckpt_save_policy: latest_k
  ckpt_max_keep: 5
# Shared values. Each one is anchored (&name) here and reused elsewhere in
# the file through the matching alias (*name).
common:
  # Written as an explicit null rather than a bare empty value, so the
  # anchored scalar is unambiguous; every *character_dict_path alias
  # resolves to null.
  # NOTE(review): presumably null makes the consumer fall back to its
  # built-in dictionary; confirm.
  character_dict_path: &character_dict_path null
  num_classes: &num_classes 37  # num_chars_in_dict + 1
  max_text_len: &max_text_len 24
  use_space_char: &use_space_char False
  batch_size: &batch_size 512
# Network definition: transform -> backbone -> neck -> head.
model:
  type: rec

  transform:
    name: STN_ON
    in_channels: 3
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 100]  # same size as backbone.img_size below
    num_control_points: 20
    tps_margins: [0.05, 0.05]
    # Quoted to make explicit that this is the string "none", not a YAML null.
    stn_activation: "none"

  backbone:
    name: SVTRNet
    pretrained: False
    img_size: [32, 100]
    out_channels: 192
    patch_merging: Conv
    embed_dim: [64, 128, 256]
    depth: [3, 6, 3]
    num_heads: [2, 4, 8]
    # 12 entries, equal to sum(depth) = 3 + 6 + 3.
    # NOTE(review): presumably one mixer type per block; confirm.
    mixer:
      - "Local"
      - "Local"
      - "Local"
      - "Local"
      - "Local"
      - "Local"
      - "Global"
      - "Global"
      - "Global"
      - "Global"
      - "Global"
      - "Global"
    local_mixer: [[7, 11], [7, 11], [7, 11]]
    last_stage: True
    prenorm: False

  neck:
    name: Img2Seq

  head:
    name: CTCHead
    out_channels: *num_classes  # alias of common.num_classes (37)
# Decoding of network output; both aliases resolve to anchors under `common`.
postprocess:
  name: RecCTCLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
# Evaluation metric; `acc` is the main indicator.
metric:
  name: RecMetric
  main_indicator: acc
  character_dict_path: *character_dict_path  # anchor defined under `common`
  ignore_space: True
  print_flag: False
# CTC loss settings.
loss:
  name: CTCLoss
  pred_seq_len: 25  # 100 / 4
  # This value should be smaller than pred_seq_len (24 < 25 here).
  max_label_len: *max_text_len
  batch_size: *batch_size
# Learning-rate schedule.
scheduler:
  scheduler: warmup_cosine_decay
  min_lr: 0.00001
  lr: 0.001
  # warmup_epochs + decay_epochs = num_epochs (3 + 27 = 30).
  num_epochs: 30
  warmup_epochs: 3
  decay_epochs: 27
# Optimizer settings.
optimizer:
  opt: adamw
  # NOTE(review): `svtr` presumably names a parameter-grouping rule defined
  # by the consumer; confirm how it interacts with filter_bias_and_bn.
  grouping_strategy: svtr
  filter_bias_and_bn: False
  weight_decay: 0.05
# Loss-scaling settings (dynamic type).
loss_scaler:
  type: dynamic
  loss_scale: 512
  scale_factor: 2.0
  scale_window: 1000
# Training run: checkpoint directory, EMA, dataset with its transform
# pipeline, and the loader.
train:
  ckpt_save_dir: ./tmp_rec
  dataset_sink_mode: False
  ema: True
  ema_decay: 0.9999

  dataset:
    type: LMDBDataset
    # Looks like a placeholder path; point it at the real dataset root.
    dataset_root: path/to/data_lmdb_release/
    data_dir: training/
    label_file: null
    sample_ratio: 1.0
    shuffle: True
    filter_max_len: True
    filter_zero_text_image: True
    extra_count_if_repeat: True
    max_text_len: *max_text_len
    character_dict_path: *character_dict_path
    # NOTE(review): looks like a misspelling of "standardize". The spelling
    # is kept because the consumer reads this key by name; confirm against
    # the dataset class before renaming.
    label_standandize: True
    transform_pipeline:
      - DecodeImage:
          img_mode: BGR
          to_float32: False
      - SVTRRecAug:
          aug_type: 0
      - RecCTCLabelEncode:
          max_text_len: *max_text_len
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          lower: True
      - SVTRRecResizeImg:
          image_shape: [64, 256]
          padding: False
      - NormalizeImage:
          bgr_to_rgb: True
          is_hwc: True
          mean: [127.0, 127.0, 127.0]
          std: [127.0, 127.0, 127.0]
      # This step takes no parameters. The null is explicit so the step
      # cannot be misread as owning the dataset-level keys that follow.
      - ToCHWImage: null
    # Siblings of transform_pipeline, not arguments of the last transform.
    output_columns: ["image", "text_seq"]
    net_input_column_index: [0]
    label_column_index: [1]

  loader:
    shuffle: True
    batch_size: *batch_size  # alias of common.batch_size (512)
    drop_remainder: True
    max_rowsize: 12
    num_workers: 4
# Evaluation run: checkpoint to load, dataset with its transform pipeline,
# and the loader.
eval:
  ckpt_load_path: ./tmp_rec/best.ckpt
  dataset_sink_mode: False

  dataset:
    type: LMDBDataset
    # Looks like a placeholder path; point it at the real dataset root.
    dataset_root: path/to/data_lmdb_release/
    data_dir: validation/
    label_file: null
    sample_ratio: 1.0
    shuffle: False
    transform_pipeline:
      - DecodeImage:
          img_mode: BGR
          to_float32: False
      - RecCTCLabelEncode:
          max_text_len: *max_text_len
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          lower: True
      - SVTRRecResizeImg:
          image_shape: [64, 256]
          padding: False
      - NormalizeImage:
          bgr_to_rgb: True
          is_hwc: True
          mean: [127.0, 127.0, 127.0]
          std: [127.0, 127.0, 127.0]
      # This step takes no parameters. The null is explicit so the step
      # cannot be misread as owning the dataset-level keys that follow.
      - ToCHWImage: null
    # Siblings of transform_pipeline, not arguments of the last transform.
    output_columns: ["image", "text_padded", "text_length"]
    net_input_column_index: [0]
    label_column_index: [1, 2]

  loader:
    shuffle: False
    # Literal value, not the *batch_size alias; it currently equals
    # common.batch_size (512) but does not follow changes made there.
    batch_size: 512
    drop_remainder: False
    max_rowsize: 12
    num_workers: 1