forked from linglanfeng/CCF2019-OCR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
合并全体单项文件.py
72 lines (60 loc) · 3.24 KB
/
合并全体单项文件.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Merge the best single-column (per-field) submissions into one full submission file.
import pandas as pd
# Template submission file; it is read first and supplies the image-name rows.
src_path = 'single_submit/submit_example.csv'

# ---------------------------------------------------------------------------
# Earlier selections, kept commented out for reference.  The "x% -> y" notes
# are carried over verbatim from the original file; presumably they record
# local accuracy -> leaderboard score (TODO confirm).
# ---------------------------------------------------------------------------
# output_path = 'all_submit/1015_best_submit_all.csv'  # 99.511% -> 0.995407
# best_result_file = {
#     '姓名': '1013_name_only_400.csv',
#     '民族': '0905_nation_only_mix_ext1w.csv',
#     '性别': '0907_sex_only_mix.csv',
#     '年': '0908_year_only_mix_ext1w.csv',
#     '月': '0906_month_only_mix.csv',
#     '日': '0907_day_only_mix_ext1w.csv',
#     '住址': '1006_address_only_180_1014test.csv',
#     '公民身份号码': '1015_id_only_50.csv',
#     '签发机关': '1008_office_only_33_l2_correct.csv',
#     '有效期限': '1012_valid_only_200.csv',
# }
#
# output_path = 'all_submit/1020_best_submit_all.csv'  # 99.651% -> 0.99660
# best_result_file = {
#     '姓名': '1013_name_only_400_t1017.csv',
#     '民族': '1018_v5_nation_only_46.csv',
#     '性别': '0907_sex_only_mix.csv',
#     '年': '1019_year_only_31.csv',
#     '月': '1019_month_only_93.csv',
#     '日': '1019_day_only_47.csv',
#     '住址': '1006_address_only_180_1014test.csv',
#     '公民身份号码': '1015_id_only_50.csv',
#     '签发机关': '1008_office_only_33_l2_correct.csv',
#     '有效期限': '1012_valid_only_200.csv',
# }

# Current selection: destination of the merged submission.
output_path = 'all_submit/1021_best_submit_all.csv'  # 99.687% -> 0.996958

# Field (column) name -> file name of the single-field result chosen for it.
best_result_file = {
    '姓名': '1013_name_only_400_t1017.csv',            # name
    '民族': '1018_v5_nation_only_46.csv',              # ethnicity
    '性别': '0907_sex_only_mix.csv',                   # sex
    '年': '1019_year_only_31.csv',                     # year
    '月': '1019_month_only_93.csv',                    # month
    '日': '1019_day_only_47.csv',                      # day
    '住址': '1006_address_only_180_1014test.csv',      # address
    '公民身份号码': '1015_id_only_50.csv',              # citizen ID number
    '签发机关': '1008_office_only_33_l2_correct.csv',  # issuing authority
    '有效期限': '1020_v4_valid_only_29_t1017.csv',     # validity period
}
# Column layout shared by the template and every single-field result file:
# image name first, then the ten recognised fields.
columns = ['图片名字', '姓名', '民族', '性别', '年', '月', '日', '住址', '公民身份号码', '签发机关', '有效期限']


def merge_single_field_results(template_path, field_files, merged_path,
                               result_dir='single_submit'):
    """Combine one column from each single-field result file into one CSV.

    The template CSV supplies the rows (one per image name).  For every field
    in ``columns[1:]`` the matching file from ``field_files`` is read and only
    that field's column is copied over, aligned on the image name.  Cells that
    are missing (either in the field file or because an image is absent from
    it) are written as the string ``'0'``.

    Args:
        template_path: Header-less CSV with the 11 columns of ``columns``.
        field_files: Mapping of field name -> file name inside ``result_dir``.
        merged_path: Where the merged, header-less CSV is written
            (encoding ``utf_8_sig``).
        result_dir: Directory that holds the single-field result files.

    Returns:
        The merged ``DataFrame`` that was written to ``merged_path``.

    Raises:
        KeyError: If ``field_files`` has no entry for one of the fields.
    """
    image_col = columns[0]

    df_src = pd.read_csv(template_path, header=None)
    df_src.columns = columns
    # Reset every field column by label.  The original `.loc[:, 1:]` was a
    # positional slice through `.loc`, which pandas 2.x rejects with TypeError
    # on string column labels.
    df_src[columns[1:]] = 0
    df_src.set_index([image_col], inplace=True)

    for col in columns[1:]:
        print(col)
        result_path = '{0}/{1}'.format(result_dir, field_files[col])
        print(result_path)
        # Date parts and the ID number are read as text so that leading zeros
        # (e.g. month "07") survive the round trip.
        df = pd.read_csv(
            result_path,
            header=None,
            names=columns,
            dtype={'年': 'str', '月': 'str', '日': 'str', '公民身份号码': 'str'},
        )
        # Rows without an image name cannot be aligned; drop them.
        df = df.loc[~df[image_col].isna()].copy()
        df.set_index([image_col], inplace=True)
        # Only the current field is needed from this file; assignment aligns
        # on the image-name index.
        df_src[col] = df[col].fillna('0').astype('str')

    # Images that were absent from a field file are NaN after alignment.
    df_src.fillna('0', inplace=True)
    df_src.reset_index(inplace=True)
    df_src.loc[df_src[image_col] == '0', image_col] = ''
    df_src.to_csv(merged_path, encoding='utf_8_sig', index=False, header=False)
    return df_src


if __name__ == '__main__':
    merge_single_field_results(src_path, best_result_file, output_path)
    print(22)