forked from JavaStudenttwo/ccks_kg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
regulation.py
73 lines (61 loc) · 3.55 KB
/
regulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re
from collections import defaultdict
from utils.preprocessing import *
DATA_DIR = './data' # input data directory
# ## Rule-based entity extraction
#
# - 机构 (organizations)
# - 研报 (research reports)
# - 文章 (articles)
# - 风险 (risks)
# In[ ]:
# Patterns are raw strings compiled once: the previous non-raw literals relied
# on invalid escape sequences (``\d``, ``\s``, ``\(``), which newer Python
# versions warn about.  Full-width punctuation is written as escapes so that it
# survives text normalisation: \uff08 \uff09 = full-width parentheses,
# \uff1a = full-width colon, \uff0c = full-width comma, \uff1b = full-width
# semicolon.  Both half- and full-width forms are accepted everywhere.

# A line that, after an optional list marker, ends in "有限公司".
_COMPANY_TITLE_RE = re.compile(
    r'^[(\uff08]*[\d一二三四五六七八九十①②③④⑤]*[)\uff09.\s]*(.*有限公司)$'
)
# A calendar date such as "2020年5月1日" or "2020-05-01".
_DATE_RE = re.compile(r'\d{4}\s*[年-]\s*\d{1,2}\s*[月-]\s*\d{1,2}\s*日?')
# A numbered heading such as "1. ..." or "一、...".
_NUMBERED_HEADING_RE = re.compile(r'^[\d一二三四五六七八九十]+\s*[\.、]\s*.*$')
# A parenthesised code such as "(600000.SH)".
_PAREN_CODE_RE = re.compile(r'[(\uff08]\d+\.*[A-Z]*[)\uff09]')
# Text enclosed in Chinese title marks.
_ARTICLE_RE = re.compile(r'《(.*?)》')
# Separators between individual risk phrases.
_RISK_SEPARATOR_RE = re.compile(r'以及|、|,|\uff0c|;|\uff1b|。')
# Leading "■1"-style bullet in front of a risk phrase.
_RISK_BULLET_RE = re.compile(r'^[■]+[\d一二三四五六七八九十①②③④⑤]+')
# Leading "(1)", "1.", "① "-style list marker in front of a risk phrase.
_RISK_ORDINAL_RE = re.compile(
    r'^[(\uff08]*[\d一二三四五六七八九十①②③④⑤]+[)\uff09.\s]+'
)
# A sentence that consists only of the (optionally numbered) "风险提示" heading.
_RISK_HEADER_RE = re.compile(
    r'^\s*[\d一二三四五六七八九十]*\s*[\.、]*\s*风险提示[:\uff1a]*$'
)
# The same heading followed by at least five characters of text.
_RISK_INLINE_RE = re.compile(
    r'^\s*[\d一二三四五六七八九十]*\s*[\.、]*\s*风险提示[:\uff1a]*(.{5,})$'
)
_COLON_RE = re.compile(r'[:\uff1a]')


def _clean_risk_entities(text):
    """Split *text* into risk phrases, strip list markers, drop short ones."""
    cleaned = []
    for phrase in _RISK_SEPARATOR_RE.split(text):
        phrase = _RISK_BULLET_RE.sub('', phrase)
        phrase = _RISK_ORDINAL_RE.sub('', phrase)
        if len(phrase) >= 4:
            cleaned.append(phrase)
    return cleaned


def aug_entities_by_rules(yanbao_dir):
    """Collect candidate entities from research-report text files using rules.

    Every ``*.txt`` file directly under *yanbao_dir* is read line by line
    (UTF-8) and four kinds of candidates are gathered:

    * ``'机构'``: a line that, after an optional list marker, ends in
      "有限公司".
    * ``'研报'``: at most one per file; the first line with index <= 5 and
      raw length > 10 whose stripped text is 6-99 characters long and contains
      no date, no numbered heading and no parenthesised code.
    * ``'文章'``: every text found between 《 and 》 in a sentence.
    * ``'风险'``: phrases following a "风险提示" heading, either in the same
      sentence or in the sentence right after a bare heading.

    Sentences come from ``split_to_sents``, which is expected to be provided
    by the module's wildcard import from ``utils.preprocessing``.

    Args:
        yanbao_dir: Directory object offering ``glob`` (e.g. ``pathlib.Path``).

    Returns:
        A ``defaultdict(list)`` mapping each entity type that was touched to a
        de-duplicated list of strings; the order inside each list is
        unspecified.
    """
    entities_by_rule = defaultdict(list)
    for file in yanbao_dir.glob('*.txt'):
        with open(file, encoding='utf-8') as f:
            found_yanbao = False
            found_fengxian = False  # True right after a bare risk heading
            for lidx, line in enumerate(f):
                # Company headings.
                company = _COMPANY_TITLE_RE.search(line)
                if company:
                    entities_by_rule['机构'].append(company.group(1))

                # Report title: only looked for near the top of the file.
                if not found_yanbao and lidx <= 5 and len(line) > 10:
                    may_be_yanbao = line.strip()
                    if (5 < len(may_be_yanbao) < 100
                            and not _DATE_RE.search(may_be_yanbao)
                            and not _NUMBERED_HEADING_RE.search(may_be_yanbao)
                            and not _PAREN_CODE_RE.search(may_be_yanbao)):
                        entities_by_rule['研报'].append(may_be_yanbao)
                        found_yanbao = True

                # Split once and reuse the sentences for both rules below.
                sents = list(split_to_sents(line))

                # Article titles.
                for sent in sents:
                    for title in _ARTICLE_RE.findall(sent):
                        entities_by_rule['文章'].append(title)

                # Risk phrases.
                for sent in sents:
                    if found_fengxian:
                        # The previous sentence was a bare heading, so this one
                        # lists the risks; keep only the part before a colon.
                        # NOTE: the truncated sentence is also what the two
                        # checks below see, as in the original code.
                        sent = _COLON_RE.split(sent, maxsplit=1)[0]
                        entities_by_rule['风险'] += _clean_risk_entities(sent)
                        found_fengxian = False
                    # The flag is always False here, so no extra guard needed.
                    if _RISK_HEADER_RE.search(sent):
                        found_fengxian = True
                    inline = _RISK_INLINE_RE.search(sent)
                    if inline:
                        entities_by_rule['风险'] += _clean_risk_entities(
                            inline.group(1))

    # Remove duplicates; only existing keys are reassigned, so mutating the
    # mapping while iterating over it is safe.
    for ent_type, ents in entities_by_rule.items():
        entities_by_rule[ent_type] = list(set(ents))
    return entities_by_rule