# Named Entity Recognition

## 1. 规则

### example: 日期识别

1. 从句子中提取表示日期的词语
    1. 利用jieba进行分词
    2. 如果这个词语是数量词或者表示日期的词语,就添加(和前一个词语组合或者作为单独的词语等等)
    3. (额外)如果发现有'今天'/'明天'/'后天'等词语就直接转换成真实的日期

2. 判断提取出来的表示日期的词语是否确实表示日期,如果不是的话,就转换成正确的格式或者就丢弃

3. 解析提取出来的词语,转换成合适的格式
    1. 先采用dateutil.parser.parse来解析,如果成功就皆大欢喜
    2. parse解析失败,使用正则表达式从词语中提取year,month,day,hour,minute,second信息
    3. 将提取出来的信息进一步处理,包括中文数字和'下午'等词语
    4. 转换格式

In [41]:
import re
import jieba.posseg as psg
from datetime import datetime, timedelta
from dateutil.parser import parse


# Single digit characters mapped to their integer value: Chinese numerals
# first ('两' is an alternative form of 2), then the ASCII digits '0'-'9'.
UTIL_CN_NUM = {
    '零': 0,
    '一': 1,
    '二': 2,
    '两': 2,
    '三': 3,
    '四': 4,
    '五': 5,
    '六': 6,
    '七': 7,
    '八': 8,
    '九': 9,
}
UTIL_CN_NUM.update((str(digit), digit) for digit in range(10))

# Chinese magnitude characters mapped to the multiplier they stand for.
UTIL_CN_UNIT = dict(zip('十百千万', (10, 100, 1000, 10000)))

def cn2dig(src):
    """Convert a short numeral string (ASCII or Chinese) to an int.

    If ``src`` starts with ASCII digits, that leading run is returned as an
    int and the rest of the string is ignored.  Otherwise the characters are
    read right-to-left: a unit character (十/百/千/万) sets the current
    multiplier, and a digit character adds ``digit * multiplier`` to the total.
    A unit with no digit in front of it counts as one of that unit, so
    '十五' is 15 and '十' is 10.

    Note that each unit character replaces the current multiplier rather than
    multiplying it, so compound magnitudes such as '三十万' are not combined.

    Args:
        src: The numeral text, e.g. '12', '三', '二十五'.

    Returns:
        The integer value, or ``None`` if ``src`` is empty/``None`` or
        contains a character that is neither a known digit nor a known unit.
    """
    if not src:
        return None
    m = re.match(r"\d+", src)
    if m:
        return int(m.group(0))
    rsl = 0
    unit = 1
    # True while the most recently seen unit has no digit to its left yet.
    # Tracking this explicitly (instead of comparing the total with the unit)
    # keeps a lone '零' at 0 rather than turning it into 1.
    bare_unit = False
    for item in reversed(src):
        if item in UTIL_CN_UNIT:
            unit = UTIL_CN_UNIT[item]
            bare_unit = True
        elif item in UTIL_CN_NUM:
            rsl += UTIL_CN_NUM[item] * unit
            bare_unit = False
        else:
            return None
    if bare_unit:
        # Leading unit without a digit: implicit "one" ('十五' -> 10 + 5).
        rsl += unit
    return rsl

def year2dig(year):
    """Convert a year written with ASCII and/or Chinese digits to an int.

    Every character found in ``UTIL_CN_NUM`` is replaced by its digit; other
    characters are kept as they are.  The leading run of digits in the result
    is then taken as the year.  A two-digit year is placed in the current
    century (e.g. '19' becomes 2019 while the current year is 20xx).

    Args:
        year: Year text without the trailing '年', e.g. '2019', '二零一九', '19'.

    Returns:
        The year as an int, or ``None`` if the converted text does not start
        with a digit.
    """
    res = ''.join(
        str(UTIL_CN_NUM[item]) if item in UTIL_CN_NUM else item
        for item in year
    )
    m = re.match(r"\d+", res)
    if m is None:
        return None
    digits = m.group(0)
    if len(digits) == 2:
        # `datetime` here is the class imported via
        # `from datetime import datetime`, so call today() on it directly.
        century = datetime.today().year // 100 * 100
        return century + int(digits)
    return int(digits)


def check_time_valid(word):
    """Filter and normalise a candidate date phrase.

    A phrase made only of digits is rejected.  If the phrase ends with
    '号' or '日' followed by stray digits (e.g. '12号30'), that tail is
    replaced by a single '日'.  Any other phrase is returned unchanged.

    Args:
        word: The candidate phrase.

    Returns:
        ``None`` for a digits-only phrase, otherwise the (possibly
        normalised) phrase.
    """
    if re.match(r'\d+$', word):
        return None
    # After the substitution the phrase ends in '日', so it can neither match
    # this pattern again nor be digits-only: one pass is enough.
    return re.sub(r'[号日]\d+$', '日', word)


def parse_time(word):
    """Convert a date/time phrase into a ``'%Y-%m-%d %H:%M:%S'`` string.

    ``dateutil``'s ``parse`` (fuzzy mode) is tried first.  If it raises, the
    phrase is matched against a Chinese date pattern (年 / 月 / 日 or 号, an
    optional time-of-day word such as 下午, then 点 / 分 / 秒) and the fields
    that were found are overlaid on today's date.  Hour, minute and second
    default to 0 when absent.

    Args:
        word: Candidate phrase; may be ``None`` or empty.

    Returns:
        The formatted timestamp, or ``None`` when ``word`` is empty/``None``,
        when nothing at the start of ``word`` matches the pattern, or when
        the extracted fields do not form a valid date (for example day 31 in
        a 30-day month).
    """
    # Truthiness covers both None and '' without calling len() on None.
    if not word:
        return None

    fmt = '%Y-%m-%d %H:%M:%S'
    try:
        return parse(word, fuzzy=True).strftime(fmt)
    except Exception:
        # Deliberate best-effort: any failure falls through to the
        # Chinese-pattern parser below.
        pass

    re_zh_str = r"([0-9零一二两三四五六七八九十]+年)?([0-9一二两三四五六七八九十]+月)?([0-9一二两三四五六七八九十]+[号日])?([上中下午晚早]+)?([0-9零一二两三四五六七八九十百]+[点:\.时])?([0-9零一二三四五六七八九十百]+分?)?([0-9零一二三四五六七八九十百]+秒)?"
    m = re.match(re_zh_str, word)
    # Every group is optional, so the pattern also "matches" the empty
    # string; an empty match means no date information was found.
    if m is None or not m.group(0):
        return None

    fields = {
        'year': m.group(1),
        'month': m.group(2),
        'day': m.group(3),
        'hour': m.group(5),
        'minute': m.group(6),
        'second': m.group(7),
    }
    # Time of day defaults to midnight unless the phrase says otherwise.
    params = {'hour': 0, 'minute': 0, 'second': 0}
    unit_suffixes = '年月号日点:.时分秒'
    for unit, raw in fields.items():
        if not raw:
            continue
        # Drop the trailing unit marker only when one is present: the
        # minute's '分' is optional, so the last character may be a digit.
        digits = raw[:-1] if raw[-1] in unit_suffixes else raw
        if unit == 'year':
            value = year2dig(digits)
        else:
            value = cn2dig(digits)
        if value is not None:
            params[unit] = int(value)

    # Handle afternoon/evening/noon words by shifting to the 24-hour clock.
    if m.group(4) in ['下午', '晚上', '中午'] and params['hour'] < 12:
        params['hour'] += 12

    try:
        return datetime.today().replace(**params).strftime(fmt)
    except ValueError:
        # Out-of-range field (e.g. month 13, hour 25, day 31 in a short
        # month): treat the phrase as not being a usable date.
        return None

    
    

def extract_time(text):
    """Extract the date/time expressions mentioned in a sentence.

    The sentence is segmented with ``psg.cut``.  Consecutive tokens tagged
    'm' or 't' are joined into one candidate phrase.  The words
    '今天' / '明天' / '后天' are replaced by the concrete date they denote
    (today plus 0, 1 or 2 days); such a word closes any phrase collected so
    far and starts a new one.  Each candidate is then passed through
    ``check_time_valid`` and ``parse_time``; candidates rejected by either
    step are dropped.

    Args:
        text: The sentence to scan.

    Returns:
        A list of the strings returned by ``parse_time``, in order of
        appearance in ``text``.
    """
    special_date = {'今天': 0, '明天': 1, '后天': 2}
    candidates = []
    word = ''
    for part_word, tag in psg.cut(text):
        if part_word in special_date:
            # A relative-day word starts a new phrase; flush the pending one.
            if word:
                candidates.append(word)
            target = datetime.today() + timedelta(days=special_date[part_word])
            word = target.strftime('%Y年%m月%d日')
        elif tag not in ['m', 't']:
            # Any other kind of token ends the current phrase.
            if word:
                candidates.append(word)
                word = ''
        else:
            word += part_word
    if word:
        candidates.append(word)

    results = []
    for candidate in candidates:
        # Ensure the phrase is a valid time, normalising it where possible.
        checked = check_time_valid(candidate)
        if checked is None:
            continue
        # Parse the phrase into the output format; each step runs only once
        # per candidate.
        parsed = parse_time(checked)
        if parsed is not None:
            results.append(parsed)
    return results
        

In [42]:
# Demo: run the extractor on a few sample booking sentences and print each
# sentence next to the timestamps extracted from it.
text1 = '我要住到明天下午三点'
text2 = '预定28号的房间'
text3 = '我要从26号下午4点住到11月2号'
text4 = '我要预订今天到30的房间'
text5 = '今天30号呵呵'

for _sample in (text1, text2, text3, text4, text5):
    print(_sample, extract_time(_sample), sep=':')

我要住到明天下午三点:['2019-04-13 00:00:00']
预定28号的房间:['2019-04-28 00:00:00']
我要从26号下午4点住到11月2号:['2019-04-26 16:00:00', '2019-11-02 00:00:00']
我要预订今天到30的房间:['2019-04-12 00:00:00']
今天30号呵呵:['2030-04-12 00:00:00']
