In [1]:
import re
import os
import json
import math
import shutil
from pprint import pprint
import numpy as np
import pandas as pd
from glob import glob
import random as rd
import spacy 
import pyodbc

import jieba
import jieba.analyse

def read_json(filepath):
    """Load a JSON document from disk.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform default encoding, which is not UTF-8 on every system and
    would garble or reject non-ASCII (e.g. Chinese) text.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized Python object (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as fi:
        return json.load(fi)

In [31]:
class BaseDAL(object):
    """Minimal data-access helper holding one pyodbc connection and cursor.

    The instance can be used directly (``db.execute(...)``) or as a context
    manager (``with BaseDAL(...) as db:``), in which case the cursor and the
    connection are closed when the block exits.
    """

    def __init__(self, host, user, passwd, name):
        """Connect to SQL Server through 'ODBC Driver 13 for SQL Server'.

        Args:
            host: Server address (SERVER= part of the connection string).
            user: Login name (UID=).
            passwd: Login password (PWD=).
            name: Database name (DATABASE=).
        """
        driver = '{ODBC Driver 13 for SQL Server}'
        self.conn = pyodbc.connect('DRIVER={};SERVER={};DATABASE={};UID={};PWD={}'.format(
                               driver,
                               host,
                               name,
                               user,
                               passwd))
        self.cursor = self.conn.cursor()

    def __enter__(self):
        # Without __enter__ the class could not be used in a `with` statement
        # even though it defined __exit__.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the cursor and the connection; exceptions are not suppressed."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even when closing the cursor fails.
            self.conn.close()

    def execute(self, sql, params=()):
        """Run a statement and return all fetched rows.

        Args:
            sql: SQL text, optionally containing ``?`` parameter markers.
            params: Sequence of values bound to the ``?`` markers (default: none).

        Returns:
            The result of ``cursor.fetchall()``.
        """
        self.cursor.execute(sql, *params)
        return self.cursor.fetchall()

    def execute_commit(self, sql, params=()):
        """Run a statement and commit the transaction.

        Args:
            sql: SQL text, optionally containing ``?`` parameter markers.
            params: Sequence of values bound to the ``?`` markers (default: none).
        """
        self.cursor.execute(sql, *params)
        self.conn.commit()
        
# Shared database handle used by the cells below.
# Every connection setting can be overridden through an environment variable
# (ZX_DB_HOST / ZX_DB_USER / ZX_DB_PASSWD / ZX_DB_NAME); the literals are only
# fallbacks that keep the notebook running unchanged.
# SECURITY: the password fallback is a credential committed in source. Rotate
# it and drop the default once ZX_DB_PASSWD is provisioned.
db = BaseDAL(
    host=os.environ.get('ZX_DB_HOST', '10.173.1.150'),
    user=os.environ.get('ZX_DB_USER', 'sa'),
    passwd=os.environ.get('ZX_DB_PASSWD', 'wandagcLi4$'),
    name=os.environ.get('ZX_DB_NAME', 'zx-test'),
)

### 1. 规则匹配

In [2]:
class RegexMatch(object):
    """Rule-based sentiment matcher driven by a tab-separated rule file.

    Every non-blank line of the rule file is expected to hold three
    tab-separated fields: a label starting with ``neg``, ``pos`` or ``norm``,
    a kind (``regex`` or ``exact``) and the pattern text. ``regex`` patterns
    are compiled; ``exact`` patterns are stored as plain strings.

    Attributes:
        neg_rules / pos_rules / norm_rules: compiled ``regex`` patterns.
        neg_exact / pos_exact / norm_exact: ``exact`` pattern strings.
    """

    _LABELS = ('neg', 'pos', 'norm')

    def __init__(self, regex_path):
        """Load the rules from ``regex_path``.

        Lines that cannot be parsed (unknown label, fewer than three fields,
        unknown kind) are reported with a warning and skipped instead of
        aborting the load.

        Raises:
            ValueError: If ``regex_path`` does not exist.
            re.error: If a ``regex`` pattern is not a valid regular expression.
        """
        if not os.path.exists(regex_path):
            raise ValueError('Must provide regex file path.')
        self.neg_rules, self.neg_exact = [], []    # negative
        self.pos_rules, self.pos_exact = [], []    # positive
        self.norm_rules, self.norm_exact = [], []  # neutral
        buckets = {
            'neg': {'regex': self.neg_rules, 'exact': self.neg_exact},
            'pos': {'regex': self.pos_rules, 'exact': self.pos_exact},
            'norm': {'regex': self.norm_rules, 'exact': self.norm_exact},
        }
        # The rule file contains non-ASCII patterns, so decode it explicitly.
        with open(regex_path, 'r', encoding='utf-8') as fi:
            lines = [line.strip() for line in fi if line.strip()]
        for line in lines:
            label = next((name for name in self._LABELS if line.startswith(name)), None)
            fields = line.split('\t')
            # Stripping the line can remove a trailing empty pattern field, so
            # the field count has to be checked before indexing.
            if label is None or len(fields) < 3:
                print('Warning: rule format is not accepted -->', line[:30])
                continue
            kind, pattern = fields[1].strip(), fields[2].strip()
            if kind == 'regex':
                buckets[label]['regex'].append(re.compile(pattern))
            elif kind == 'exact':
                buckets[label]['exact'].append(pattern)
            else:
                print('Warning: rule format is not accepted -->', line[:30])

    @staticmethod
    def _match_all(rules, sentence):
        """Return (rules that matched, first matched text per rule) for ``sentence``."""
        matched, texts = [], []
        for rule in rules:
            hit = re.search(rule, sentence)
            if hit:
                matched.append(rule)
                texts.append(hit.group())
        return matched, texts

    def process(self, sentence):
        """Apply the negative and positive regex rules to one sentence.

        Returns:
            dict with keys ``neg_rules``/``pos_rules`` (the rules that matched)
            and ``neg_texts``/``pos_texts`` (the text each of them matched).
            The ``exact`` and ``norm`` rules are loaded but not consulted here.
        """
        neg_matched, neg_texts = self._match_all(self.neg_rules, sentence)
        pos_matched, pos_texts = self._match_all(self.pos_rules, sentence)
        return {
            'neg_rules': neg_matched,
            'neg_texts': neg_texts,
            'pos_rules': pos_matched,
            'pos_texts': pos_texts,
        }

    def run(self, sentences):
        """Return ``process(sentence)`` for every sentence, in input order."""
        return [self.process(sent) for sent in sentences]

In [42]:
# Smoke test for RegexMatch: four sample news headlines are run through the
# rules loaded from rm_path; the cell displays the per-headline result dicts.
data = ['房企业绩观察 | 土储成发展短板 冠城大通营收、净利双下滑遭问询', 
        '上半年净利润预减超三成 重药控股启动并购准备抢“地盘”',
        '跨界转型受阻 浙江广厦营收净利润双下滑遭问询', 
        '浙江广厦(600052.SH)：楼忠福及楼明合计3.5%股份已被冻结']

# NOTE(review): 'sentment' looks like a typo for 'sentiment' — confirm against
# the actual file name on disk before changing it.
rm_path = '../regex/sentment_rule.txt'
# `rm` is reused by update_db further down, so this cell must run first.
rm = RegexMatch(rm_path)
rm.run(data)

[{'neg_rules': [re.compile(r'(业绩|利润|盈利|营收|净利).{0,4}(大跌|下滑|降|减少|双降|预减|亏损)',
   re.UNICODE)],
  'neg_texts': ['营收、净利双下滑'],
  'pos_rules': [],
  'pos_texts': []},
 {'neg_rules': [re.compile(r'(业绩|利润|盈利|营收|净利).{0,4}(大跌|下滑|降|减少|双降|预减|亏损)',
   re.UNICODE)],
  'neg_texts': ['净利润预减'],
  'pos_rules': [],
  'pos_texts': []},
 {'neg_rules': [re.compile(r'(业绩|利润|盈利|营收|净利).{0,4}(大跌|下滑|降|减少|双降|预减|亏损)',
   re.UNICODE)],
  'neg_texts': ['营收净利润双下滑'],
  'pos_rules': [],
  'pos_texts': []},
 {'neg_rules': [re.compile(r'(被|遭|股份|万股).{0,4}(冻结|罚款|重罚)', re.UNICODE)],
  'neg_texts': ['股份已被冻结'],
  'pos_rules': [],
  'pos_texts': []}]

In [None]:
# Fetch (file_uuid, title) for every row of the news table. The query has no
# WHERE clause, so all rows are fetched at once.
data = db.execute(
    '''select file_uuid,title
    from [zx-test].[dbo].[news]
    '''
)

In [67]:
def update_db(d):
    """Store the regex-matched sentiment snippets for one news row.

    The matched texts originate from news titles, so they are sent to the
    server as bound ``?`` parameters instead of being formatted into the SQL
    string: a title containing a single quote would otherwise break the
    statement or inject SQL.

    Args:
        d: Row-like sequence whose first item is the ``file_uuid`` and whose
            second item is the title to analyse.

    Returns:
        None. Rows for which no rule matches are left untouched.
    """
    file_uuid, sentence = d[0], d[1]
    res = rm.process(sentence)
    if not (res['neg_texts'] or res['pos_texts']):
        return
    neg_texts = ','.join(res['neg_texts'])
    pos_texts = ','.join(res['pos_texts'])
    # The cursor is used directly because bound parameters are needed here.
    # NOTE(review): the old N'...' literals are replaced by bound str values;
    # pyodbc is expected to send them as Unicode — confirm on the target driver.
    db.cursor.execute(
        '''update [zx-test].[dbo].[news]
        set neg_texts = ?, pos_texts = ?
        where file_uuid = ?
        ''',
        neg_texts, pos_texts, file_uuid,
    )
    db.conn.commit()
# Apply update_db to every fetched row; the list of None results is discarded.
_ = [update_db(d) for d in data]

### 2. 匹配负面词汇

In [118]:
class BowMatch(object):
    """Bag-of-words sentiment lookup backed by an Excel lexicon.

    The workbook must contain a ``negative`` and a ``positive`` sheet; every
    non-empty text cell of a sheet becomes one lexicon entry.

    Attributes:
        bow_path: Path of the lexicon workbook.
        neg_bow: Set of negative words.
        pos_bow: Set of positive words.
    """

    # Codes returned by process().
    NEGATIVE, POSITIVE, UNKNOWN = 2, 1, 0

    def __init__(self, bow_path='../data/中文金融情感词典_姜富伟等(2020).xlsx'):
        """Load the lexicon from ``bow_path`` (defaults to the bundled dictionary)."""
        self.bow_path = bow_path
        self.init()

    def init(self):
        """(Re)read both sheets of the workbook into ``neg_bow`` / ``pos_bow``."""
        neg_bow = pd.read_excel(self.bow_path, sheet_name='negative')
        pos_bow = pd.read_excel(self.bow_path, sheet_name='positive')
        self.neg_bow = self._clean_words(neg_bow.values.flatten())
        self.pos_bow = self._clean_words(pos_bow.values.flatten())

    @staticmethod
    def _clean_words(values):
        """Return the set of stripped, non-empty strings found in ``values``.

        Non-string cells are skipped: columns of unequal length are padded
        with NaN (a float) by pandas, and calling ``.strip()`` on those
        raised AttributeError.
        """
        return {v.strip() for v in values if isinstance(v, str) and v.strip()}

    def process(self, word):
        """Classify one word: 2 if negative, 1 if positive, 0 otherwise.

        A word present in both sets is reported as negative.
        """
        if word in self.neg_bow:
            return self.NEGATIVE
        elif word in self.pos_bow:
            return self.POSITIVE
        return self.UNKNOWN

    def process_sentence(self, sent):
        """Segment ``sent`` with jieba and collect its lexicon words.

        Returns:
            Tuple ``(neg_words, pos_words)`` of lists, in order of appearance
            (repeated words are kept).
        """
        neg_words, pos_words = [], []
        for word in jieba.cut(sent):
            m = self.process(word)
            if m == self.NEGATIVE:
                neg_words.append(word)
            elif m == self.POSITIVE:
                pos_words.append(word)
        return neg_words, pos_words

In [20]:
def process(filename, chunk_size=1000):
    """Split a raw news JSON file into Excel workbooks prepared for labeling.

    Reads ``../data/raw/<filename>.json`` and writes consecutive chunks of at
    most ``chunk_size`` rows to ``../data/标注/<filename>_<i>.xlsx``. Each row
    carries the lexicon words found in its title; the '规则' column is left
    empty.

    The number of chunks is derived from the data itself, so no empty
    workbook is written when the row count is an exact multiple of
    ``chunk_size`` (or when there is no data at all), which the previous
    ``int(len/1000)+1`` computation did.

    Args:
        filename: Base name (without extension) of the raw JSON file.
        chunk_size: Maximum number of rows per workbook (default 1000).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')
    filepath = '../data/raw/{}.json'.format(filename)
    data = read_json(filepath)
    bm = BowMatch()
    columns = ['symbol', 'publish_date', 'title', '负向词', '正向词', '规则', 'content', 'url']
    for i, left in enumerate(range(0, len(data), chunk_size)):
        new_file = '../data/标注/{}_{}.xlsx'.format(filename, i)
        res = []
        for row in data[left:left + chunk_size]:
            neg, pos = bm.process_sentence(row['title'])
            res.append([row['symbol'], row['publish_date'], row['title'],
                        neg, pos, None, row['content'], row['url']])
        df = pd.DataFrame(res, columns=columns)
        df.to_excel(new_file, index=False)

# Build the labeling workbooks for both raw sources, xueqiu first.
for _source in ('xueqiu', 'sina'):
    process(_source)

### 3. 转化标注的文件为训练格式

In [7]:
# Maps the label id found in the annotated workbooks (compared as a string)
# to the class name written to the training files:
# '1' -> negative, '2' -> neutral, '3' -> positive.
id_to_label = {
    '1': '负面',
    '2': '中性',
    '3': '正面'
}

def read_labeled_files(pattern='../data/标注_0819/sina_*.xlsx'):
    """Collect the (sentence, label) pairs from the annotated workbooks.

    Files are read in sorted order: ``glob`` returns matches in arbitrary
    order, which made the row order of the result, and therefore the seeded
    train/dev shuffle applied to it later, differ between machines.

    Args:
        pattern: Glob pattern selecting the annotated ``.xlsx`` files.

    Returns:
        DataFrame with columns ``sentence`` (third workbook column) and
        ``label`` (sixth workbook column); empty if no file matches.
    """
    labeled_files = sorted(glob(pattern))
    data = []
    for file in labeled_files:
        df = pd.read_excel(file)
        # Columns are selected by position: index 2 and index 5.
        rows = df.iloc[:, [2, 5]].values.tolist()
        data.extend(rows)
    df = pd.DataFrame(data, columns=['sentence', 'label'])
    return df
def _label_key(value):
    """Normalise a raw label cell to the string key used by the label map.

    pandas reads an integer column that contains blanks as float, so a
    label arrives as ``1.0``; ``str(1.0)`` is ``'1.0'``, which is not a key.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def process_labeled_data(df, label_map=None):
    """Clean the annotated rows and translate label ids to class names.

    Steps: drop rows lacking a sentence or a label, remove tab characters
    from the sentences (tab is the column separator of the exported TSV),
    map label ids to names, and discard sentences starting with '投资者提问'.
    The shape of the result is printed. The input frame is not modified and
    the original index values are kept.

    Args:
        df: DataFrame with ``sentence`` and ``label`` columns.
        label_map: Mapping from label id (as a string) to label name;
            defaults to the module-level ``id_to_label``.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If a label id is not present in the mapping.
    """
    if label_map is None:
        label_map = id_to_label
    df_ = df.dropna(subset=['sentence', 'label']).copy()
    df_['sentence'] = df_['sentence'].map(lambda x: x.replace('\t', ''))
    df_['label'] = df_['label'].map(lambda x: label_map[_label_key(x)])
    # astype(bool) keeps the mask boolean even when the frame is empty.
    is_q = df_['sentence'].map(lambda x: x.startswith('投资者提问')).astype(bool)
    df_ = df_[~is_q]
    print(df_.shape)
    return df_
# Load the annotated workbooks and clean them in one step.
df = process_labeled_data(read_labeled_files())

(6215, 2)


In [3]:
# Preview the first rows of the cleaned, labeled frame.
df.head()

Unnamed: 0,sentence,label
0,北向资金高比例持有索菲亚、上海机场等股,正面
3,上海机场：短期业绩承压明显 复苏时间尚难确定,负面
4,深交所连预警：3股将被外资买爆 为何钟爱中国资产？,正面
5,上海机场受沪股通青睐 连续3日净买入,正面
6,首创股份拟发行20亿元可续期公司债券,中性


In [5]:
# Shuffle the labeled rows reproducibly and hold out `test_ratio` of them.
test_ratio = 0.2
np.random.seed(42)

shuffle_index = np.random.permutation(len(df))
# Split at an explicit position. Slicing with a negative count breaks when
# the count rounds down to 0: `[:-0]` is empty and `[-0:]` is everything, so
# a small frame ended up with no training rows at all.
n_test = int(len(df) * test_ratio)
split_at = len(df) - n_test
train_data = df.iloc[shuffle_index[:split_at]]
test_data = df.iloc[shuffle_index[split_at:]]

# The held-out part is written as dev.tsv.
train_data.to_csv('../data/processed/train.tsv', sep ='\t', index=False)
test_data.to_csv('../data/processed/dev.tsv', sep ='\t', index=False)

In [2]:
# df = pd.read_excel('../data/raw/嘉华科技.xlsx')

## 存储为模型预测的格式
# df = df[['title']]
# df.columns = ['sentence']
# df.to_csv('../data/processed/test.tsv', sep ='\t', index=False)

## 合并
# df_pred = pd.read_excel('../data/prediction/st_news_test_results.xlsx')
# df['pred'] = df_pred['pred_label']
# df['probability'] = df_pred['probability']
# df.to_excel('../data/trash/嘉华科技_pred.xlsx', index=False)

### 4. NER公司名提取

In [18]:
# Load the spaCy 'zh_core_web_sm' pipeline once; ner_detect reuses this
# module-level object for every sentence.
nlp = spacy.load('zh_core_web_sm')

def ner_detect(sentence):
    """Return the text of every entity labeled 'ORG' in ``sentence``.

    The sentence is run through the module-level ``nlp`` pipeline; entities
    are returned in the order the pipeline reports them.
    """
    organisations = []
    for entity in nlp(sentence).ents:
        if entity.label_ == 'ORG':
            organisations.append(entity.text)
    return organisations

In [23]:
# Sample headlines for trying out ner_detect. Each assignment overwrites the
# previous one, so only the last sentence is actually analysed.
sent = '浦发银行哈尔滨分行：以金融力量助力企业复工复产'
sent = '华能水电：公司电价的定价收费优先电厂电价执行政府定价 市场化电厂电价由市场化交易形成'
sent = '宝钢股份(600019.SH)：宝山基地四号高炉设备故障 未造成人员伤亡'
res = ner_detect(sent)
print(res)

['宝钢股份(', '宝山基地']


In [32]:
def ner_match(ner, sentence):
    """Return ``ner`` if it occurs verbatim in ``sentence``, otherwise None.

    The entity is matched as literal text. It was previously passed to
    ``re.search`` as a pattern, so an entity containing a regex
    metacharacter, such as the '宝钢股份(' produced by ner_detect, raised
    ``re.error`` or matched the wrong text.

    Args:
        ner: Entity text to look for.
        sentence: Text to search in.
    """
    if ner in sentence:
        return ner
    return None

# Quick check of ner_match: look the entity text up in a sample headline and
# print the result.
ner = '宝山基地'
sent = '宝钢股份(600019.SH)：宝山基地四号高炉设备故障 未造成人员伤亡'
res = ner_match(ner, sent)
print(res)

宝山基地


### 5. 生成词云

In [2]:
def init_jieba(stopwords_file=None, idf_file=None):
    """Configure jieba's keyword extractor.

    Args:
        stopwords_file: Stop-word list passed to ``set_stop_words``;
            skipped when falsy.
        idf_file: IDF table passed to ``set_idf_path``; skipped when falsy.
    """
    analyser = jieba.analyse
    settings = (
        (stopwords_file, analyser.set_stop_words),
        (idf_file, analyser.set_idf_path),
    )
    for path, configure in settings:
        if path:
            configure(path)
# Register the project stop-word list; no custom IDF file is set.
init_jieba('../data/stopwords.txt')

def build_fn(symbol_pattern, offset=0, out_dir='../data/cloud_words'):
    """Build a row handler that writes word-cloud weights for one symbol.

    Args:
        symbol_pattern: Regex applied to the row's ``symbol`` with
            ``findall``; the first result names the output file.
        offset: Number of leading characters to drop from that first result
            (used to strip a fixed prefix matched by the pattern).
        out_dir: Existing directory receiving the ``<name>.txt`` files.

    Returns:
        ``generate_cloud_words(x)``: takes a mapping/Series with ``title``
        and ``symbol`` entries, writes one ``word,weight`` line per keyword
        and returns None. Rows whose symbol does not match write nothing.
    """
    # Compile once here rather than on every row.
    symbol_re = re.compile(symbol_pattern)
    latin_or_digit = re.compile('[a-zA-Z0-9]')

    def generate_cloud_words(x):
        # x: pd.Series
        m = symbol_re.findall(x['symbol'])
        if len(m) == 0:
            # Nothing to name the file after; skip the keyword extraction.
            return
        res = jieba.analyse.extract_tags(x['title'], withWeight=True, topK=300)
        # Keep only keywords free of ASCII letters and digits.
        res = [i for i in res if not latin_or_digit.search(i[0])]
        name = m[0][offset:]
        # The keywords are Chinese text: write UTF-8 explicitly instead of
        # depending on the platform default encoding.
        target = os.path.join(out_dir, '{}.txt'.format(name))
        with open(target, 'w', encoding='utf-8') as fi:
            for i in res:
                fi.write('{},{}\n'.format(i[0], str(i[1])))
    return generate_cloud_words

In [21]:
# 1. Read from local files: load the raw Sina records and build one row per
#    symbol whose 'title' is all of that symbol's titles joined by spaces.
filepath = '../data/raw/{}.json'.format('sina')
data = read_json(filepath)
df = (
    pd.DataFrame(data)
    .groupby('symbol')['title']
    .apply(' '.join)
    .reset_index()
)

In [170]:
# Raw string: `\(` and `\d` are regex escapes, not string escapes, and newer
# Python versions warn about them in a normal string literal.
# The pattern matches '<span>(' followed by digits; offset 7 drops the
# 7-character '<span>(' prefix so only the digits name the output file.
generate_cloud_words = build_fn(r'<span>\(\d+', 7)
_ = df.apply(generate_cloud_words, axis=1)

In [44]:
# 2. Read from sql server: fetch the rows of one symbol and collapse them
#    into a single record (symbol of the first row, all titles space-joined).
data = db.execute(
    '''select title,symbol,stock_name
    from [zx-test].[dbo].[news]
    where symbol = 'SH688051'
    '''
)
data = pd.Series(dict(
    symbol=data[0][1],
    title=' '.join(row[0] for row in data),
))

In [61]:
# `[SH|SZ]` is a character class (any ONE of S, H, |, Z), not the intended
# alternation; it only worked because the last letter of the prefix happens
# to be in the class. `(?:SH|SZ)` matches the exchange prefix itself and the
# capture group still yields just the digits. A raw string keeps `\d` intact.
generate_cloud_words = build_fn(r'(?:SH|SZ)(\d+)')
generate_cloud_words(data)

### 6. 将模型及正则匹配结果保存至数据库

In [None]:
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned sequence-classification checkpoint (config, tokenizer
# and weights) from ckpt_path.
# NOTE(review): '/tmp/st_news' is a machine-local path — confirm the
# checkpoint is available there before running this cell.
ckpt_path = '/tmp/st_news'
config = AutoConfig.from_pretrained(ckpt_path)
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
model_ft = AutoModelForSequenceClassification.from_pretrained(ckpt_path, config=config)
# Put the model in evaluation mode for inference.
model_ft.eval()

def predict(sentence):
    """Classify one sentence with the fine-tuned model.

    The sentence is encoded with the module-level ``tokenizer``, passed to
    ``model_ft`` as a batch of one, and the first model output is turned
    into probabilities with a softmax over the class dimension.

    Returns:
        Tuple ``(index_pred, prob)``: the index of the most probable class
        (int) and its probability (NumPy scalar).
    """
    token_batch = torch.tensor([tokenizer.encode(sentence)])
    with torch.no_grad():
        logits = model_ft(token_batch)[0]

    probabilities = torch.softmax(logits, dim=1)
    best_class = torch.argmax(probabilities[0, :]).item()
    return best_class, probabilities.numpy()[0, best_class]

In [20]:
# Fetch (file_uuid, title) for every row of the news table; the rows are
# consumed by the update_db cell that follows.
data = db.execute(
    '''select file_uuid,title
    from [zx-test].[dbo].[news]
    '''
)

In [33]:
def update_db(d):
    """Store the model prediction for one news row.

    NOTE: this redefines the ``update_db`` of the regex section above;
    whichever cell ran last wins.

    The values are sent as bound ``?`` parameters rather than formatted into
    the SQL text, matching how user-derived text should be written. Label
    and score are still sent as the same strings the formatted statement
    contained (label as ``index + 1``, score with five decimals).

    Args:
        d: Row-like sequence whose first item is the ``file_uuid`` and whose
            second item is the title to classify.
    """
    file_uuid, sentence = d[0], d[1]
    index_pred, prob = predict(sentence)

    # The cursor is used directly because bound parameters are needed here.
    db.cursor.execute(
        '''update [zx-test].[dbo].[news]
        set label = ?, score = ?
        where file_uuid = ?
        ''',
        '{}'.format(index_pred + 1), '{:.5f}'.format(prob), file_uuid,
    )
    db.conn.commit()
# Write a prediction for every fetched row; the list of None results is discarded.
_ = [update_db(d) for d in data]