# Preprocessing 2
Investment Analysis with Machine Learning\
UROP 1100E (Man Yin Michael YEUNG, 2021 Fall)

In [2]:
import pandas as pd
import datetime as dt
import numpy as np
import jieba
import jieba.posseg as pseg
import paddle
import string
import re
from multiprocessing import Pool
from stopwordsiso import stopwords
import warnings
from collections import Counter
warnings.filterwarnings("ignore")
from IPython.display import clear_output

### Read Text Data

In [3]:
# Load the raw text table from parquet; the trailing expression displays
# its (rows, columns) shape.
text_data = pd.read_parquet(path='D:/tsclientdata/anatxt.parquet.gzip')
text_data.shape

(1693001, 11)

In [4]:
# Discard FYEAR first so that rows which differ only in that column collapse
# into a single row when exact duplicates are removed.
text_data = text_data.drop('FYEAR', axis='columns')
text_data = text_data.drop_duplicates()
text_data.shape

(598664, 10)

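**Note:** a Chinese full stop (。) is appended to each title so that the title is separated from the body text when the content is later split into sentences.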

In [5]:
# Index the table by report ID (the ID column moves into the index) and keep
# only rows that have body text. The explicit copy makes the column
# assignments below operate on an independent frame rather than on a slice.
text_data = text_data.set_index('ID')
text_data = text_data.loc[text_data['content'].notna()].copy()

# Terminate every title with a Chinese full stop so the sentence splitter
# separates it from the body. A missing title contributes nothing instead of
# being stringified into a literal "nan。" / "None。" prefix.
text_data['TITLE'] = text_data['TITLE'].map(
    lambda title: '' if pd.isna(title) else str(title) + '。'
)
text_data['content'] = text_data['TITLE'] + text_data['content']

# Keep only the columns used downstream and expose create_date as `date`.
text_data = text_data[['SecuCode', 'create_date', 'content']].rename(columns={'create_date': 'date'})
text_data.head(3)

Unnamed: 0_level_0,SecuCode,date,content
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
166048,600150,2009-10-28,中国船舶：三季度业绩符合预期。 09 年三季度业绩符合预期：\r\n\r\n ...
166049,600426,2009-10-28,华鲁恒升：成本优势体现，三季度业绩符合预期。 三季度业绩符合预期：\r\n\r\n ...
167170,600887,2009-10-30,*ST伊利：毛利率提升推动业绩增长。 2009年1-9月，公司实现营业收入192....


### Sentence Splitting and Word Segmentation (Jieba)

In [6]:
# Replace each document string with the list of fragments obtained by
# splitting at sentence/clause punctuation and at CR, LF and TAB characters.
# The pattern is compiled once and its bound `split` is applied to every row.
text_data['content'] = text_data['content'].map(
    re.compile("！|？|｡|。|，|,|；|\r|\n|\t").split
)

In [7]:
chinese_punct = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.． \r\n\t"
numbers = "1234567890"
# Every character removed before segmentation. Built once so the
# per-character membership test is a set lookup instead of re-concatenating
# four strings for each character of each sentence.
_STRIP_CHARS = frozenset(string.punctuation + numbers + chinese_punct + " ")


def cut_list(list_1):
    """Segment each sentence of a document into a list of word tokens.

    Parameters
    ----------
    list_1 : iterable of str
        The raw sentences (or sentence fragments) of one document.

    Returns
    -------
    list of list of str
        One token list per sentence, in input order. ASCII punctuation,
        the digits 0-9, the characters in ``chinese_punct`` and spaces are
        removed before the sentence is passed to ``jieba.cut`` with
        ``cut_all=False``. Sentences that yield no tokens are omitted.
    """
    ret = []
    for sentence in list_1:
        cleaned = ''.join(c for c in sentence if c not in _STRIP_CHARS)
        tokens = list(jieba.cut(cleaned, cut_all=False))
        if tokens:  # skip sentences that were only punctuation/digits/whitespace
            ret.append(tokens)
    return ret

In [8]:
# Tokenise every document: each row's list of raw sentences becomes a list of
# per-sentence token lists.
text_data['jieba_content'] = text_data['content'].map(cut_list)
# The file name advertises gzip, but DataFrame.to_parquet compresses with
# snappy unless told otherwise; request gzip explicitly so the extension
# matches the file's actual compression.
text_data.to_parquet(
    'D:/michael/dictionary_based/text_jieba_sentences.parquet.gzip',
    compression='gzip',
)
text_data.head(3)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\michael\AppData\Local\Temp\jieba.cache
Loading model cost 1.501 seconds.
Prefix dict has been built successfully.


Unnamed: 0_level_0,SecuCode,date,content,jieba_content
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
166048,600150,2009-10-28,"[中国船舶：三季度业绩符合预期, 09 年三季度业绩符合预期：, , , , ...","[[中国, 船舶, 三季度, 业绩, 符合, 预期], [年, 三季度, 业绩, 符合, 预..."
166049,600426,2009-10-28,"[华鲁恒升：成本优势体现, 三季度业绩符合预期, 三季度业绩符合预期：, , , ...","[[华鲁, 恒升, 成本, 优势, 体现], [三季度, 业绩, 符合, 预期], [三季度..."
167170,600887,2009-10-30,"[*ST伊利：毛利率提升推动业绩增长, 2009年1-9月, 公司实现营业收入1...","[[ST, 伊利, 毛利率, 提升, 推动, 业绩, 增长], [年, 月], [公司, 实..."


### Merge Data Again

In [17]:
# Reload the sentence-level tokens saved earlier and move the ID index back
# into an ordinary column so it can be used as a merge key.
data_2 = pd.read_parquet(
    'D:/michael/dictionary_based/text_jieba_sentences.parquet.gzip'
).reset_index()
data_2.head(3)

Unnamed: 0,ID,SecuCode,date,content,jieba_content
0,166048,600150,2009-10-28,"[中国船舶：三季度业绩符合预期, 09 年三季度业绩符合预期：, , , , ...","[[中国, 船舶, 三季度, 业绩, 符合, 预期], [年, 三季度, 业绩, 符合, 预..."
1,166049,600426,2009-10-28,"[华鲁恒升：成本优势体现, 三季度业绩符合预期, 三季度业绩符合预期：, , , ...","[[华鲁, 恒升, 成本, 优势, 体现], [三季度, 业绩, 符合, 预期], [三季度..."
2,167170,600887,2009-10-30,"[*ST伊利：毛利率提升推动业绩增长, 2009年1-9月, 公司实现营业收入1...","[[ST, 伊利, 毛利率, 提升, 推动, 业绩, 增长], [年, 月], [公司, 实..."


In [5]:
# Load the table stored as preprocessed_data_1 and preview its first rows.
data_1 = pd.read_parquet(path='D:/michael/full_version_data/preprocessed_data_1.parquet.gzip')
data_1.head(3)

Unnamed: 0,ID,SecuCode,date,specret,specret_2d,specret_tp2-tp6,jieba_content
0,181899,600323,2010-01-04,0.006658,-0.001522,-0.0303,"[南海发展nt, 水价n, 如期t, 上调v, 增厚v, 业绩n, 约d, 事项n, 公司n..."
1,181900,2024,2010-01-04,-0.004292,-0.022783,-0.047744,"[苏宁ns, 电器n, 内外s, 兼修v, 进入v, 新一轮nz, 增长期n, 投资vn, ..."
2,181901,2092,2010-01-04,0.030815,0.018962,0.05539,"[泰ns, 化学n, 发布v, 准东ns, 井田nr, 勘探vn, 报告n, 煤炭n, 开发..."


In [34]:
# Parse the 'YYYY-MM-DD' strings into datetimes in one vectorised call.
# Unlike a per-row strptime, this also tolerates the cell being re-run on an
# already converted column.
data_1['date'] = pd.to_datetime(data_1['date'], format='%Y-%m-%d')

In [18]:
# data_1 already has a column called jieba_content, so give the
# sentence-level column its own name before the two tables are merged.
data_2 = data_2.rename({'jieba_content': 'jieba_sentence'}, axis='columns')

In [44]:
# Inner-join the two tables: only rows whose (ID, SecuCode, date) key is
# present in both data_1 and data_2 survive.
merged_data = data_1.merge(
    data_2,
    on=['ID', 'SecuCode', 'date'],
    how='inner',
)
merged_data.head(3)

Unnamed: 0,ID,SecuCode,date,specret,specret_2d,specret_tp2-tp6,jieba_content,content,jieba_sentence
0,181899,600323,2010-01-04,0.006658,-0.001522,-0.0303,"[南海发展nt, 水价n, 如期t, 上调v, 增厚v, 业绩n, 约d, 事项n, 公司n...","[南海发展：水价如期上调, 增厚业绩约10％, 事项：, , , , 公...","[[南海发展, 水价, 如期, 上调], [增厚, 业绩, 约], [事项], [公司, 今..."
1,181900,2024,2010-01-04,-0.004292,-0.022783,-0.047744,"[苏宁ns, 电器n, 内外s, 兼修v, 进入v, 新一轮nz, 增长期n, 投资vn, ...","[苏宁电器：“内外兼修”进入新一轮增长期, 投资要点, , , , 行业...","[[苏宁, 电器, 内外, 兼修, 进入, 新一轮, 增长期], [投资, 要点], [行业..."
2,181901,2092,2010-01-04,0.030815,0.018962,0.05539,"[泰ns, 化学n, 发布v, 准东ns, 井田nr, 勘探vn, 报告n, 煤炭n, 开发...","[中泰化学：发布准东一井田勘探报告, 煤炭开发再推进一步, 事件描述：, , ,...","[[中, 泰, 化学, 发布, 准东, 一, 井田, 勘探, 报告], [煤炭, 开发, 再..."


In [45]:
# Keep the keys, the three return columns and both token columns, in this
# order; any other column of the merged frame is left out.
merged_data = merged_data.loc[:, [
    'ID',
    'SecuCode',
    'date',
    'specret',
    'specret_2d',
    'specret_tp2-tp6',
    'jieba_content',
    'jieba_sentence',
]]
merged_data.head(3)

Unnamed: 0,ID,SecuCode,date,specret,specret_2d,specret_tp2-tp6,jieba_content,jieba_sentence
0,181899,600323,2010-01-04,0.006658,-0.001522,-0.0303,"[南海发展nt, 水价n, 如期t, 上调v, 增厚v, 业绩n, 约d, 事项n, 公司n...","[[南海发展, 水价, 如期, 上调], [增厚, 业绩, 约], [事项], [公司, 今..."
1,181900,2024,2010-01-04,-0.004292,-0.022783,-0.047744,"[苏宁ns, 电器n, 内外s, 兼修v, 进入v, 新一轮nz, 增长期n, 投资vn, ...","[[苏宁, 电器, 内外, 兼修, 进入, 新一轮, 增长期], [投资, 要点], [行业..."
2,181901,2092,2010-01-04,0.030815,0.018962,0.05539,"[泰ns, 化学n, 发布v, 准东ns, 井田nr, 勘探vn, 报告n, 煤炭n, 开发...","[[中, 泰, 化学, 发布, 准东, 一, 井田, 勘探, 报告], [煤炭, 开发, 再..."


In [46]:
# Convert each row's outer sentence container into a plain Python list
# (only the outer level is converted).
merged_data['jieba_sentence'] = merged_data['jieba_sentence'].map(list)

In [48]:
# Display the (rows, columns) shape of the merged table as a sanity check.
merged_data.shape

(497665, 8)

In [47]:
# Persist the merged table. The file name advertises gzip, but
# DataFrame.to_parquet compresses with snappy unless told otherwise, so the
# codec is requested explicitly to match the extension.
merged_data.to_parquet(
    'D:/michael/full_version_data/preprocessed_data_2.parquet.gzip',
    compression='gzip',
)