In [None]:
# load_data

import glob

ARTICLE_FOLDER = '../2018_research_data/article'
LABEL_FOLDER = '../2018_research_data/label'

article = []
label = []

# glob.glob() returns paths in arbitrary order. Articles and labels are paired
# purely by list position later on, so both listings are sorted to keep
# article[i] and label[i] aligned.
# NOTE(review): assumes article and label files share matching base names so
# that the sorted orders correspond -- confirm against the data folder.
for file in sorted(glob.glob(ARTICLE_FOLDER + "/*.txt")):
    with open(file, "r", encoding="utf8") as f:
        article.append(f.read())

for file in sorted(glob.glob(LABEL_FOLDER + "/*.txt")):
    with open(file, "r", encoding="utf8") as f:
        # One tag per line. splitlines() keeps the final tag even when the file
        # has no trailing newline; blank lines are ignored.
        tags = [line.strip() for line in f.read().splitlines() if line.strip()]
        label.append(tags)

# Preprocessing

In [64]:
import re
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

class Preprocessing:
    """Text clean-up pipeline: tokenise, drop stop words, lemmatise.

    Uses the NLTK word tokenizer, the NLTK English stop-word list and the
    WordNet lemmatizer.
    """

    def __init__(self):
        self.stopwords = self._load_stopwords()

    def text_preprocessing(self, article_list):
        '''Clean every text in a list.
            Parameters
                article_list {list} raw texts (str)
            Return {list} one cleaned string per input text; tokens are joined
                by a single space
        '''
        token_list = self.tokenize(article_list)
        token_list = self.remove_stopwords(token_list)
        # POS filtering (select_POS) exists but is not part of the pipeline.
        token_list = self.lemmatization(token_list)

        # concatenate the tokens of each text back into one string
        return [' '.join(tokens) for tokens in token_list]

    def _load_stopwords(self):
        '''Return the NLTK English stop words plus project extras as a list.'''
        my_stopwords = []  # add project-specific stop words here
        return list(set(stopwords.words('english'))) + my_stopwords

    def cut_sent(self, text:str):
        '''Split an article into sentences.
            Parameters
                text {str} one article
            Return {list} the pieces of the text between "." characters
                (may contain empty strings; callers filter them)
        '''
        # A line break separates words, so it becomes a space. Deleting it
        # would fuse the last word of one line with the first word of the next.
        text = text.replace('\n', ' ')
        text = text.strip()
        return text.split(".")

    def tokenize(self, article_list:list):
        '''Tokenise each text; keep purely alphabetic tokens, lower-cased.
            Parameters
                article_list {list} texts (str)
            Return {list} one token list per text
        '''
        result_list = []
        for article in article_list:
            tokens = nltk.word_tokenize(article)
            result_list.append([w.lower() for w in tokens if w.isalpha()])
        return result_list

    def remove_stopwords(self, tokenized_article_list:list):
        '''Drop every token found in self.stopwords.
            Parameters
                tokenized_article_list {list} token lists
            Return {list} token lists without stop words
        '''
        # Build the set once per call: membership tests on the list would scan
        # every stop word for every token.
        stopword_set = set(self.stopwords)
        return [[w for w in tokens if w not in stopword_set]
                for tokens in tokenized_article_list]

    def lemmatization(self, tokenized_article_list:list):
        '''Lemmatise every token with the WordNet lemmatizer.
            Parameters
                tokenized_article_list {list} token lists
            Return {list} token lists of lemmas
        '''
        wnl = WordNetLemmatizer()
        return [[wnl.lemmatize(word) for word in tokens]
                for tokens in tokenized_article_list]

    def select_POS(self, tokenized_article_list,
                    selective_POS=('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ')):
        '''Keep only the tokens whose POS tag is in selective_POS.
            Parameters
                tokenized_article_list {list} token lists
                selective_POS {iterable} POS tags to keep
            Return {list} filtered token lists
        '''
        wanted = set(selective_POS)
        result_list = []
        for tokens in tokenized_article_list:
            result_list.append([word for word, tag in nltk.pos_tag(tokens) if tag in wanted])
        return result_list

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jensonsu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean the loaded articles (tokenise, drop stop words, lemmatise).
# Note: `article` is overwritten, so re-running this cell re-processes
# text that has already been cleaned.

prep = Preprocessing()
article = prep.text_preprocessing(article_list=article)

# Word2Vec

In [None]:
# Pre-process the corpus sentence by sentence and write it to seg_sent.txt,
# one cleaned sentence per line.

ARTICLE_FOLDER = '../2018_research_data/article'
SEG_SENT_PATH = './seg_sent.txt'

with open(SEG_SENT_PATH, 'w', encoding='utf8') as output:
    # sorted() makes the output file reproducible; glob order is arbitrary.
    for file in sorted(glob.glob(ARTICLE_FOLDER + "/*.txt")):
        with open(file, "r", encoding='utf8') as f:
            text = f.read()

        for sent in prep.cut_sent(text):
            sent = prep.text_preprocessing([sent])[0]
            # text_preprocessing always returns a str, so the length test alone
            # filters empty and very short sentences. The previous
            # `sent is not ''` compared object identity, not content.
            if len(sent) > 3:
                output.write(sent)
                output.write('\n')

In [None]:
from gensim.models import word2vec
import multiprocessing

class Word2Vec(Preprocessing):
    """Build, train, persist and query a gensim word2vec model for the article corpus."""

    def __init__(self):
        super().__init__()

        self.ARTICLE_FOLDER = '../2018_research_data/article'
        self.SEG_TEXT_PATH = './seg_sent.txt'

        # gensim hyper-parameters; each can be overridden through train()
        self.vector_size = 50
        self.window = 3
        self.min_count = 0
        self.sg = 0
        self.negative= 5

        self.model = None

    def preprocess_and_save_seg_text(self):
        '''Clean every article sentence by sentence and write the result to
        self.SEG_TEXT_PATH, one sentence per line.'''
        with open(self.SEG_TEXT_PATH, 'w', encoding='utf8') as output:
            # Read from the instance's folder (not the notebook-level global)
            # so the object works no matter which cells have been run.
            for file in sorted(glob.glob(self.ARTICLE_FOLDER + "/*.txt")):
                with open(file, "r", encoding='utf8') as f:
                    text = f.read()

                for sent in self.cut_sent(text):
                    sent = self.text_preprocessing([sent])[0]
                    # text_preprocessing always returns a str; the length test
                    # alone removes empty and very short sentences.
                    if len(sent) > 3:
                        output.write(sent)
                        output.write('\n')
        print('Successfully perprocess and save articles into seg text format.')

    def train(self, vector_size=None, window=None, min_count=None, sg=None, negative=None):
        '''Train word2vec on the sentences stored in self.SEG_TEXT_PATH.
            Parameters
                vector_size, window, min_count, sg, negative
                    optional overrides of the stored hyper-parameters; None
                    keeps the current value
            The trained model is stored in self.model.
        '''
        import inspect

        # Compare with None rather than truthiness so that an explicit 0
        # (e.g. min_count=0, sg=0, negative=0) is honoured.
        if vector_size is not None:
            self.vector_size = vector_size
        if window is not None:
            self.window = window
        if min_count is not None:
            self.min_count = min_count
        if sg is not None:
            self.sg = sg
        if negative is not None:
            self.negative = negative

        # gensim 4 renamed the dimensionality argument from `size` to
        # `vector_size`; use whichever name the installed version accepts.
        init_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
        size_kw = 'vector_size' if 'vector_size' in init_params else 'size'

        sentences = word2vec.LineSentence(self.SEG_TEXT_PATH)
        self.model = word2vec.Word2Vec(sentences, window=self.window,
                                       min_count=self.min_count, workers=multiprocessing.cpu_count(),
                                       sg=self.sg, negative=self.negative,
                                       **{size_kw: self.vector_size})
        print('Successfully train word2vec model!')

    def save(self, path='./w2v_model'):
        '''Save the model to "<path>.model".'''
        self.model.save(path+".model")
        print('Successfully save word2vec model!')

    def load(self, path='./w2v_model'):
        '''Load a model from "<path>.model" into self.model.'''
        self.model = word2vec.Word2Vec.load(path+".model")
        print('Successfully load word2vec model!')

    def map_word_2_embedding(self):
        '''Return {dict} mapping each vocabulary word to its embedding vector.'''
        wv = self.model.wv
        # gensim 4 exposes the vocabulary as `index_to_key`; gensim 3 as `index2word`.
        vocab = getattr(wv, 'index_to_key', None)
        if vocab is None:
            vocab = wv.index2word
        return dict(zip(vocab, wv.vectors))

In [None]:
# Create the word2vec wrapper and write the cleaned, sentence-split corpus
# to the file named by its SEG_TEXT_PATH attribute.
w2v = Word2Vec()
w2v.preprocess_and_save_seg_text()

In [None]:
# Train word2vec with the hyper-parameters stored on the wrapper.
w2v.train()

In [None]:
# Word -> vector lookup. The embedding-matrix cell further down reads this as
# `embedding_index`, so it is published under that name.
embedding_index = w2v.map_word_2_embedding()
indx = embedding_index  # previous name, kept for backward compatibility
w2v.save()

# Keras LSTM


In [45]:
import glob
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn import model_selection
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM


class lstm_SDGs_classifier:
    def __init__(self):
        # data folder
        self.ARTICLE_FOLDER = '../2018_research_data/article'
        self.LABEL_FOLDER = '../2018_research_data/label'
        
        # raw data
        self._articles = [] # raw articles data
        self._labels = [] # raw labels data
        
        # training data
        self._train_text = []
        self._train_y_one_hot = []
        
        # valid data
        self._valid_text = []
        self._valid_y_one_hot = []
        
        # token
        self.tokenizer = []
        self.vocab_size = 0
        
        # bag of words
        self._embed_dim = 150 # 詞向量維度
        self._feature_dim = 500 # 字典內的單字數
        self._max_len = 100 # 每句話最大長度
        
        # model
        self.model = None
        
    def load_data(self, train_data=[], valid_data=[]):
        # load memory data
        if train_data and valid_data:
            self._articles = train_data
            self._labels = valid_data
        else:
            for file in glob.glob(self.ARTICLE_FOLDER + "/*.txt"):
                with open(file, "r") as f:
                    self._articles.append(f.read())

            for file in glob.glob(self.LABEL_FOLDER + "/*.txt"):
                with open(file, "r") as f:
                    tags = f.read().split('\n')[:-1]
                    self._labels.append(tags)
                
        print('Successfully load data.')
                
    def preprocessing(self, feature_dim=1000, max_len=100):
        # set feature_dim & max_len
        self._feature_dim = feature_dim
        self._max_len = max_len
        
        # split the dataset into training and testing 
        train_x, valid_x, train_y, valid_y = model_selection.train_test_split(self._articles,
                                                        self._labels, test_size=0.1, random_state=1)
        
        # build token
        self.tokenizer = Tokenizer(num_words=self._feature_dim)
        self.tokenizer.fit_on_texts(train_x)
        self.vocab_size = len(self.tokenizer.word_index) + 1
        print('vocab_size:', self.vocab_size)
        
        # pre-process train & valid x: text to seq & pad seq
        self._train_text = self._text_vectorization(train_x)
        self._valid_text = self._text_vectorization(valid_x)
        
        # pre-process train & valid y: label one-hot encoding
        self._train_y_one_hot = self._label_oneHot_encoding(train_y)
        self._valid_y_one_hot = self._label_oneHot_encoding(valid_y)
        
        print('Successfully pre-process the raw data.')
    
    def set_up_model(self, embed_dim=150, embedding_matrix=None):
        self._embed_dim = embed_dim
        
        model = Sequential()
        
        if embedding_matrix is not None:
            print('load in pre-train embedding.')
            model.add(Embedding(input_dim=self.vocab_size, output_dim=self._embed_dim, 
                                weights=[embedding_matrix], trainable=False, input_length=self._max_len))
        else:
            model.add(Embedding(input_dim=self.vocab_size, output_dim=self._embed_dim, input_length=self._max_len))
        
        
        model.add(LSTM(128))
        model.add(Dense(units=128, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(units=128, activation='relu'))
        model.add(Dense(units=17, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.summary()
        self.model = model
        
        print('Successfully set up model, everything is already!')
        
    def train(self, batch_size=256, epochs=50, verbose=2, shuffle=True, validation_split=0.1):
        train_history = self.model.fit(self._train_text, self._train_y_one_hot, batch_size=batch_size,
                            epochs=epochs, verbose=verbose, shuffle=shuffle, validation_split=validation_split)
        return train_history
    
    def show_train_history(self, train_history, train, validation):
        plt.plot(train_history.history[train])
        plt.plot(train_history.history[validation])
        plt.title('Train History')
        plt.ylabel(train)
        plt.xlabel('Epoch')
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()
        
    def evaluate_model(self):
        scores = self.model.evaluate(self._valid_text, self._valid_y_one_hot)
        print('Accuracy:', scores[1])
        
    def save_tokenizer(self, tokenizer_path='my_tokenizer'):
        with open(tokenizer_path+'.pickle', 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Successfully save tokenizer')
        
    def load_tokenizer(self, tokenizer_path):
        with open(tokenizer_path+'.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        
    def save_model(self, model_path='my_model'):
        self.model.save(model_path)
        print('Successfully save model')
        
    def load_model(self, model_path):
        self.model = load_model(model_path)
        
    def predict(self, text):
        vector_text = self._text_vectorization([text])
        # print('vector_text:', vector_text)
        predict_soft = self.model.predict(vector_text)
        
        result = self._pred_2_goal(predict_soft)
        return result
        
    def _text_vectorization(self, text):
        text_seq = self.tokenizer.texts_to_sequences(text)
        # print('text_seq:', text_seq)
        return sequence.pad_sequences(text_seq, maxlen=self._max_len)
        
    def _label_oneHot_encoding(self, labels, label_dim=17):
        one_hot_label = []
        for label in labels:
            arr = label_dim*[float(0)]
            for i in label:
                arr[int(i)-1] = float(1)
            one_hot_label.append(arr)
        return np.array(one_hot_label)
    
    def _pred_2_goal(self, pred_result):
        obj = {}
        for i in range(1, 18):
            obj[i] = pred_result[0][i-1]
        
        sorted_list = sorted(obj.items(), key=lambda k: k[1], reverse=True) 
        return sorted_list

In [None]:
# Feed the in-memory articles and labels to a fresh classifier, then split,
# tokenise and vectorise them.
lstm_classifier = lstm_SDGs_classifier()
lstm_classifier.load_data(train_data=article, valid_data=label)
lstm_classifier.preprocessing(feature_dim=500, max_len=100)

In [None]:
# Load the word2vec pre-trained embeddings into the classifier.

# Word -> vector lookup, taken directly from the word2vec wrapper so that this
# cell does not depend on a variable left behind by an earlier cell.
embedding_index = w2v.map_word_2_embedding()

# The embedding width has to equal the width of the word2vec vectors, so read
# it from the model instead of repeating the number here.
embed_dim = w2v.model.wv.vector_size

# create a weight matrix for words in training docs; rows of words that have
# no word2vec vector stay all-zero
embedding_matrix = np.zeros((lstm_classifier.vocab_size, embed_dim))
for word, index in lstm_classifier.tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

lstm_classifier.set_up_model(embed_dim=embed_dim, embedding_matrix=embedding_matrix)

In [None]:
# Train the classifier; the returned history object is plotted in the next cell.
history = lstm_classifier.train(batch_size=128, epochs=30)

In [None]:
# Plot training vs. validation curves, first for accuracy and then for loss.
# NOTE(review): the history keys 'accuracy' / 'val_accuracy' depend on the
# installed Keras version (older releases may use 'acc' / 'val_acc') -- confirm.
lstm_classifier.show_train_history(history, train='accuracy', validation='val_accuracy')
lstm_classifier.show_train_history(history, train='loss', validation='val_loss')

In [None]:
# Print the accuracy on the held-out validation split.
lstm_classifier.evaluate_model()

In [None]:
text = '''This study separates from two major aspects, and one is industry expanding, another is business operation. According to the work experiences and the professionals’ comments, these two aspects will influence stock price. First, this study investigates the overall review and uses the Diamond model, and finds out which elements will change in the same direction with the product value of Taiwan IC design industry. And this aspect generalizes 11 industry expanding factors. Second, my research uses the five force analysis model and value chain model, and finds out which elements will change in the same direction with the EPS of MediaTek. And this aspect generalizes 29 corporate’s earning growth factors.In the end, my research selects 19 impact factor of stock price trend. And these are selected from the two aspects, which will change in the same direction with the stock price. According to the 19 factors, this study constructs an analysis framework. This framework concludes two aspects and ten elements. One is the industry aspect, and its elements are global industry, Taiwan industry, related industries, and product demand. Another is the corporate aspect, and its elements are enterprise profit, business operations, core of competition, major competitors, major customers, and major suppliers.'''

In [None]:
# Clean the sample text with the same pipeline used for the training articles.
text = prep.text_preprocessing([text])[0]
print(text)

# The cell output is the list of (goal number, score) pairs, highest score first.
lstm_classifier.predict(text)

In [None]:
# Persist the trained Keras model under this path.
lstm_classifier.save_model('20201027_w2v_lstm_model_2')

In [None]:
# Persist the fitted tokenizer; it is written to '20201027_token.pickle'.
lstm_classifier.save_tokenizer('20201027_token')

# Load Model & Predict

In [3]:
text = '''This study separates from two major aspects, and one is industry expanding, another is business operation. According to the work experiences and the professionals’ comments, these two aspects will influence stock price. First, this study investigates the overall review and uses the Diamond model, and finds out which elements will change in the same direction with the product value of Taiwan IC design industry. And this aspect generalizes 11 industry expanding factors. Second, my research uses the five force analysis model and value chain model, and finds out which elements will change in the same direction with the EPS of MediaTek. And this aspect generalizes 29 corporate’s earning growth factors.In the end, my research selects 19 impact factor of stock price trend. And these are selected from the two aspects, which will change in the same direction with the stock price. According to the 19 factors, this study constructs an analysis framework. This framework concludes two aspects and ten elements. One is the industry aspect, and its elements are global industry, Taiwan industry, related industries, and product demand. Another is the corporate aspect, and its elements are enterprise profit, business operations, core of competition, major competitors, major customers, and major suppliers.'''

In [5]:
# Clean the sample text with the preprocessing pipeline.
prep = Preprocessing()
text = prep.text_preprocessing([text])[0]
print(text)

# Restore the saved tokenizer and model into a fresh classifier.
lc = lstm_SDGs_classifier()
lc.load_tokenizer('20201027_token')
lc.load_model('20201027_w2v_lstm_model_2')

# Predict; the cell output is the list of (goal number, score) pairs, highest score first.
lc.predict(text)

Successfully tokenized!
Successfully remove stopwords!
Successfully lemmatization!
study separate two major aspect one industry expanding another business operation according work experience professional comment two aspect influence stock price first study investigates overall review us diamond model find element change direction product value taiwan ic design industry aspect generalizes industry expanding factor second research us five force analysis model value chain model find element change direction eps mediatek aspect generalizes corporate earning growth end research selects impact factor stock price trend selected two aspect change direction stock price according factor study construct analysis framework framework concludes two aspect ten element one industry aspect element global industry taiwan industry related industry product demand another corporate aspect element enterprise profit business operation core competition major competitor major customer major supplier
[[0.021281

[(9, 0.5381969),
 (8, 0.24569705),
 (11, 0.22825849),
 (17, 0.20907578),
 (3, 0.15945989),
 (4, 0.1329284),
 (12, 0.121085614),
 (16, 0.08298901),
 (15, 0.06441569),
 (7, 0.06425646),
 (10, 0.05701056),
 (13, 0.036898226),
 (1, 0.0212816),
 (6, 0.018139362),
 (5, 0.016872019),
 (14, 0.014728785),
 (2, 0.0076040328)]

# SDGs Classifier Precision Calculation

In [6]:
import os

import pymongo

# The connection string contains the database user and password, so it must
# come from the environment and never be written into the notebook.
# SECURITY: a password was previously committed in this cell and should be
# rotated.
MONGO_URI = os.environ.get('MONGO_URI')
if not MONGO_URI:
    raise RuntimeError('Set the MONGO_URI environment variable to the MongoDB connection string.')

# connect=False defers the actual connection until the first operation.
client = pymongo.MongoClient(MONGO_URI, connect=False)
db = client['nthu_sdg_db']

user_collect = db['users']
paper_collect = db['papers']

robot_paper_collect = db['robot_papers']

In [13]:
# Fetch every user together with the matching `papers` documents.

# $lookup joins users to papers on the shared `account` field; the matches
# are attached to each user document as the `paper_doc` list.
lookup_stage = {
    '$lookup': {
        'from': "papers",
        'localField': "account",
        'foreignField': "account",
        'as': "paper_doc"
    }
}
agg_doc = user_collect.aggregate([lookup_stage])
agg_doc_list = list(agg_doc)

In [73]:
class Evaluate(Preprocessing):
    """Measure how often the classifier's top predictions hit the goals chosen by users, per college and overall."""

    def __init__(self):
        super().__init__()

        self.college = ['COTM', 'EECS', 'ENGI', 'CHS', 'NUCL', 'THC', 'SCI', 'LSCO', 'HCTC', 'TE', 'OAA', 'CARS']
        self.evalute_dict = {}

    def set_evaluate_format(self, format=()):
        '''Reset the result dict: one entry per college, every key in `format` set to 0.
            Parameters
                format {iterable} names of the counters to create per college
        '''
        self.evalute_dict = {c: {f: 0 for f in format} for c in self.college}
        print('Successfully set evaluate dict.')

    def calculate_hit(self, classifier, paper_list):
        '''Share of papers for which one of the top-3 predicted goals matches a goal selected by the user.
            Parameters
                classifier  object whose predict(text) returns (goal, score)
                            pairs, best first
                paper_list {list} paper dicts with the keys 'summaryEN',
                            'tags' (items holding 'goal_id') and 'college'
            Results are written into self.evalute_dict: 'paper_count',
            'hit_count' and 'percision' per college plus a top-level
            'percision' for all papers.
        '''
        total_hit_count = 0

        # Start every run from zero. The overall figure below only covers this
        # call, so the per-college counters must not carry over either, and a
        # top-level 'percision' left by an earlier run must not be mistaken
        # for a college entry.
        self.evalute_dict.pop('percision', None)
        for stats in self.evalute_dict.values():
            if isinstance(stats, dict):
                stats['hit_count'] = 0
                stats['paper_count'] = 0

        # calculate paper
        for paper in paper_list:
            text = self.text_preprocessing([paper['summaryEN']])[0]

            user_tags = {str(t['goal_id']) for t in paper['tags']}
            pred_tags = [str(t[0]) for t in classifier.predict(text)[:3]]

            # Colleges that were not initialised get an entry on first use
            # instead of raising KeyError.
            stats = self.evalute_dict.setdefault(paper['college'], {})
            stats.setdefault('hit_count', 0)
            stats.setdefault('paper_count', 0)

            # a paper counts at most once, however many goals match
            if any(tag in user_tags for tag in pred_tags):
                total_hit_count += 1
                stats['hit_count'] += 1
            stats['paper_count'] += 1

        # calculate precision per college; a college without papers gets 0.0
        # rather than a division by zero
        for stats in self.evalute_dict.values():
            if not isinstance(stats, dict):
                continue
            paper_count = stats['paper_count']
            if paper_count:
                stats['percision'] = round(stats['hit_count']/paper_count*100, 2)
            else:
                stats['percision'] = 0.0

        # overall precision; an empty paper_list yields 0.0
        if paper_list:
            self.evalute_dict['percision'] = round(total_hit_count/len(paper_list)*100, 2)
        else:
            self.evalute_dict['percision'] = 0.0

    def percision(self, classifier, paper_list, mode='hit'):
        '''Run the evaluation selected by `mode` and return the result dict.
            Parameters
                mode {str} only 'hit' is implemented; any other value returns
                    the dict unchanged
            Return {dict} self.evalute_dict
        '''
        # compare string content; `is` would test object identity
        if mode == 'hit':
            self.calculate_hit(classifier, paper_list)
        return self.evalute_dict

In [43]:
# Collect all tagged papers.

paper_list = []

for user in agg_doc_list:
    # This account is excluded from the evaluation.
    # NOTE(review): the reason is not visible here -- presumably a test or
    # developer account; confirm.
    if user['account']=='music1353@gmail.com':
        continue

    # `paper_doc` is empty for a user without a matching papers document;
    # indexing [0] would raise IndexError.
    if not user['paper_doc']:
        continue

    papers = user['paper_doc'][0]['papers']
    college = user['college']

    for p in papers:
        # keep only papers that have been tagged
        if p['isTag'] is False:
            continue

        # Attach the college on a copy so the aggregated documents stay as
        # they were fetched.
        paper_list.append(dict(p, college=college))

In [74]:
# Prepare the per-college counters used by the hit evaluation.
evaluate = Evaluate()
evaluate.set_evaluate_format(['paper_count', 'hit_count', 'percision'])

# Restore the saved tokenizer and model into a fresh classifier.
lc = lstm_SDGs_classifier()
lc.load_tokenizer('20201027_token')
lc.load_model('20201027_w2v_lstm_model_2')

# Run the evaluation over all tagged papers; the result dict is the cell output.
result = evaluate.percision(lc, paper_list, mode='hit')
result

Successfully set evaluate dict.


{'COTM': {'paper_count': 440, 'hit_count': 253, 'percision': 57.5},
 'EECS': {'paper_count': 600, 'hit_count': 431, 'percision': 71.83},
 'ENGI': {'paper_count': 1025, 'hit_count': 741, 'percision': 72.29},
 'CHS': {'paper_count': 208, 'hit_count': 102, 'percision': 49.04},
 'NUCL': {'paper_count': 527, 'hit_count': 332, 'percision': 63.0},
 'THC': {'paper_count': 94, 'hit_count': 68, 'percision': 72.34},
 'SCI': {'paper_count': 365, 'hit_count': 236, 'percision': 64.66},
 'LSCO': {'paper_count': 309, 'hit_count': 212, 'percision': 68.61},
 'HCTC': {'paper_count': 545, 'hit_count': 374, 'percision': 68.62},
 'TE': {'paper_count': 6, 'hit_count': 5, 'percision': 83.33},
 'OAA': {'paper_count': 2, 'hit_count': 1, 'percision': 50.0},
 'CARS': {'paper_count': 35, 'hit_count': 13, 'percision': 37.14},
 'percision': 66.6}