## Predict Topics with LDA

In [1]:
# test data
test_data_list = ['[WFBS-SVC] Installation on Mac Is Not Full Featured',
                  '[WFBS - A] Smart Scan is not updating',
                  '[WFBS - SVC] Installation issue',
                  '[MALWARE][WFBS SVC] Wallet Ransomware',
                  '[MALWARE][WFBS S 9.5]Possible Ransomware detection on clients machine',
                 ]


In [2]:
import pandas as pd
df = pd.DataFrame(test_data_list, columns=['Subject'])
df.head(10)

Unnamed: 0,Subject
0,[WFBS-SVC] Installation on Mac Is Not Full Fea...
1,[WFBS - A] Smart Scan is not updating
2,[WFBS - SVC] Installation issue
3,[MALWARE][WFBS SVC] Wallet Ransomware
4,[MALWARE][WFBS S 9.5]Possible Ransomware detec...


In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#from nltk.stem.porter import PorterStemmer
#from nltk.stem.snowball import SnowballStemmer

import krovetz
from nltk.tokenize import word_tokenize

stop_words=stopwords.words('english')
break_words=list('\n\t')
prefix_char_remove=list('.*,-')

stemmer = krovetz.PyKrovetzStemmer()
#stemmer = SnowballStemmer("english")
#stemmer.stem('working')
#lemmatizer = WordNetLemmatizer()
word_freqs = dict()


## 以 regular expression 去除 email、數字、URL、Phone No、序號

In [4]:
re_pattern_list=[]
re_pattern_list.append('[a-zA-Z0-9_]+\.(com|org|net)[a-zA-Z0-9_\.]*$') #email
re_pattern_list.append('[0-9_]+') # all digits
re_pattern_list.append("\/\/[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$") # url
re_pattern_list.append('[0-9\-]+') # phone no
re_pattern_list.append('[\u0800-\u4e00]+') # 日文
re_pattern_list.append('(trend|trend\smicro)') # trend micro


## 字詞清理函數

In [5]:
# clean words
def clean_word(word_0):
    word_list=[]
    for word in word_0.split('/'):

        if len(word) <= 0 :
            continue

        # remove 網路芳鄰路徑
        remote_path=word.split('\\')
        if len(remote_path) > 0 and len(remote_path[-1])>0:
            word=remote_path[-1]
        elif len(remote_path) > 1 and len(remote_path[-2])>0:
            word=remote_path[-2]
        elif len(remote_path) > 2 and len(remote_path[-3])>0:
            word=remote_path[-3]

        if word[0] == '"' and word[-1] == '"':
            word=word[1:-1]
        if word[0] == "'" and word[-1] == "'":
            word=word[1:-1]
        if len(word) <= 0 :
            continue

        # repeat 3 times to remove continous chars
        if word[0] in prefix_char_remove:
            word=word[1:]
        if len(word) <= 0 :
            continue
        if word[0] in prefix_char_remove:
            word=word[1:]
        if len(word) <= 0 :
            continue
        if word[0] in prefix_char_remove:
            word=word[1:]

        # lemmatize
        word=word.strip().lower()
        word=stemmer.stem(word)
        if word=='':
            continue      
            
        # remove .exe and .com
        if word.endswith('.exe') or word.endswith('.com'):
            word=word[:-4]
            
        # 以 regular expression 去除 email、數字、URL、Phone No、序號
        is_re_list = False
        for pattern1 in re_pattern_list:
            result=re.findall(pattern1, word)
            if  len(result) > 0:
                is_re_list = True
                break
        if is_re_list:
            break
            
            
        # remove the words with one or two characters only
        if len(word) <= 2 :
            continue
        if not (word in stop_words):
            word_list.append(word)
               
    return word_list

## 分詞

In [6]:
import re
pattern1 = '\[[a-zA-Z0-9_\-\s]+\]' # service_list pattern, e.g. [WFBS-SVC]

# 清理後的 email DESCRIPTION
clean_corpus=[]
original_corpus=[]
for index, line in df.iterrows():
    clean_line=""
    line = line['Subject']
    
    # remove service_list, e.g. [WFBS-SVC]
    if type(line) == str and len(line) > 0: 
        service_list = re.findall(pattern1, line)
        for service_item in service_list:
            line = line.replace(service_item, ' ')          
                                  
    #print(line)
    for break_word in break_words:
        #print('-',len(break_word), break_word, '-')
        if not type(line) == str or len(line) <= 0: 
            break
        line = line.replace(break_word, ' ')
    if not type(line) == str or len(line) <= 0: 
        #print(type(line))
        continue
    words = word_tokenize(line) #line.lower().split(' ')
    for word_0 in words:        
        word_list = clean_word(word_0)
        for word in word_list:        
            if word in word_freqs:
                word_freqs[word] += 1
            else:
                word_freqs[word] = 1
            clean_line+=' '+word
    original_corpus.append(line)
    clean_corpus.append(clean_line.strip())
email_words=word_freqs.keys()            
print(len(email_words))

15


In [7]:
import re
keyword_list_new=[]
keyword_list = set(email_words)
for item in keyword_list:
    is_re_list = False
    for pattern1 in re_pattern_list:
        result=re.findall(pattern1, item)
        if  len(result) > 0:
            is_re_list = True
            break
    if is_re_list == False:
        keyword_list_new.append(item)
len(keyword_list_new)

15

## Load List of Microsoft software and Ubuntu Glossaries

In [8]:
import pickle
with open("keyword_list.pickle", 'rb') as f:
    keyword_list_TERM = pickle.load(f)


## compare corpus with n-gram keywords, n=2~4

In [9]:
def check_special_term(words, start_index, n_gram):
    merge_word=[]
    for i in range(n_gram):
        merge_word.append(words[start_index+i])
    if ' '.join(merge_word) in keyword_list_TERM:
        #print('converted_word=', keyword_list_TERM[' '.join(merge_word)]['converted_word'])
        return keyword_list_TERM[' '.join(merge_word)]['converted_word']
    else:
        return ''

def compare_corpus_with_keyword(clean_corpus):
    new_clean_corpus=[]
    merge_word_list=[] # n-gram keywords
    hit_row_index=[] # keep for predict test
    for no, line in enumerate(clean_corpus):
        words = line.split(' ')
        #words = word_tokenize(line) 
        words_len = len(words)
        # n-gram
        new_line=''
        for i in range(4, 1, -1):
            for j in range(words_len-i+1):
                merge_word = check_special_term(words, j, i)
                if merge_word == '':
                    new_line+=' '+words[j]
                    if j==words_len-i:
                        for k in range(j+1, words_len):
                            new_line+=' '+words[k]
                    continue
                else:
                    merge_word_list.append(merge_word)
                    hit_row_index.append(no)
                    new_line+=' '+merge_word
                    j+=i
                    continue

            # line is too short, keep it as original
            if words_len-i<0:
                new_line=line    
            new_line = new_line.strip()
            words = new_line.split(' ')
            words_len = len(words)
            if i>2:
                new_line=''
        new_clean_corpus.append(new_line)
        #if new_line != line:
        #    print('org_line='+line+'\n')
        #    print('new_line='+new_line+'\n\n')
    return new_clean_corpus, merge_word_list, hit_row_index

new_clean_corpus, merge_word_list, hit_row_index = compare_corpus_with_keyword(clean_corpus)
hit_row_index=list(set(hit_row_index))        

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("feature.pkl", "rb")))

BOW_vector = vectorizer.fit_transform(new_clean_corpus)
X = BOW_vector.toarray()

vocab = vectorizer.get_feature_names()
word2id = pickle.load(open("word2id.pkl", "rb"))

## Load the model 

In [11]:
import numpy as np
import guidedlda

with open('guidedlda_model.pickle', 'rb') as file_handle:
    model = pickle.load(file_handle)

## predict

In [12]:
display_keyword_count_per_topic=5

doc_topic = model.transform(X)
display_document_count=X.shape[0]
for i in range(display_document_count):
    print('原文:', df.loc[i, 'Subject'])
    doc=[]
    #vocab_len = len(vocab)
    #for j in range(vocab_len):
    #    if X[i, j] != 0:
    #        doc.append(vocab[j])
    #print(' '.join(doc))
    
    print("top topic: {}".format(doc_topic[i].argmax()))
    top_n_words = X[i,:].argsort()[:display_keyword_count_per_topic:-1]
    top_n_words_list = []
    for k in top_n_words:
        if X[i,k] > 0:
            top_n_words_list.append(k)
    print("Document: {}".format(', '.join(np.array(vocab)[top_n_words_list])))
    #print("Document: {}".format(', '.join(np.array(vocab)[list(reversed(X[i,:].argsort()))[:display_keyword_count_per_topic:-1]])))
    print(' ')

原文: [WFBS-SVC] Installation on Mac Is Not Full Featured
top topic: 4
Document: mac, full, feature, installation
 
原文: [WFBS - A] Smart Scan is not updating
top topic: 6
Document: smart, scan, update
 
原文: [WFBS - SVC] Installation issue
top topic: 4
Document: issue, installation
 
原文: [MALWARE][WFBS SVC] Wallet Ransomware
top topic: 9
Document: ransomware, wallet
 
原文: [MALWARE][WFBS S 9.5]Possible Ransomware detection on clients machine
top topic: 9
Document: machine, detection, wfb, possible, ransomware, client
 
