In [1]:
import entity_tagging
import utils
import argparse
from datetime import datetime
import pandas as pd
import pickle
from sklearn.model_selection import KFold
import numpy as np

In [2]:
def load_corpus_xlsx(filename):
    """Read the labelled corpus from an Excel workbook.

    The sheet must contain a ``class`` column and a ``msg`` column; ``msg`` is
    read with dtype ``str``.

    Args:
        filename: Path (or any object ``pd.read_excel`` accepts) of the workbook.

    Returns:
        A ``(labels, messages)`` pair of plain Python lists taken from the
        ``class`` and ``msg`` columns respectively, in row order.
    """
    sheet = pd.read_excel(filename, dtype={'msg': str})
    # Pull both columns out in one pass: labels first, message text second.
    labels, messages = (sheet[column].to_list() for column in ('class', 'msg'))

    return labels, messages

In [111]:
# Load the labelled corpus from the local workbook.
# The path is a raw string so the Windows backslashes are taken literally; the
# non-raw form relied on invalid escape sequences (\K, \S, \D, \m), which
# Python warns about and plans to reject.
list_label, list_content = load_corpus_xlsx(r'C:\KhoiNXM\Spam message Vietnamese\Data\message_modified_v1.xlsx')

In [39]:
# Run every message through the project's entity_tagging helper and rebind
# list_content to whatever it returns.
# NOTE(review): the same call appears again in a later cell — confirm that
# tagging already-tagged text is harmless before relying on Run All.
list_content = entity_tagging.entity_tagging(list_content)

In [5]:
# Short classifier identifiers to evaluate in turn. train_evaluation hands each
# one to utils.kfold_classification and upper-cases it for the report header.
classifier_list = ['nb', 'svm', 'dt', 'knn', 'maxent', 'baseline']

# Begin test

In [112]:
# Hand-written probe message for the saved classifier. Roughly: "The bank would
# like to inform you about an appointment at 8 o'clock at address ABC. Your
# support staff member is D, contact phone number".
test_content = 'Ngân hàng xin thông báo cho bạn về cuộc hẹn lúc 8h tại địa chỉ ABC. Nhân viên hỗ trợ của bạn là D , số điện thoại liên hệ'

In [113]:
# Prepend the probe message so it sits at index 0 of the corpus.
# NOTE(review): this mutates list_content in place, so re-running the cell
# inserts the message again. list_label is not extended here, so list_content
# ends up one element longer than list_label if both came straight from
# load_corpus_xlsx.
list_content.insert(0, test_content)

In [115]:
# Tag the corpus (now including the probe message at index 0) with the
# project's entity_tagging helper and rebind list_content to the result.
# NOTE(review): an earlier cell makes the same call — confirm the helper is
# safe to apply twice to the same text.
list_content = entity_tagging.entity_tagging(list_content)

In [118]:
# Vectorise the corpus with the 'bow' method. utils.vectorize returns three
# values; only list_content_vec is used by the cells visible here.
list_content_vec, list_len_sms, dictionary = utils.vectorize(list_content, 'bow')

In [119]:
# Display the dimensions of the vectorised corpus.
list_content_vec.shape

(8051, 2177)

In [104]:
# Reads in the saved classification model.
# The with-block closes the file handle as soon as the model has been loaded,
# instead of leaving it open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load model files that were produced locally and are trusted.
with open(r'C:\KhoiNXM\Spam message Vietnamese\Dev\vietnamese-spam-sms-filtering\models\bow_nb_clf.pkl', 'rb') as model_file:
    load_clf = pickle.load(model_file)

In [106]:
# Show the first row of the matrix; slicing with [:1] keeps it as a 1-row 2-D array.
list_content_vec[:1]

array([[ 1.,  1.,  2., ...,  0.,  0., 29.]])

In [72]:
# Show row 6 of the matrix as a single 1-D vector.
list_content_vec[6]

array([0., 0., 0., ..., 0., 0., 7.])

In [83]:
# Display ten messages (indices 635-644) from the corpus for a quick look.
list_content[635:645]

['chả thấy gì cả',
 'bạn kiểm tra giúp mình với ạ',
 'chào bạn mình muốn cắt mạng internet',
 'vậy mình đã chết và khoái cảm giác bị chửi',
 'shop kiểm tra giúp em với',
 'mình muốn login vào number number number number thì mật khẩu và id là gì vậy',
 'em đăng kí gói super65 fpt cả number nơi mà mạng download đc number 600kb s',
 'mạng mỗi tối đều mất ổn định nhé',
 'chào anh nam anh vui lòng cung cấp thông tin hợp đổng fpt kiểm tra ạ',
 'dạ bên em đã update lại tín hiệu nhờ anh kiểm tra lại ạ']

In [105]:
# Ask the loaded classifier to label the first row of the matrix; the [:1]
# slice passes it as a 1-row 2-D input rather than a bare vector.
# NOTE(review): this is only the probe message if the insert/vectorise cells
# were run beforehand — confirm the execution order.
load_clf.predict(list_content_vec[:1])

array([0], dtype=int64)

# End test

In [6]:
def train_evaluation(list_label, list_content, vectorize_method, classifier_list):
    """Vectorise the corpus once, then evaluate each classifier and print a report.

    For every name in ``classifier_list`` this calls
    ``utils.kfold_classification`` (passing ``5``, presumably the number of
    folds — confirm in ``utils``), prints an ``Evaluation for <NAME>`` header,
    forwards the four returned result lists to ``utils.evaluation`` and prints
    a separator line.

    Args:
        list_label: Labels aligned with ``list_content``.
        list_content: Message texts.
        vectorize_method: ``'bow'`` or ``'tfidf'``.
        classifier_list: Iterable of classifier identifiers (strings).

    Raises:
        ValueError: If ``vectorize_method`` is not one of the supported values.
            Previously an unknown method skipped vectorisation and failed later
            with an unbound-variable error inside the loop.
    """
    supported_methods = ('bow', 'tfidf')
    if vectorize_method not in supported_methods:
        raise ValueError(
            'Unsupported vectorize_method %r; expected one of %s'
            % (vectorize_method, ', '.join(supported_methods))
        )

    # One call covers both methods; the branches used to differ only in the
    # literal they passed. list_label is deliberately rebound to the labels
    # handed back by doc_2_vec, which are the ones used for classification.
    list_content_vec, list_label, _list_len_sms, _dictionary = utils.doc_2_vec(
        list_content, list_label, vectorize_method)

    for classifier_name in classifier_list:
        false_positive, false_negative, true_positive, true_negative = \
            utils.kfold_classification(list_content, list_content_vec, list_label,
                                       classifier_name, 5, vectorize_method)
        print('Evaluation for ' + classifier_name.upper())
        utils.evaluation(false_positive, false_negative, true_positive, true_negative)
        print('-------------------------------------------------')

In [16]:
def train_entire_dataset(list_content, list_label, vectorize_method,
                         model_path=r'C:\KhoiNXM\Spam message Vietnamese\Dev\vietnamese-spam-sms-filtering\models\bow_nb_clf.pkl'):
    """Train the NB classifier on the whole corpus and pickle it to disk.

    The corpus is vectorised with ``utils.doc_2_vec``, the classifier is built
    with ``utils.build_classifier_nb`` and the result is written to
    ``model_path``.

    Args:
        list_content: Message texts.
        list_label: Labels aligned with ``list_content``.
        vectorize_method: Vectorisation method forwarded to ``utils.doc_2_vec``.
        model_path: Destination of the pickled classifier. The default keeps
            the original hard-coded location; note that its file name says
            ``bow`` regardless of ``vectorize_method``, so pass an explicit
            path when training with another method to avoid overwriting the
            bag-of-words model.
    """
    list_content_vec, list_label, _list_len_sms, _dictionary = utils.doc_2_vec(
        list_content, list_label, vectorize_method)
    clf = utils.build_classifier_nb(list_content_vec, list_label)
    # The with-block guarantees the file is flushed and closed before the
    # function reports success; the bare open() call left that to the GC.
    with open(model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    print('Traning Done!')

In [17]:
# Fit the classifier on the full corpus with bag-of-words features and save it.
train_entire_dataset(
    list_content=list_content,
    list_label=list_label,
    vectorize_method='bow',
)

Traning Done!


In [7]:
# Evaluate every classifier in classifier_list on bag-of-words features.
train_evaluation(
    list_label=list_label,
    list_content=list_content,
    vectorize_method='bow',
    classifier_list=classifier_list,
)

Evaluation for NB
False Positive Rate: 1.073881842611954%
False Negative Rate: 15.138779735674575%
True Positive Rate: 84.86122026432541%
True Negative Rate: 98.92611815738803%
-------------------------------------------------
Evaluation for SVM
False Positive Rate: 0.9740059088338784%
False Negative Rate: 17.96114426250163%
True Positive Rate: 82.03885573749837%
True Negative Rate: 99.02599409116613%
-------------------------------------------------
Evaluation for DT
False Positive Rate: 3.548351709638655%
False Negative Rate: 26.101911834142%
True Positive Rate: 73.898088165858%
True Negative Rate: 96.45164829036135%
-------------------------------------------------
Evaluation for KNN
False Positive Rate: 1.459818615738095%
False Negative Rate: 43.425532714465795%
True Positive Rate: 56.574467285534205%
True Negative Rate: 98.5401813842619%
-------------------------------------------------
Evaluation for MAXENT
False Positive Rate: 0.8886138017429629%
False Negative Rate: 19.42908346

In [8]:
# Evaluate every classifier in classifier_list on TF-IDF features.
train_evaluation(
    list_label=list_label,
    list_content=list_content,
    vectorize_method='tfidf',
    classifier_list=classifier_list,
)

Evaluation for NB
False Positive Rate: 0.014114326040931546%
False Negative Rate: 86.8779451876219%
True Positive Rate: 13.1220548123781%
True Negative Rate: 99.98588567395906%
-------------------------------------------------
Evaluation for SVM
False Positive Rate: 0.5876200513943358%
False Negative Rate: 30.259694208829437%
True Positive Rate: 69.74030579117057%
True Negative Rate: 99.41237994860566%
-------------------------------------------------
Evaluation for DT
False Positive Rate: 3.4955387354472345%
False Negative Rate: 25.439517112417207%
True Positive Rate: 74.56048288758281%
True Negative Rate: 96.50446126455277%
-------------------------------------------------
Evaluation for KNN
False Positive Rate: 1.0445096762374104%
False Negative Rate: 56.012192569971894%
True Positive Rate: 43.987807430028106%
True Negative Rate: 98.95549032376259%
-------------------------------------------------
Evaluation for MAXENT
False Positive Rate: 0.5586852484671465%
False Negative Rate: 33