## 1. 定义分词方法及获取数据集

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import joblib
import os
import re
import jieba


# Directory that holds the corpus files, relative to the current working
# directory. (Taking dirname of './' twice yields an empty prefix, so the
# joined path is simply 'data'; swap in an absolute base here if the notebook
# is launched from another location.)
DATA_HOME = 'data'

def clean_str(string, sep=" "):
    """
    Keep only the characters in the range U+4E00-U+9FFF (CJK Unified Ideographs).

    Every maximal run of other characters that sits between two kept runs is
    replaced by exactly one ``sep``; discarded characters at the very start or
    end of the input leave nothing behind.

    :param string: input text; must be a ``str``
    :param sep: filler inserted between the surviving runs (one space by
                default). It is inserted literally, so it may contain
                backslashes or any other character.
    :return: the cleaned string (empty when no character survives)
    example:
    s = "祝你2018000国庆快乐！"
    print(clean_str(s))           # 祝你 国庆快乐
    print(clean_str(s, sep=""))   # 祝你国庆快乐
    print(clean_str(s, sep="|"))  # 祝你|国庆快乐
    """
    # Collecting the kept runs and joining them gives "at most one filler per
    # gap" for ANY sep, and never feeds sep through the regex replacement
    # template parser (where a backslash would raise or be reinterpreted).
    kept_runs = re.findall(r"[\u4e00-\u9fff]+", string)
    return sep.join(kept_runs)

def load_spam():
    """
    Read the raw ham/spam corpus from ``DATA_HOME`` and clean every line.

    ``ham_5000.utf8`` is read first and labelled 1, ``spam_5000.utf8`` second
    and labelled 0. Each line of a file is one sample.

    :return: (x, y) where x is a list of cleaned texts (ham followed by spam)
             and y is the list of matching integer labels
    """

    def _read_cleaned(path):
        # Only the newline is removed here; everything else is left to clean_str.
        with open(path, encoding='utf-8') as handle:
            return [clean_str(raw.strip('\n')) for raw in handle]

    ham = _read_cleaned(os.path.join(DATA_HOME, 'ham_5000.utf8'))
    spam = _read_cleaned(os.path.join(DATA_HOME, 'spam_5000.utf8'))
    labels = [1] * len(ham) + [0] * len(spam)
    return ham + spam, labels

def load_cut_spam():
    """
    Load the cleaned corpus and segment every text with jieba (precise mode).

    The tokens of each text are joined with single spaces so the result can be
    fed to a whitespace-tokenising vectorizer.

    :return: (x_cut, y) - x_cut is a list of space-joined token strings, e.g.
             ['搜索 文件   看 是否 不 小心 拖 到 某个 地方 了', ...];
             y is the label list returned by load_spam, unchanged
    """
    texts, labels = load_spam()
    segmented = [" ".join(jieba.cut(text, cut_all=False)) for text in texts]
    return segmented, labels

def get_dataset():
    """
    Build the segmented corpus and split it into train and test parts.

    30% of the samples are held out for testing; ``random_state=42`` keeps the
    split reproducible between runs.

    :return: tuple (X_train, X_test, y_train, y_test)
    """
    texts, labels = load_cut_spam()
    parts = train_test_split(texts, labels, test_size=0.3, random_state=42)
    train_texts, test_texts, train_labels, test_labels = parts
    return train_texts, test_texts, train_labels, test_labels

## 2. 文本向量化

In [20]:
def preprocessing(x,
                  train=False,
                  top_k_words=1000,
                  MODEL_NAME='count_vec.pkl'):
    """
    Turn segmented texts into a TF-IDF feature matrix.

    :param x: iterable of whitespace-separated token strings
    :param train: True  - fit a new vectorizer on x and persist it;
                  False - reuse the vectorizer previously saved as MODEL_NAME
    :param top_k_words: vocabulary size limit (passed as ``max_features``)
    :param MODEL_NAME: file name under which the vectorizer is saved / loaded
    :return: the result of ``vectorizer.transform(x)``
    """
    if not train:
        # Inference path: the vocabulary must be the one fitted at training time.
        vectorizer = load_model(MODEL_NAME=MODEL_NAME)
        return vectorizer.transform(x)

    # Weighted bag-of-words. A plain term-count model would be
    # CountVectorizer(max_features=top_k_words) instead.
    vectorizer = TfidfVectorizer(max_features=top_k_words)
    vectorizer.fit(x)
    save_model(vectorizer, MODEL_NAME=MODEL_NAME)
    return vectorizer.transform(x)

## 3. 保存和载入模型

In [21]:
def save_model(model, dir='MODEL', MODEL_NAME='model.pkl'):
    """
    Persist ``model`` with joblib to ``<dir>/<MODEL_NAME>``.

    :param model: any object joblib can serialise (vectorizer, classifier, ...)
    :param dir: target directory; created together with any missing parent
                directories. An empty string means the current directory.
    :param MODEL_NAME: file name of the dump inside ``dir``
    :return: None; a confirmation line is printed after the dump succeeds
    """
    if dir:
        # makedirs + exist_ok handles nested paths such as 'MODEL/v1' and does
        # not fail when the directory already exists (or appears between a
        # check and the creation), unlike an exists()/mkdir() pair.
        os.makedirs(dir, exist_ok=True)
    path = os.path.join(dir, MODEL_NAME)
    joblib.dump(model, path)
    print(f"模型: {path} 保存成功！")


def load_model(dir='MODEL', MODEL_NAME='model.pkl'):
    """
    Load a previously saved model from ``<dir>/<MODEL_NAME>``.

    :param dir: directory that contains the dump
    :param MODEL_NAME: file name of the dump
    :return: the object returned by ``joblib.load``
    :raises FileNotFoundError: when the file does not exist (train first)
    """
    model_path = os.path.join(dir, MODEL_NAME)
    if os.path.exists(model_path):
        # NOTE(review): joblib dumps are pickle-based; only load files you trust.
        restored = joblib.load(model_path)
        print(f"载入已有模型: {model_path}")
        return restored
    raise FileNotFoundError(f"{model_path} 模型不存在，请先训练模型！")

## 4. 模型训练和预测

In [22]:
def train(X_train, y_train):
    """
    Fit the vectorizer and a 3-nearest-neighbour classifier, then persist both.

    The vectorizer is fitted and saved inside ``preprocessing``; the classifier
    is saved as ``KNN.pkl``. Afterwards a classification report for the
    training data itself is printed.

    :param X_train: segmented training texts
    :param y_train: training labels
    :return: None
    """
    features = preprocessing(X_train, train=True)
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(features, y_train)
    save_model(knn, MODEL_NAME='KNN.pkl')

    # Score on the data the model was fitted on (not a generalisation estimate).
    fitted_labels = knn.predict(features)
    print("模型在训练集上的表现结果：")
    print(classification_report(y_train, fitted_labels))


def predict(X, MODEL_NAME='KNN.pkl'):
    """
    Classify segmented texts with the persisted vectorizer and classifier.

    :param X: segmented texts to classify
    :param MODEL_NAME: file name of the saved classifier
    :return: the predictions produced by the loaded model's ``predict``
    """
    # The saved vectorizer is loaded first, then the classifier.
    features = preprocessing(X, train=False)
    classifier = load_model(MODEL_NAME=MODEL_NAME)
    return classifier.predict(features)

## 5. 运行结果

In [44]:

if __name__ == '__main__':
    # End-to-end run: split the corpus, fit and persist the vectorizer and the
    # KNN model, then score the persisted model on the held-out texts.
    X_train, X_test, y_train, y_test = get_dataset()
    train(X_train, y_train)
    y_pred = predict(X_test)
    print("模型在测试集上的表现结果：",
          classification_report(y_test, y_pred),
          sep="\n")


模型: MODEL/count_vec.pkl 保存成功！
模型: MODEL/KNN.pkl 保存成功！
模型在训练集上的表现结果：
              precision    recall  f1-score   support

           0       0.66      1.00      0.80      3537
           1       1.00      0.48      0.65      3463

    accuracy                           0.74      7000
   macro avg       0.83      0.74      0.72      7000
weighted avg       0.83      0.74      0.72      7000

载入已有模型: MODEL/count_vec.pkl
载入已有模型: MODEL/KNN.pkl
模型在测试集上的表现结果：
              precision    recall  f1-score   support

           0       0.59      1.00      0.74      1464
           1       0.99      0.33      0.50      1537

    accuracy                           0.66      3001
   macro avg       0.79      0.66      0.62      3001
weighted avg       0.80      0.66      0.62      3001

