### 导入相关库

In [1]:
import pandas as pd
import numpy as np  
from sklearn.model_selection import train_test_split  
import re  
import jieba  
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier  
from sklearn.linear_model import LogisticRegression  
from sklearn import metrics 
import joblib

### 读取数据

In [88]:
def get_data():  

    df = pd.read_csv('./Data/train.news.csv')
    df_sample = df.sample(n=10000, random_state=42) #随机抽取10000个样本
    corpus = df_sample['Title']
    labels = df_sample['label']
    return corpus, labels  


corpus, labels = get_data()     # 获取数据集
print(corpus[0])                # 示例输出
print(labels[0])                
print("总的数据量:", len(labels)) 

中国反腐风刮到阿根廷，这个美到让人瘫痪的女总统，因为8个本子摊上大事了
0
总的数据量: 10000


### 数据预处理

In [30]:
def remove_empty_docs(corpus, labels):  
    '''
    功能：重新整合邮件及标签，去除空白邮件
    zip(corpus, labels),控制同时遍历corpus和labels
    doc.strip(),去除文本首尾空格和换行符，如果有非空字符，返回真
    '''
    filtered_corpus = []  
    filtered_labels = []  
    for doc, label in zip(corpus, labels):  
        if doc.strip():  
            filtered_corpus.append(doc)  
            filtered_labels.append(label)  
  
    return filtered_corpus, filtered_labels 

corpus, labels = remove_empty_docs(corpus, labels) 
label_name_map = ["垃圾邮件", "正常邮件"]  
print(corpus[0])      # 输出处理后的第一封邮件(换行符被处理在字符串中)
print(labels[0])      
print("总的数据量:", len(labels))  

冯小刚还不知道自己是怎么死的！
1
总的数据量: 5000


### 划分数据集

In [31]:
def prepare_datasets(corpus, labels, test_data_proportion=0.3):  
    '''
    功能：划分数据集
    test_size=0.3，测试集30%，训练集70%
    random_state=42，设置随机种子，确保每次切割结果一致
    '''  
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels, test_size = test_data_proportion, random_state=42)  
    return train_X, test_X, train_Y, test_Y

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,  labels,  test_data_proportion=0.3)  
print('训练集样本数量：%d，测试样本数量：%d'%(len(train_corpus),len(test_corpus)))
print(train_corpus[101],train_labels[101])
print(test_corpus[3],test_labels[3]) 

训练集样本数量：3500，测试样本数量：1500
公安已提醒！微信上所有的朋友请看下..... 0
作为刑警，也来说说滴滴 0


### 样本标准化

In [32]:
def normalize_corpus(corpus):  
    '''
    功能：导入停用词于stopwords列表,遍历corpus（即每一行）并分词，将每一行中不是停用词的词连成字符串（词与词用空格隔开），再存入normalized_corpus列表
    sw.replace('\n', '')，将sw通过readlines到的每一行内容中的换行符移除
    jieba.lcut(text.replace('\n','')),对字符串去除空格和换行符后，做分词处理放入列表中
    text = ' '.join(filtered_tokens)，将filterd_tokens列表元素拼接为字符串(用空格隔开元素)
    '''
    normalized_corpus = []  
    stopwords = [sw.replace('\n', '') for sw in open('./Data/stopwords.txt',encoding='utf-8').readlines()]   
        
    for text in corpus:  
        filtered_tokens = []   
        tokens = jieba.lcut(text.replace('\n',''))  
  
        for token in tokens:  
            token = token.strip()  
            if token not in stopwords and len(token)>1:  
                filtered_tokens.append(token)  
          
        text = ' '.join(filtered_tokens)  
        normalized_corpus.append(text)  
    return normalized_corpus

norm_train_corpus = normalize_corpus(train_corpus)  
norm_test_corpus = normalize_corpus(test_corpus) 
print(norm_train_corpus[0])
print(norm_test_corpus[0])

罗志 祥和 周扬青 同居 试婚 周扬青 整容
人贩子 没有 孩子 新型 拐卖 防不胜防


### 特征提取

In [33]:
def bow_extractor(normalized_corpus, ngram_range=(1, 1)):  
    '''
    功能：将normalized_corpus（分词处理后的）转化为BOW词袋模型
    min_df=1，表示包含至少在一个文档中（即每一行）出现的词汇才会被纳入词典
    ngram_range=(1, 1)，定义n-gram的范围，只提取单个词汇，不考虑组合词汇
    fit_transform(normalized_corpus)，文本数据转化为词频矩阵，每一行代表一个文档，每一列代表一个词汇项，矩阵中的元素表示相应词汇在该文档(这一行中)的出现次数
    features.shape = (len(normalized_corpus),dic_size)，dic_size为词典大小
    ''' 
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)  
    features = vectorizer.fit_transform(normalized_corpus)  
    return vectorizer, features
    
def tfidf_extractor(normalized_corpus, ngram_range=(1, 1)):  
    '''
    功能：将normalized_corpus（分词处理后的）转化为TF-IDF表示形式
    norm = 'l2',在计算TF-IDF向量后，会对结果向量做L2范数归一化
    fit_transform(normalized_corpus),将文本数据转换为TF-IDF矩阵，每一行代表一个文档，每一列代表一个词汇项,矩阵中的元素表示相应词汇在文档中的TF-IDF值
    '''
    vectorizer = TfidfVectorizer(min_df=1,  
                                 norm='l2',  
                                 smooth_idf=True,       # 在IDF权重计算时引入平滑项，防止出现零概率问题
                                 use_idf=True,          # 启用IDF调整，即TF-IDF计算公式中的逆文档频率部分
                                 ngram_range=ngram_range)  
    features = vectorizer.fit_transform(normalized_corpus)  
    return vectorizer, features

# BOW词袋模型
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)  # 将训练分词集传入函数中，获取训练集vector和features
bow_test_features = bow_vectorizer.transform(norm_test_corpus)         # 将测试分词集传入训练集vector中，获得测试集features
print(bow_train_features.shape)                                        # (7000,2473)表示训练集7000封邮件，词典size为27473
print(bow_test_features.shape)
# print(bow_train_features)
# print(bow_test_features)

# TF-IDF模型(同理)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)
print(tfidf_train_features.shape)
print(tfidf_test_features.shape)
# print(tfidf_train_features)
# print(tfidf_test_features)

(3500, 8690)
(1500, 8690)
(3500, 8690)
(1500, 8690)


### 训练和测试

In [80]:
def get_metrics(true_labels, predicted_labels): 
    '''
    功能：计算预测的各类指标
    ''' 
    acc = metrics.accuracy_score(true_labels,predicted_labels)  
    precision = metrics.precision_score(true_labels, predicted_labels, average = 'weighted')  
    recall = metrics.recall_score(true_labels, predicted_labels,average='weighted')  
    f1_score = metrics.f1_score(true_labels,predicted_labels,average='weighted')  
    print('准确率:%.4f' % acc)  
    print('精度:%.4f' % precision)  
    print('召回率:%.4f' % recall)  
    print('F1得分:%.4f' % f1_score)  
  
def train_predict(classifier, train_features, train_labels, test_features, test_labels):  
    '''
    功能：根据传入的训练分类器，进行模型训练
    '''
    classifier.fit(train_features, train_labels)  # 喂给模型训练矩阵和训练标签
    predictions = classifier.predict(test_features)  # 模型预测
    get_metrics(true_labels=test_labels, predicted_labels=predictions)  # 调用get_metrics函数,打印预测结果的各类指标
    return predictions

# 分类器  
svm = SGDClassifier(loss='hinge')  
lr = LogisticRegression()  
# 基于词袋模型特征的逻辑回归  
print("基于词袋模型特征的逻辑回归")  
lr_bow_predictions = train_predict(classifier=lr,  
                            train_features=bow_train_features,  
                            train_labels=train_labels,  
                            test_features=bow_test_features,  
                            test_labels=test_labels)  

# 基于词袋模型的支持向量机方法  
print("基于词袋模型的支持向量机")  
svm_bow_predictions = train_predict(classifier=svm,  
                            train_features=bow_train_features,  
                            train_labels=train_labels,  
                            test_features=bow_test_features,  
                            test_labels=test_labels)  

# 基于tfidf的逻辑回归模型  
print("基于tfidf的逻辑回归模型")  
lr_tfidf_predictions = train_predict(classifier=lr,  
                            train_features=tfidf_train_features,  
                            train_labels=train_labels,  
                            test_features=tfidf_test_features,  
                            test_labels=test_labels)  

# 基于tfidf的支持向量机模型  
print("基于tfidf的支持向量机模型")  
svm_tfidf_predictions = train_predict(classifier=svm,  
                            train_features=tfidf_train_features,  
                            train_labels=train_labels,  
                            test_features=tfidf_test_features,  
                            test_labels=test_labels)

基于词袋模型特征的逻辑回归
准确率:0.9333
精度:0.9340
召回率:0.9333
F1得分:0.9313
基于词袋模型的支持向量机
准确率:0.9327
精度:0.9319
召回率:0.9327
F1得分:0.9317
基于tfidf的逻辑回归模型
准确率:0.9020
精度:0.9082
召回率:0.9020
F1得分:0.8952
基于tfidf的支持向量机模型
准确率:0.9420
精度:0.9414
召回率:0.9420
F1得分:0.9415


### 输入预测

In [79]:
# 以基于词袋模型的SVM为例
# 创建SVM分类器
svm_classifier = SGDClassifier(loss='hinge')  
# 模型训练
svm_classifier.fit(bow_train_features,train_labels)
# 模型保存
joblib.dump(svm_classifier, 'svm_model.pkl')
# 模型加载
loaded_svm_model = joblib.load('svm_model.pkl')

# 输入预测内容
new_data = ['老兵日记,2018公安改革:明年3月全国义务兵转正!成为人民警察编']
# 标准化
train_data = normalize_corpus(new_data)
# 特征提取
test_features = bow_vectorizer.transform(train_data)
# 模型预测
svm_prediction = loaded_svm_model.predict(test_features)
print("SVM预测结果：", svm_prediction)


SVM预测结果： [0]
