In [None]:
# Mount Google Drive when running on Colab. The `google.colab` package only
# exists inside a Colab runtime, so the import is guarded: on a local machine
# the mount is skipped instead of aborting the notebook with ImportError,
# which lets the non-Colab data path configured further down be used.
try:
    from google.colab import drive
except ImportError:
    pass
else:
    drive.mount('/content/drive')

import os
import platform
import datetime,pytz

# Base directory for the project: the mounted Drive folder when the platform
# reports Linux, otherwise a hard-coded local folder.
# NOTE(review): 'Linux' is used as a stand-in for "running on Colab" and the
# fallback is one developer's absolute path - adjust for other machines.
root_ = '/content/drive/My Drive/colab/' if platform.system() == 'Linux' else '/Users/love/Test/'

# Working directory that holds the corpus files used by this notebook.
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`. It raises
# FileExistsError if the path exists but is not a directory.
WeiboSentiment_ = os.path.join(root_, 'WeiboSentiment')
os.makedirs(WeiboSentiment_, exist_ok=True)

# Sub-directory reserved for model files.
model_ = os.path.join(WeiboSentiment_, 'model')
os.makedirs(model_, exist_ok=True)


import jieba
import re
import numpy as np

def tokenize(text):
    """
    Clean a raw Weibo post and segment it into tokens with jieba.

    Cleaning steps, applied in order:
      1. ``{%...%}`` spans (geo-location, Weibo topics, ...) become a space.
      2. ``@...`` mentions, up to the next space or end of text, become a space.
      3. ``【...】`` spans (usually not written by the user) become a space.
      4. ``[...]`` emoticon tags are collected and temporarily replaced by the
         placeholder ``IconMark`` so that jieba does not split them apart.

    After segmentation every placeholder is swapped back for the original
    emoticon tag, in order of appearance. Of the remaining tokens only the
    non-empty, purely alphabetic ones (``str.isalpha``) are kept.

    Args:
        text (str): Raw post content.

    Returns:
        list[str]: Kept word tokens and emoticon tags, in text order.
    """
    # Raw strings: "\{" and "\[" are invalid escapes in normal string literals.
    text = re.sub(r"\{%.+?%\}", " ", text)
    text = re.sub(r"@.+?( |$)", " ", text)
    text = re.sub(r"【.+?】", " ", text)
    icons = re.findall(r"\[.+?\]", text)
    text = re.sub(r"\[.+?\]", "IconMark", text)

    # Consume icons through an iterator instead of list.pop(0).
    remaining_icons = iter(icons)
    tokens = []
    for w in jieba.lcut(text):
        w = w.strip()
        if "IconMark" in w:
            # One segment may contain several adjacent placeholders.
            for _ in range(w.count("IconMark")):
                icon = next(remaining_icons, None)
                # The post itself may contain the literal text "IconMark";
                # once the real icons run out there is nothing to restore.
                if icon is not None:
                    tokens.append(icon)
        elif w and w != '\u200b' and w.isalpha():
            tokens.append(w)
    return tokens


def load_curpus(path):
    """
    Load a labelled corpus file and tokenize every record.

    Each non-blank line must have the form ``<field>,<sentiment>,<content>``.
    Only the first two commas are treated as separators, so ``content`` may
    itself contain commas. The first field is ignored.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        list[tuple[list[str], int]]: One ``(tokens, sentiment)`` pair per
        record, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than three fields, or if
            its sentiment field is not an integer.
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for lineno, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            if not line.strip():
                continue
            fields = line.split(",", 2)
            if len(fields) != 3:
                raise ValueError(
                    f"{path}:{lineno}: expected 3 comma-separated fields, "
                    f"got {len(fields)}"
                )
            _, sentiment, content = fields
            data.append((tokenize(content), int(sentiment)))
    return data


Mounted at /content/drive


#### 加载数据

In [None]:
import pandas as pd
# Parse and tokenize both corpus splits stored in the working directory.
train_data, test_data = (
    load_curpus(os.path.join(WeiboSentiment_, corpus_file))
    for corpus_file in ('train.txt', 'test.txt')
)

# Tabular view of the same (tokens, label) records.
train_df, test_df = (
    pd.DataFrame(records, columns=["content", "sentiment"])
    for records in (train_data, test_data)
)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.983 seconds.
Prefix dict has been built successfully.


In [None]:
# Stop-word list: one entry per line of stopwords.txt, with surrounding
# whitespace (including the trailing newline) stripped.
stopwords = []
with open(os.path.join(WeiboSentiment_, 'stopwords.txt'), "r", encoding="utf8") as f:
    stopwords.extend(line.strip() for line in f)

#### One-hot
ps: 与其他方法相比，朴素贝叶斯并没有对高质量词向量的需求，因此不再加载Fasttext词向量，而是直接用one-hot形式

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Re-join every sample's tokens with spaces so CountVectorizer can split them
# again. The vocabulary is fitted on the training and test texts together.
data_str = [" ".join(content) for content, _ in train_data] + \
            [" ".join(content) for content, _ in test_data]

# token_pattern accepts an optional leading "[" and trailing "]" around a run
# of word characters, so emoticon tags such as "[xx]" are kept as one token.
# Written as a raw string because "\[" and "\w" are invalid escapes in a
# normal string literal.
vectorizer = CountVectorizer(token_pattern=r'\[?\w+\]?', stop_words=stopwords)
vectorizer.fit_transform(data_str)

  'stop_words.' % sorted(inconsistent))


<119988x117762 sparse matrix of type '<class 'numpy.int64'>'
	with 1138782 stored elements in Compressed Sparse Row format>

In [None]:
# Build the training matrix: each sample's tokens are joined back into one
# space-separated string and encoded with the fitted vectorizer; labels are
# kept in the same order.
X_data = [" ".join(content) for content, _ in train_data]
y_data = [sentiment for _, sentiment in train_data]
X_train = vectorizer.transform(X_data)
y_train = y_data

In [None]:
# Build the test matrix: each sample's tokens are joined back into one
# space-separated string and encoded with the fitted vectorizer; labels are
# kept in the same order.
X_data = [" ".join(content) for content, _ in test_data]
y_data = [sentiment for _, sentiment in test_data]
X_test = vectorizer.transform(X_data)
y_test = y_data

#### Bayes
全部用默认参数

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with the library's default hyper-parameters,
# fitted on the training features and labels. The fit call is left as the
# cell's last expression so the fitted estimator is displayed.
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predicted label for every test sample, in the same order as X_test.
result = clf.predict(X_test)

#### 模型评估
速度快，效果还很好，可能是因为该任务语料规模较小，在大规模语料任务上性能会下降

In [None]:
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
# Per-class precision / recall / F1 table for the test predictions.
print(metrics.classification_report(y_test, result))

# Headline scores, printed one per line as "<label> <value>".
for label, scorer in (
    ('acc:', accuracy_score),
    ('pc', precision_score),
    ('rc:', recall_score),
    ('f1:', f1_score),
):
    print(label, scorer(y_test, result))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     11884
           1       0.87      0.89      0.88     12113

    accuracy                           0.88     23997
   macro avg       0.88      0.88      0.88     23997
weighted avg       0.88      0.88      0.88     23997

acc: 0.8786514981039296
pc 0.8718176675018184
rc: 0.8905308346404689
f1: 0.8810748999428245
