In [1]:
# Base directory that is prefixed to the CSV file names passed to pd.read_csv() in this notebook.
path = r'./'

In [2]:
import re
import time
import pickle
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes
from sklearn import metrics

In [3]:
# Stopword vocabulary handed to the Bag-of-Words / TF-IDF vectorizers.
all_stopwords = set()

# We revised the Baidu stopword list and the Sichuan University (SCU) stopword list
# so that emotion words are not filtered out.

# Each file holds one stopword per line; only the trailing newline is stripped.
# The `with` blocks close the file handles once the lists are read
# (the original left both files open).
with open("baidu_stopwords_revised.txt", encoding = "UTF-8") as baidu_swFile:
    all_stopwords.update(line.strip("\n") for line in baidu_swFile)

with open("scu_stopwords_revised.txt", encoding = "UTF-8") as scu_swFile:
    all_stopwords.update(line.strip("\n") for line in scu_swFile)

In [4]:
def dumpModel(model, filename):
    ''' Introduction
            Serialize a model object to a file with pickle.
        Input Value
            model:    the model you are going to dump
            filename: the filename of the file to store the model, you are strongly suggested to use
                      ".model" as the suffix
    '''
    # The context manager flushes and closes the file even if pickling raises;
    # the original never closed the handle, so writing the data depended on garbage collection.
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    
def loadModel(filename):
    ''' Introduction
            Load a model object that was stored with pickle (e.g. by dumpModel).
        Input Value
            filename: the filename of the file storing the model, you are strongly suggested to use
                      ".model" as the suffix
        Return Value
            the unpickled object
    '''
    # SECURITY: pickle.load can execute arbitrary code while unpickling,
    # so only load model files that come from a trusted source.
    # The context manager closes the handle, which the original left open.
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [5]:
class sentiment_Bayes():
    ''' Introduction
            Naive Bayes (MultinomialNB) sentiment classifier on Bag-of-Words or TF-IDF features.
            training() repeatedly splits the data at random, trains one model per split and keeps
            the model with the highest test f1-score; classify() predicts with the kept model.
            The data is expected to have a "text" column and a "label" column whose values are 1 or 0.
    '''

    # Token pattern shared by both vectorizers: a run of word characters, optionally wrapped
    # in square brackets. Written as a raw string so that the backslashes are not treated as
    # (invalid) string escapes; the regex itself is the same as before.
    TOKEN_PATTERN = r'\[?\w+\]?'

    def __init__(self, data, stopwords):
        ''' Input Value
                data:      the dataset you are going to use to train the model, it will be split to 
                           two dataset, one is for training (dfTrain) and one is for testing (dfTest)
                stopwords: the stopwords for Bag-of-Word / TF-IDF
        '''
        self.data = data
        self.stopwords = stopwords
        
        self.vecr = None # the vectorizer for feature coding
        self.featureMethod = "undefined" # which feature coding method the stored model uses
        
        self.bayes_model = None # the best trained naive Bayes model
        self.f1_score = 0  # the f1-score of the best model
        self.classify_report = None # the detailed classification report from sklearn
        
        self.bestCnt = 0 # the counter's value when we obtained the best model
        self.trainCnt = 0 # the counter for training
    
    def classify(self, dfClassify):
        ''' Input Value     
                dfClassify: the dataframe includes the data we are going to classify,
                            its "text" column is used as the input
            Return Value
                resultDF:   a copy of dfClassify with the classification result
                            on column "predict"; dfClassify itself is left unchanged
        '''
        
        X = self.vecr.transform(dfClassify["text"])
        
        # Only the prediction itself is timed, not the feature coding above.
        start = time.perf_counter()
        y = self.bayes_model.predict(X)
        end = time.perf_counter()
        timeCost = end - start
        
        print("\nPredicting Finish Transcript")
        print("-----------------------------------------")
        print("Time Cost (Seconds): ",round(timeCost,8))
        print("-----------------------------------------")
        
        # Work on a copy so the caller's dataframe does not silently gain a column.
        resultDF = dfClassify.copy()
        resultDF["predict"] = y
        return resultDF
    
    def training(self, iteration = 20, percent = 0.2, featureMethod = "tfidf", inplace = True):
        ''' Input Value     
                iteration:     the iteration round of training, default will be 20
                percent:       the percentage of the testing set among the whole dataset, default will be 0.2
                featureMethod: the method used in feature coding, can use "tfidf" or "bow",
                               default will be "tfidf"
                inplace:       whether to update the model if we have trained a better model, 
                               default will be True
            Return Value
                None when inplace is true; otherwise the tuple
                (best model, its vectorizer, its f1-score, its classification report)
        '''
        
        # Start from the best model kept by earlier training() calls: a new model only
        # wins when its f1-score is strictly higher than the stored one.
        bestModel = self.bayes_model
        bestVecr = self.vecr
        bestMethod = self.featureMethod
        bestF1_score = self.f1_score
        bestReport = self.classify_report
        bestModelCnt = self.bestCnt
        
        print("Training Record")
        print("-------------------------------------------------------------------------")
        print("Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)")
        
        for i in range(iteration):
            dfTrain, dfTest = self.random_dataSplit(self.data, percent)
            
            start = time.perf_counter()
            vecr, bayes_model = self.singleTrain(dfTrain, featureMethod)
            f1, report = self.__test(bayes_model, dfTest, vecr, featureMethod)
            end = time.perf_counter()
            
            timeCost = end - start
            self.trainCnt += 1
            
            print("\t",self.trainCnt,"\t\t",i+1,"\t      ",round(f1,8),"\t",round(timeCost,8))
            
            if f1 > bestF1_score:
                bestModel = bayes_model
                bestVecr = vecr
                bestMethod = featureMethod
                bestF1_score = f1
                bestReport = report
                bestModelCnt = self.trainCnt
        
        print("\nTraining Finish Transcript")
        print("-------------------------------------------------------------------------------")
        print("Best Model F1_score: ",bestF1_score,"\t\t","From Training No. : ",bestModelCnt)
        print("-------------------------------------------------------------------------------")
        
        if inplace:
            self.bayes_model = bestModel
            self.vecr = bestVecr
            # Record the method of the model that was actually kept; it differs from the
            # requested featureMethod when no run of this call beat the stored model.
            self.featureMethod = bestMethod
            self.f1_score = bestF1_score
            self.classify_report = bestReport
            self.bestCnt = bestModelCnt
            return None
        
        # Return the best model (not the model of the last iteration), which also keeps
        # this well defined when iteration is 0.
        return bestModel, bestVecr, bestF1_score, bestReport
        
    def singleTrain(self, dataTrain, featureMethod = "tfidf"):
        ''' Input Value     
                dataTrain:     the dataset for training
                featureMethod: the method used in feature coding, can use "tfidf" or "bow",
                               default will be "tfidf" (any value other than "bow" uses TF-IDF)
            Return Value
                vecr:          the trained vectorizer
                bayes_model:   the trained naive Bayes model
        '''
        if featureMethod == "bow":
            X_train, y_train, vecr = self.BOWcoding(dataTrain, 0)
        else:
            X_train, y_train, vecr = self.TFIDFcoding(dataTrain, 0)
        
        bayes_model = naive_bayes.MultinomialNB()
        bayes_model.fit(X_train, y_train)
        
        return vecr, bayes_model

    def random_dataSplit(self, dataTrain, percent = 0.2):
        ''' Input Value
                dataTrain: the dataset you are going to use to train the model, it will be split to 
                           two dataset, one is for training (dfTrain) and one is for testing (dfTest)
                percent:   the percentage of the testing set among the whole dataset, default will be 0.2
            Return Value
                dfTrain:   splited dataset for training
                dfTest:    splited dataset for testing
        '''
        
        # Split the data into two parts according to the category, 
        # And split the two parts into training set and testing set according to "percent"
        # Then combine the training set together and the testing set together 
        # to ensure the training set and testing set has a balanced proportion of categories
        # Rows whose label is neither 1 nor 0 end up in neither set.
        
        dataPos = dataTrain.loc[dataTrain["label"] == 1]
        dataNeg = dataTrain.loc[dataTrain["label"] == 0]
        shuffleIdxPos = np.random.permutation(dataPos.shape[0])
        shuffleIdxNeg = np.random.permutation(dataNeg.shape[0])
        
        # int() truncates, so each category contributes floor(size * percent) test rows.
        dataPosTestSize = int(dataPos.shape[0]*percent)
        dataNegTestSize = int(dataNeg.shape[0]*percent)
        
        dfTrainPos = dataPos.iloc[shuffleIdxPos[dataPosTestSize:]]
        dfTestPos = dataPos.iloc[shuffleIdxPos[:dataPosTestSize]]
        
        dfTrainNeg = dataNeg.iloc[shuffleIdxNeg[dataNegTestSize:]]
        dfTestNeg = dataNeg.iloc[shuffleIdxNeg[:dataNegTestSize]]
        
        dfTrain = pd.concat([dfTrainPos, dfTrainNeg]).reset_index(drop = True)
        dfTest  = pd.concat([dfTestPos, dfTestNeg]).reset_index(drop = True)
        
        return dfTrain, dfTest
    
    def _stop_word_list(self):
        ''' Return self.stopwords in the form scikit-learn documents for stop_words
            (a string such as "english", a list, or None); any other collection
            (e.g. a set) is converted to a list.
        '''
        stopwords = self.stopwords
        if stopwords is None or isinstance(stopwords, str):
            return stopwords
        return list(stopwords)
    
    def _featureCoding(self, vectorizerClass, data, dataType, vecr = None):
        ''' Shared implementation of TFIDFcoding() and BOWcoding().
            Input Value
                vectorizerClass: TfidfVectorizer or CountVectorizer
                data:            the data to encode ("text" and "label" columns)
                dataType:        0 fits a new vectorizer on data, 1 reuses vecr
                vecr:            the trained vectorizer, only needed when dataType == 1
            Return Value
                (X, y, vecr) when dataType == 0, (X, y) when dataType == 1, otherwise None
        '''
        if dataType == 0:
            vecr = vectorizerClass(token_pattern = self.TOKEN_PATTERN,
                                   stop_words = self._stop_word_list())
            X = vecr.fit_transform(data["text"])
            y = data["label"]
            return X, y, vecr
        elif dataType == 1:
            X = vecr.transform(data["text"])
            y = data["label"]
            return X, y
        else:
            return None
    
    def TFIDFcoding(self, data, dataType, tfidf_vecr = None):
        ''' Input Value
                data:       the data you are going to apply TF-IDF
                dataType:   whether the data is a training set or a testing set
                            dataType == 0: training set
                            dataType == 1: testing set
                tfidf_vecr: the trained TfidfVectorizer, only needed when dataType == 1
            Return Value
                X:          the attribute columns
                y:          the label column
                tfidf_vecr: the trained TfidfVectorizer, only return when dataType == 0
                (None is returned for any other dataType)
        '''
        # TF-IDF feature coding
        return self._featureCoding(TfidfVectorizer, data, dataType, tfidf_vecr)
    
    def BOWcoding(self, data, dataType, bow_vecr = None):
        ''' Input Value
                data:     the data you are going to apply BOW
                dataType: whether the data is a training set or a testing set
                          dataType == 0: training set
                          dataType == 1: testing set
                bow_vecr: the trained CountVectorizer, only needed when dataType == 1
            Return Value
                X:        the attribute columns
                y:        the label column
                bow_vecr: the trained CountVectorizer, only return when dataType == 0
                (None is returned for any other dataType)
        '''
        # Bag-of-Word feature coding
        return self._featureCoding(CountVectorizer, data, dataType, bow_vecr)
    
    def __test(self, bayes_model, dataTest, vecr, featureMethod = "tfidf"):
        ''' Introduction
                The private method for testing the trained model for selecting the best model by f1score
            Input Value     
                bayes_model:   the naive Bayes model we obtained from singleTrain()
                dataTest:      the testing set
                vecr:          the trained vectorizer
                featureMethod: the method used in feature coding, can use "tfidf" or "bow",
                               default will be "tfidf"
            Return Value
                test_f1score: the f1score
                test_report:  the classification report from sklearn
        '''
        # Use the method of the model under test (the parameter), not self.featureMethod,
        # which describes the previously stored model.
        if featureMethod == "bow":
            X_test, y_test = self.BOWcoding(dataTest, 1, vecr)
        else:
            X_test, y_test = self.TFIDFcoding(dataTest, 1, vecr)
            
        y_pred = bayes_model.predict(X_test)
        
        # NOTE(review): for single-label data the micro-averaged f1-score equals plain accuracy.
        test_f1score = metrics.f1_score(y_test, y_pred, average = "micro")
        test_report = metrics.classification_report(y_test, y_pred)
        
        return test_f1score, test_report


In [6]:
# Load the training CSV (sentiment_Bayes reads its "text" and "label" columns)
# and wrap it, together with the stopword set, in the classifier object.
df = pd.read_csv(path+"train_data_wordCut.csv")
bayes = sentiment_Bayes(df, all_stopwords)

In [7]:
# Five random train/test splits with Bag-of-Words features; the best-scoring model is stored on `bayes`.
bayes.training(iteration = 5, featureMethod = "bow")

Training Record
-------------------------------------------------------------------------
Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)
	 1 		 1 	       0.82641321 	 0.45897032
	 2 		 2 	       0.8084042 	 0.45489501
	 3 		 3 	       0.82091046 	 0.44366024
	 4 		 4 	       0.82291146 	 0.44031402
	 5 		 5 	       0.8064032 	 0.41893889

Training Finish Transcript
-------------------------------------------------------------------------------
Best Model F1_score:  0.8264132066033016 		 From Training No. :  1
-------------------------------------------------------------------------------


In [8]:
# Five more splits with TF-IDF features; the stored model is replaced only when a run
# scores higher than the best f1-score already stored on `bayes`.
bayes.training(iteration = 5, featureMethod = "tfidf")

Training Record
-------------------------------------------------------------------------
Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)
	 6 		 1 	       0.81690845 	 0.34349819
	 7 		 2 	       0.81590795 	 0.36476631
	 8 		 3 	       0.82841421 	 0.44520524
	 9 		 4 	       0.8154077 	 0.43353442
	 10 		 5 	       0.83091546 	 0.41478174

Training Finish Transcript
-------------------------------------------------------------------------------
Best Model F1_score:  0.8309154577288644 		 From Training No. :  10
-------------------------------------------------------------------------------


In [9]:
# Bag-of-Words and TF-IDF give nearly the same f1-score with naive Bayes (see the two runs above).
# We use naive Bayes with Bag-of-Words as our prediction model
# (presumably so that every feature coding method is used by some model — TODO confirm),
# and now run more iterations to look for the best model.
# NOTE(review): training() starts from the best model already stored on `bayes` and replaces it
# only when a new run scores strictly higher, so the TF-IDF model kept by the previous cell
# stays in place unless a Bag-of-Words run beats its score — confirm this is intended.
bayes.training(iteration = 50, featureMethod = "bow")

Training Record
-------------------------------------------------------------------------
Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)
	 11 		 1 	       0.82441221 	 0.33552251
	 12 		 2 	       0.8074037 	 0.35845905
	 13 		 3 	       0.82591296 	 0.39498022
	 14 		 4 	       0.81990995 	 0.41352856
	 15 		 5 	       0.8194097 	 0.42722716
	 16 		 6 	       0.82291146 	 0.39479089
	 17 		 7 	       0.8074037 	 0.45122582
	 18 		 8 	       0.8134067 	 0.40339311
	 19 		 9 	       0.82791396 	 0.43189732
	 20 		 10 	       0.81590795 	 0.40915596
	 21 		 11 	       0.82491246 	 0.41344304
	 22 		 12 	       0.8154077 	 0.42434863
	 23 		 13 	       0.8084042 	 0.42373957
	 24 		 14 	       0.8154077 	 0.41109939
	 25 		 15 	       0.82391196 	 0.43849284
	 26 		 16 	       0.8194097 	 0.46525624
	 27 		 17 	       0.82091046 	 0.54308668
	 28 		 18 	       0.81490745 	 0.43546585
	 29 		 19 	       0.82441221 	 0.44282613
	 30 		 20 	       0.8064032 	 0.433

In [7]:
# Load the labelled verification set and preview it.
df_verify = pd.read_csv(path+"verify_data_wordCut.csv")
display(df_verify)

Unnamed: 0,text,label
0,﻿ 更博 了 ， 爆照 了 ， 帅 的 呀 ， 就是 越来越 爱 你 ！ 生快 傻 缺 [ ...,1
1,土耳其 的 事要 认真对待 [ 哈哈 ] ， 否则 直接 开除 很 是 细心 ， 酒...,1
2,姑娘 都 羡慕 你 呢 … 还有 招财猫 高兴 … … [ 哈哈 ] 小 学徒 一枚 ， 等...,1
3,美 ~ ~ ~ ~ ~ [ 爱 你 ],1
4,梦想 有 多 大 ， 舞台 就 有 多 大 ! [ 鼓掌 ],1
...,...,...
119983,一 公里 不到 ， 县 医院 那个 天桥 下右 拐 200 米 就 到 了 ！ 我 靠 ...,0
119984,今天 真冷 啊 ， 难道 又 要 穿 棉袄 了 [ 晕 ] ？ 今年 的 春天 真的 是 百...,0
119985,最近 几天 就 没 停止 过 ！ ！ ！ [ 伤心 ],0
119986,[ 怒 ] 很惨 !,0


In [8]:
# Classify the verification set, then score the "predict" column against the true "label" column.
df_verify_predicted = bayes.classify(df_verify)
y_veri, y_pred = (df_verify_predicted[column] for column in ("label", "predict"))
test_f1score = metrics.f1_score(y_veri, y_pred, average = "micro")
print(test_f1score)


Predicting Finish Transcript
-----------------------------------------
Time Cost (Seconds):  0.01219614
-----------------------------------------
0.8218738540520719


In [9]:
# Write the true labels next to the naive Bayes predictions so the Bagging step can read them.
df_verified = pd.DataFrame({"label": y_veri, "predict": y_pred})
df_verified.to_csv("verified_data_naiveBayes.csv", index = False)

In [12]:
# Back up the trained classifier here: the whole sentiment_Bayes object is pickled by dumpModel().
dumpModel(bayes, "naiveBayes_trained.model")

In [6]:
# Restore the pickled classifier. Only load model files you created yourself:
# loadModel() uses pickle.load, which can execute arbitrary code from an untrusted file.
bayes_trained = loadModel("naiveBayes_trained.model")

In [14]:
# Load the data to be classified; classify() reads its "text" column.
df_predict = pd.read_csv(path+"predict_data_wordCut.csv")

In [15]:
# Preview the data before prediction.
display(df_predict)

Unnamed: 0,text,timestamp
0,哈哈哈哈 哈哈哈 确实 是 ， 我们 有 宝藏 他们 没有 ！,2020-03-12 01:41:00
1,（ 二 ） 真诚 的 认为 遇上 你 是 我 的 缘,2020-03-12 01:41:00
2,# 欧洲 求助 钟南山 # 人 都 是 思变 的 动物 ， 天仙 美眷 女人 也 留不住 完...,2020-03-12 01:41:00
3,钟老 ， 您 就是 夜空 中 最亮 的 星,2020-03-12 01:41:00
4,两人 目光 呆滞 ， 钟南山 全程 英语 分享 中国 经验 走路 脚跟 离 地 ， 九叔 一...,2020-03-12 01:40:00
...,...,...
3112102,【 港新网 / 哈哈 ！ 黄之锋 动用 关系 从 美国 搞来 了 一批 “ 中国 制造 ” ...,2020-02-13 16:58:00
3112103,上海 居委会 预约 的 口罩 质量 还 不错 哦 四毛 六 一个,2020-02-13 16:58:00
3112104, 李现 # 李现 教 你 如何 节约 口罩 # lx # 李现 公益 正 能量 # ...,2020-02-13 16:58:00
3112105,一次性 口罩 消毒 后 可以 反复 使用 ！ 您 知道 吗 ？ 2 长沙 · 长沙县,2020-02-13 16:58:00


In [16]:
# Predict with the restored classifier; the returned frame carries the result in a "predict" column.
df_predicted = bayes_trained.classify(df_predict)


Predicting Finish Transcript
-----------------------------------------
Time Cost (Seconds):  0.66530721
-----------------------------------------


In [17]:
# Preview the predictions.
display(df_predicted)

Unnamed: 0,text,timestamp,predict
0,哈哈哈哈 哈哈哈 确实 是 ， 我们 有 宝藏 他们 没有 ！,2020-03-12 01:41:00,1
1,（ 二 ） 真诚 的 认为 遇上 你 是 我 的 缘,2020-03-12 01:41:00,0
2,# 欧洲 求助 钟南山 # 人 都 是 思变 的 动物 ， 天仙 美眷 女人 也 留不住 完...,2020-03-12 01:41:00,1
3,钟老 ， 您 就是 夜空 中 最亮 的 星,2020-03-12 01:41:00,1
4,两人 目光 呆滞 ， 钟南山 全程 英语 分享 中国 经验 走路 脚跟 离 地 ， 九叔 一...,2020-03-12 01:40:00,1
...,...,...,...
3112102,【 港新网 / 哈哈 ！ 黄之锋 动用 关系 从 美国 搞来 了 一批 “ 中国 制造 ” ...,2020-02-13 16:58:00,0
3112103,上海 居委会 预约 的 口罩 质量 还 不错 哦 四毛 六 一个,2020-02-13 16:58:00,1
3112104, 李现 # 李现 教 你 如何 节约 口罩 # lx # 李现 公益 正 能量 # ...,2020-02-13 16:58:00,1
3112105,一次性 口罩 消毒 后 可以 反复 使用 ！ 您 知道 吗 ？ 2 长沙 · 长沙县,2020-02-13 16:58:00,0


In [18]:
# Save the predictions to CSV without writing the dataframe index.
df_predicted.to_csv("predict_data_naiveBayes.csv", index = False)