In [1]:
# Directory prefix prepended to the CSV file names read by this notebook.
path = "./"

In [2]:
import re
import time
import pickle
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes
from sklearn import metrics
import xgboost as xgb

In [3]:
# Union of the two stop-word lists, used by the vectorizers in sentiment_XGB.
all_stopwords = set()

# We revised the Baidu Stopwords and Sichuan University Stopwords to avoid filtering out the emotion words
for stopword_path in ("baidu_stopwords_revised.txt", "scu_stopwords_revised.txt"):
    # `with` guarantees the file handle is closed even if reading fails.
    with open(stopword_path, encoding = "UTF-8") as stopword_file:
        # One stop word per line; only the trailing newline is removed.
        all_stopwords.update(line.strip("\n") for line in stopword_file)

In [4]:
def dumpModel(model, filename):
    ''' Introduction
            Serialize a model to disk with pickle.
        Input Value
            model:    the model you are going to dump
            filename: the filename of the file to store the model, you are strongly suggested to use
                      ".model" as the suffix
    '''
    # The context manager flushes and closes the file, so the dump is complete on return.
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    
def loadModel(filename):
    ''' Introduction
            Load a model that was previously stored with pickle.
        Input Value
            filename: the filename of the file storing the model, you are strongly suggested to use
                      ".model" as the suffix
        Return Value
            the unpickled object
    '''
    # SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [5]:
class sentiment_XGB():
    ''' Introduction
            A binary sentiment classifier: the "text" column is encoded with Bag-of-Word or TF-IDF
            and an XGBoost booster is fitted on the "label" column (1 = positive, 0 = negative).
    '''
    def __init__(self, data, stopwords):
        ''' Input Value
                data:      the dataset you are going to use to train the model, it will be split to 
                           two dataset, one is for training (dfTrain) and one is for testing (dfTest)
                stopwords: the stopwords for Bag-of-Word / TF-IDF
        '''
        self.data = data
        self.stopwords = stopwords
        
        self.vecr = None # the vectorizer for feature coding
        self.featureMethod = "undefined" # feature coding method of the stored best model
        
        self.xgb_model = None # the best trained XGBoost model
        self.f1_score = 0  # the f1-score of the best model
        self.classify_report = None # the detailed classification report from sklearn
        
        self.bestCnt = 0 # the counter's value when we obtained the best model
        self.trainCnt = 0 # the counter for training
    
    @staticmethod
    def _binarize(probabilities, threshold = 0.5):
        ''' Introduction
                Turn predicted probabilities into hard 0/1 labels without touching the input array
            Input Value
                probabilities: array of predicted probabilities
                threshold:     values >= threshold become 1, values below become 0
            Return Value
                labels:        array of the same dtype as the input holding 0 / 1
        '''
        labels = np.array(probabilities, copy = True)
        labels[probabilities >= threshold] = 1
        labels[probabilities < threshold] = 0
        return labels
    
    def _stopword_list(self):
        ''' Introduction
                Return the stopwords in a form accepted by the sklearn vectorizers
                (recent scikit-learn versions validate `stop_words` and reject a plain set)
        '''
        if self.stopwords is None or isinstance(self.stopwords, str):
            return self.stopwords
        return list(self.stopwords)
    
    def classify(self, dfClassify):
        ''' Input Value     
                dfClassify: the dataframe includes the data we are going to classify
            Return Value
                resultDF:   a copy of dfClassify which shows the classification result
                            on column "predict" (dfClassify itself is left unchanged)
        '''
        
        X = self.vecr.transform(dfClassify["text"])
        # Work on a copy so the caller's dataframe is not modified as a side effect
        resultDF = dfClassify.copy()
        
        # Only the prediction itself is timed, not the vectorization above
        start = time.perf_counter()
        dmatrix = xgb.DMatrix(X)
        y = self._binarize(self.xgb_model.predict(dmatrix))
        end = time.perf_counter()
        timeCost = end - start
        
        print("\nPredicting Finish Transcript")
        print("-----------------------------------------")
        print("Time Cost (Seconds): ",round(timeCost,8))
        print("-----------------------------------------")
        
        resultDF["predict"] = y
        return resultDF
    
    def training(self, iteration = 20, percent = 0.2, featureMethod = "tfidf", inplace = True):
        ''' Input Value     
                iteration:     the iteration round of training, default will be 20
                percent:       the percentage of the testing set among the whole dataset, default will be 0.2
                featureMethod: the method used in feature coding, can use "tfidf" or "bow",
                               default will be "tfidf"
                inplace:       whether to update the model if we have trained a better model, 
                               default will be True
            Return Value
                None when inplace is True, otherwise the tuple
                (best model, best vectorizer, best f1-score, best classification report)
        '''
        
        # Start from the currently stored model: it is only replaced by a strictly better one
        bestModel = self.xgb_model
        bestVecr = self.vecr
        bestFeatureMethod = self.featureMethod
        bestF1_score = self.f1_score
        bestReport = self.classify_report
        bestModelCnt = self.bestCnt
        
        print("Training Record")
        print("-------------------------------------------------------------------------")
        print("Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)")
        
        for i in range(iteration):
            # Every round uses a fresh random stratified split
            dfTrain, dfTest = self.random_dataSplit(self.data, percent)
            
            start = time.perf_counter()
            vecr, xgb_model = self.singleTrain(dfTrain, featureMethod)
            ac, report = self.__test(xgb_model, dfTest, vecr, featureMethod)
            end = time.perf_counter()
            
            timeCost = end - start
            self.trainCnt += 1
            
            print("\t",self.trainCnt,"\t\t",i+1,"\t      ",round(ac,8),"\t",round(timeCost,8))
            
            if ac > bestF1_score:
                bestModel = xgb_model
                bestVecr = vecr
                bestFeatureMethod = featureMethod
                bestF1_score = ac
                bestReport = report
                bestModelCnt = self.trainCnt
        
        print("\nTraining Finish Transcript")
        print("-------------------------------------------------------------------------------")
        print("Best Model F1_score: ",bestF1_score,"\t\t","From Training No. : ",bestModelCnt)
        print("-------------------------------------------------------------------------------")
        
        if inplace:
            self.xgb_model = bestModel
            self.vecr = bestVecr
            # Record the encoding of the model actually kept, which may come from an earlier call
            self.featureMethod = bestFeatureMethod
            self.f1_score = bestF1_score
            self.classify_report = bestReport
            self.bestCnt = bestModelCnt
            return None
        
        # Return the best model, not merely the one trained in the last round
        return bestModel, bestVecr, bestF1_score, bestReport
        
    def singleTrain(self, dataTrain, featureMethod = "tfidf"):
        ''' Input Value     
                dataTrain:     the dataset for training
                featureMethod: the method used in feature coding, can use "tfidf" or "bow",
                               default will be "tfidf"
            Return Value
                vecr:          the trained vectorizer
                xgb_model:     the trained XGBoost model
        '''
        if featureMethod == "bow":
            X_train, y_train, vecr = self.BOWcoding(dataTrain, 0)
        else:
            X_train, y_train, vecr = self.TFIDFcoding(dataTrain, 0)
        
        param = {
            'booster':'gbtree',
            'max_depth': 6, 
            'scale_pos_weight': 0.5,
            'colsample_bytree': 0.8,
            'objective': 'binary:logistic',
            'eval_metric': 'error',
            'eta': 0.3,
            'nthread': 10,
        }
        dmatrix = xgb.DMatrix(X_train, label = y_train)
        xgb_model = xgb.train(param, dmatrix, num_boost_round = 200)
        
        return vecr, xgb_model

    def random_dataSplit(self, dataTrain, percent = 0.2):
        ''' Input Value
                dataTrain: the dataset you are going to use to train the model, it will be split to 
                           two dataset, one is for training (dfTrain) and one is for testing (dfTest)
                percent:   the percentage of the testing set among the whole dataset, default will be 0.2
            Return Value
                dfTrain:   splited dataset for training
                dfTest:    splited dataset for testing
        '''
        
        # Split the data into two parts according to the category, 
        # And split the two parts into training set and testing set according to "percent"
        # Then combine the training set together and the testing set together 
        # to ensure the training set and testing set has a balanced proportion of categories
        # Note: rows whose label is neither 1 nor 0 end up in neither set
        
        dataPos = dataTrain.loc[dataTrain["label"] == 1]
        dataNeg = dataTrain.loc[dataTrain["label"] == 0]
        shuffleIdxPos = np.random.permutation(dataPos.shape[0])
        shuffleIdxNeg = np.random.permutation(dataNeg.shape[0])
        
        dataPosTestSize = int(dataPos.shape[0]*percent)
        dataNegTestSize = int(dataNeg.shape[0]*percent)
        
        dfTrainPos = dataPos.iloc[shuffleIdxPos[dataPosTestSize:]]
        dfTestPos = dataPos.iloc[shuffleIdxPos[:dataPosTestSize]]
        
        dfTrainNeg = dataNeg.iloc[shuffleIdxNeg[dataNegTestSize:]]
        dfTestNeg = dataNeg.iloc[shuffleIdxNeg[:dataNegTestSize]]
        
        dfTrain = pd.concat([dfTrainPos, dfTrainNeg]).reset_index(drop = True)
        dfTest  = pd.concat([dfTestPos, dfTestNeg]).reset_index(drop = True)
        
        return dfTrain, dfTest
    
    def TFIDFcoding(self, data, dataType, tfidf_vecr = None):
        ''' Input Value
                data:       the data you are going to apply TF-IDF
                dataType:   whether the data is a training set or a testing set
                            dataType == 0: training set
                            dataType == 1: testing set
                tfidf_vecr: the trained TfidfVectorizer, only needed when dataType == 1
            Return Value
                X:          the attribute columns
                y:          the label column
                tfidf_vecr: the trained TfidfVectorizer, only return when dataType == 0
                (None is returned for any other dataType)
        '''
        # TF-IDF feature coding
        if dataType == 0:
            # Raw string keeps the regex backslashes literal; a token is a word with optional [ ] around it
            tfidf_vecr = TfidfVectorizer(token_pattern = r'\[?\w+\]?', stop_words = self._stopword_list())
            X = tfidf_vecr.fit_transform(data["text"])
            y = data["label"]
            return X, y, tfidf_vecr
        elif dataType == 1:
            X = tfidf_vecr.transform(data["text"])
            y = data["label"]
            return X, y
        else:
            return None
    
    def BOWcoding(self, data, dataType, bow_vecr = None):
        ''' Input Value
                data:     the data you are going to apply BOW
                dataType: whether the data is a training set or a testing set
                          dataType == 0: training set
                          dataType == 1: testing set
                bow_vecr: the trained CountVectorizer, only needed when dataType == 1
            Return Value
                X:        the attribute columns
                y:        the label column
                bow_vecr: the trained CountVectorizer, only return when dataType == 0
                (None is returned for any other dataType)
        '''
        # Bag-of-Word feature coding
        if dataType == 0:
            # Raw string keeps the regex backslashes literal; a token is a word with optional [ ] around it
            bow_vecr = CountVectorizer(token_pattern = r'\[?\w+\]?', stop_words = self._stopword_list())
            X = bow_vecr.fit_transform(data["text"])
            y = data["label"]
            return X, y, bow_vecr
        elif dataType == 1:
            X = bow_vecr.transform(data["text"])
            y = data["label"]
            return X, y
        else:
            return None
    
    def __test(self, xgb_model, dataTest, vecr, featureMethod = "tfidf"):
        ''' Introduction
                The private method for testing the trained model for selecting the best model by f1score
            Input Value     
                xgb_model:     the XGBoost model we obtained from singleTrain()
                dataTest:      the testing set
                vecr:          the trained vectorizer
                featureMethod: the method used in feature coding, can use "tfidf" or "bow",
                               default will be "tfidf"
            Return Value
                test_f1score: the micro-averaged f1score
                test_report:  the classification report from sklearn
        '''
        # Use the encoding of the model under test, not the one stored on the instance
        if featureMethod == "bow":
            X_test, y_test = self.BOWcoding(dataTest, 1, vecr)
        else:
            X_test, y_test = self.TFIDFcoding(dataTest, 1, vecr)
        
        dmatrix = xgb.DMatrix(X_test)
        y_pred = self._binarize(xgb_model.predict(dmatrix))
        
        test_f1score = metrics.f1_score(y_test, y_pred, average = "micro")
        test_report = metrics.classification_report(y_test, y_pred)
        
        return test_f1score, test_report


In [21]:
# Load the training CSV and wrap it, together with the stop-word set, in the classifier.
# NOTE(review): sentiment_XGB reads the "text" and "label" columns — confirm the CSV provides both.
df = pd.read_csv(path+"train_data_wordCut.csv")
XGB = sentiment_XGB(df, all_stopwords)

In [22]:
# Train on 5 random train/test splits with Bag-of-Word features; the best-scoring model is stored on XGB.
XGB.training(iteration = 5, featureMethod = "bow")

Training Record
-------------------------------------------------------------------------
Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)
	 1 		 1 	       0.82341171 	 5.20033998
	 2 		 2 	       0.82841421 	 6.32952548
	 3 		 3 	       0.82841421 	 7.27272253
	 4 		 4 	       0.82291146 	 7.60513378
	 5 		 5 	       0.8174087 	 7.16716193

Training Finish Transcript
-------------------------------------------------------------------------------
Best Model F1_score:  0.8284142071035517 		 From Training No. :  2
-------------------------------------------------------------------------------


In [23]:
# Repeat the experiment with TF-IDF features. The best score is carried over from earlier calls,
# so the stored model is only replaced if one of these runs beats it.
XGB.training(iteration = 5, featureMethod = "tfidf")

Training Record
-------------------------------------------------------------------------
Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)
	 6 		 1 	       0.7953977 	 11.62900703
	 7 		 2 	       0.79989995 	 12.88836594
	 8 		 3 	       0.81490745 	 12.45530359
	 9 		 4 	       0.8084042 	 12.69905558
	 10 		 5 	       0.80590295 	 16.38711024

Training Finish Transcript
-------------------------------------------------------------------------------
Best Model F1_score:  0.8284142071035517 		 From Training No. :  2
-------------------------------------------------------------------------------


In [24]:
# In the runs above, XGBoost with Bag-of-Word scored better than XGBoost with TF-IDF.
# Therefore we use XGBoost with Bag-of-Word as our model for prediction,
# and run more training rounds to look for the best model.
XGB.training(iteration = 50, featureMethod = "bow")

Training Record
-------------------------------------------------------------------------
Total Training No. | Train Iter. No. |  f1_score  |   Time Cost (Seconds)
	 11 		 1 	       0.82741371 	 5.74930577
	 12 		 2 	       0.82241121 	 6.42344167
	 13 		 3 	       0.81490745 	 8.43734376
	 14 		 4 	       0.82141071 	 7.8077566
	 15 		 5 	       0.81790895 	 7.73686426
	 16 		 6 	       0.82241121 	 7.30321976
	 17 		 7 	       0.82941471 	 7.70376301
	 18 		 8 	       0.8104052 	 8.08364601
	 19 		 9 	       0.8154077 	 8.09688233
	 20 		 10 	       0.80890445 	 7.8309903
	 21 		 11 	       0.8184092 	 8.00135365
	 22 		 12 	       0.82191096 	 9.41382562
	 23 		 13 	       0.82191096 	 8.33957885
	 24 		 14 	       0.82641321 	 8.26821281
	 25 		 15 	       0.82141071 	 8.20807695
	 26 		 16 	       0.82841421 	 8.00696177
	 27 		 17 	       0.8154077 	 8.05359279
	 28 		 18 	       0.82491246 	 7.98776789
	 29 		 19 	       0.81190595 	 8.08861135
	 30 		 20 	       0.82441221 	 8.

In [6]:
# Load the verification CSV and preview it.
df_verify = pd.read_csv(path+"verify_data_wordCut.csv")
display(df_verify)

Unnamed: 0,text,label
0,﻿ 更博 了 ， 爆照 了 ， 帅 的 呀 ， 就是 越来越 爱 你 ！ 生快 傻 缺 [ ...,1
1,土耳其 的 事要 认真对待 [ 哈哈 ] ， 否则 直接 开除 很 是 细心 ， 酒...,1
2,姑娘 都 羡慕 你 呢 … 还有 招财猫 高兴 … … [ 哈哈 ] 小 学徒 一枚 ， 等...,1
3,美 ~ ~ ~ ~ ~ [ 爱 你 ],1
4,梦想 有 多 大 ， 舞台 就 有 多 大 ! [ 鼓掌 ],1
...,...,...
119983,一 公里 不到 ， 县 医院 那个 天桥 下右 拐 200 米 就 到 了 ！ 我 靠 ...,0
119984,今天 真冷 啊 ， 难道 又 要 穿 棉袄 了 [ 晕 ] ？ 今年 的 春天 真的 是 百...,0
119985,最近 几天 就 没 停止 过 ！ ！ ！ [ 伤心 ],0
119986,[ 怒 ] 很惨 !,0


In [8]:
# Classify the verification set and score the predictions against the true labels.
# Micro-averaged F1 is used, the same metric that selects the best model during training.
df_verify_predicted = XGB.classify(df_verify)
y_veri = df_verify_predicted["label"]
y_pred = df_verify_predicted["predict"]
test_f1score = metrics.f1_score(y_veri, y_pred, average = "micro")
print(test_f1score)


Predicting Finish Transcript
-----------------------------------------
Time Cost (Seconds):  0.28205458
-----------------------------------------
0.8215404873820715


In [9]:
# Export the verified result for Bagging: true label next to the predicted label, one row per sample.
df_verified = pd.DataFrame({"label": y_veri, "predict": y_pred})
df_verified.to_csv("verified_data_XGB.csv", index = False)

In [27]:
# Backup the trained model here.
# The whole sentiment_XGB object (vectorizer + booster) is stored so that classify() works after reloading.
dumpModel(XGB, "XGB_trained.model")

In [7]:
# Reload the pickled classifier from disk.
# NOTE(review): pickle can execute arbitrary code on load — only load model files you created yourself.
XGB_trained = loadModel("XGB_trained.model")

In [29]:
# Load the data to be classified.
df_predict = pd.read_csv(path+"predict_data_wordCut.csv")

In [30]:
# Preview the data to be classified.
display(df_predict)

Unnamed: 0,text,timestamp
0,哈哈哈哈 哈哈哈 确实 是 ， 我们 有 宝藏 他们 没有 ！,2020-03-12 01:41:00
1,（ 二 ） 真诚 的 认为 遇上 你 是 我 的 缘,2020-03-12 01:41:00
2,# 欧洲 求助 钟南山 # 人 都 是 思变 的 动物 ， 天仙 美眷 女人 也 留不住 完...,2020-03-12 01:41:00
3,钟老 ， 您 就是 夜空 中 最亮 的 星,2020-03-12 01:41:00
4,两人 目光 呆滞 ， 钟南山 全程 英语 分享 中国 经验 走路 脚跟 离 地 ， 九叔 一...,2020-03-12 01:40:00
...,...,...
3112102,【 港新网 / 哈哈 ！ 黄之锋 动用 关系 从 美国 搞来 了 一批 “ 中国 制造 ” ...,2020-02-13 16:58:00
3112103,上海 居委会 预约 的 口罩 质量 还 不错 哦 四毛 六 一个,2020-02-13 16:58:00
3112104, 李现 # 李现 教 你 如何 节约 口罩 # lx # 李现 公益 正 能量 # ...,2020-02-13 16:58:00
3112105,一次性 口罩 消毒 后 可以 反复 使用 ！ 您 知道 吗 ？ 2 长沙 · 长沙县,2020-02-13 16:58:00


In [34]:
# Predict the sentiment of every row; the returned dataframe carries the result in column "predict".
df_predicted = XGB_trained.classify(df_predict)


Predicting Finish Transcript
-----------------------------------------
Time Cost (Seconds):  9.90834145
-----------------------------------------


In [35]:
# Preview the predictions.
display(df_predicted)

Unnamed: 0,text,timestamp,predict
0,哈哈哈哈 哈哈哈 确实 是 ， 我们 有 宝藏 他们 没有 ！,2020-03-12 01:41:00,1.0
1,（ 二 ） 真诚 的 认为 遇上 你 是 我 的 缘,2020-03-12 01:41:00,1.0
2,# 欧洲 求助 钟南山 # 人 都 是 思变 的 动物 ， 天仙 美眷 女人 也 留不住 完...,2020-03-12 01:41:00,1.0
3,钟老 ， 您 就是 夜空 中 最亮 的 星,2020-03-12 01:41:00,1.0
4,两人 目光 呆滞 ， 钟南山 全程 英语 分享 中国 经验 走路 脚跟 离 地 ， 九叔 一...,2020-03-12 01:40:00,1.0
...,...,...,...
3112102,【 港新网 / 哈哈 ！ 黄之锋 动用 关系 从 美国 搞来 了 一批 “ 中国 制造 ” ...,2020-02-13 16:58:00,1.0
3112103,上海 居委会 预约 的 口罩 质量 还 不错 哦 四毛 六 一个,2020-02-13 16:58:00,1.0
3112104, 李现 # 李现 教 你 如何 节约 口罩 # lx # 李现 公益 正 能量 # ...,2020-02-13 16:58:00,1.0
3112105,一次性 口罩 消毒 后 可以 反复 使用 ！ 您 知道 吗 ？ 2 长沙 · 长沙县,2020-02-13 16:58:00,1.0


In [36]:
# Write the predictions to CSV, omitting the dataframe index.
df_predicted.to_csv("predict_data_XGB.csv", index = False)