In [1]:
import pandas as pd
import numpy as np
import jieba as jb

In [2]:
def cut_review(data):
    """Tokenize every sample with jieba, dropping empty and whitespace-only tokens.

    Args:
        data: Iterable of texts; each one is passed to ``jb.lcut``.

    Returns:
        list: One list of kept tokens per sample, in input order.
    """
    return [
        [token for token in jb.lcut(sample) if token and token.strip()]
        for sample in data
    ]


def create_vocab_list(dataSet):
    """Build the vocabulary: every distinct token found across all documents.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: Each distinct token exactly once, ordered by first appearance.
    """
    # dict.fromkeys de-duplicates in a single pass and keeps insertion order,
    # so the token -> column mapping is identical on every run. Iterating a
    # set of strings yields an order that can change between interpreter runs.
    return list(dict.fromkeys(token for document in dataSet for token in document))


def setOfWords2Vec(vocabList, inputSet):
    """Convert a token collection into a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: Sequence of hashable vocabulary tokens; the position of a
            token is its column in the returned vector.
        inputSet: Iterable of tokens from one document.

    Returns:
        numpy.ndarray: Float vector of length ``len(vocabList)`` holding 1 at
        the column of every vocabulary token present in ``inputSet`` and 0
        elsewhere. Tokens that are not in the vocabulary are ignored.
    """
    returnVec = np.zeros(len(vocabList))
    # One pass over the vocabulary replaces the two linear scans per token
    # (`in` and `.index`). setdefault keeps the first position of a repeated
    # entry, which is what list.index would have returned.
    first_index = {}
    for position, word in enumerate(vocabList):
        first_index.setdefault(word, position)
    for word in inputSet:
        position = first_index.get(word)
        if position is not None:
            returnVec[position] = 1
    return returnVec

In [8]:
def load_train_dataset(path):
    """Read the training CSV and return tokenized texts with numeric labels.

    The file is read with ``'\\n'`` as the line terminator. The second column
    is tokenized with ``cut_review``; in a copy of the third column the value
    'Negative' is replaced by 0 and 'Positive' by 1 (any other value is left
    as it is).

    Args:
        path: Location of the CSV file.

    Returns:
        tuple: ``(data, target)`` - the tokenized second column and the
        recoded copy of the third column.
    """
    rows = pd.read_csv(path, lineterminator='\n').values
    data = cut_review(rows[:, 1])

    raw_labels = rows[:, 2]
    target = raw_labels.copy()
    # Masks are always taken from the untouched column, never from `target`.
    for label, code in (('Negative', 0), ('Positive', 1)):
        target[raw_labels == label] = code

    return data, target

In [9]:
def load_test_dataset(path):
    """Read the test CSV and return its first column with the tokenized second column.

    The file is read with ``'\\n'`` as the line terminator.

    Args:
        path: Location of the CSV file.

    Returns:
        tuple: ``(data_id, data)`` - the values of the first column and the
        second column tokenized with ``cut_review``.
    """
    rows = pd.read_csv(path, lineterminator='\n').values
    return rows[:, 0], cut_review(rows[:, 1])

In [20]:
def fit(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters and return the word likelihoods in log space.

    Args:
        trainMatrix: Sequence of equal-length word vectors, one per document.
        trainCategory: Sequence of labels aligned with ``trainMatrix``; a label
            equal to 1 marks a Positive document, anything else is counted as
            Negative.

    Returns:
        tuple: ``(pPositive, p1Vect, p0Vect)`` where ``pPositive`` is
        ``sum(trainCategory) / len(trainMatrix)`` and ``p1Vect`` / ``p0Vect``
        are the natural logs of the smoothed per-word ratios for the Positive /
        Negative class. Note the order: the Positive vector comes first.
    """
    # Number of word vectors (documents)
    numTrainDocs = len(trainMatrix)
    # Length of one word vector, i.e. the vocabulary size
    numWords = len(trainMatrix[0])
    # Share of documents labelled Positive
    pPositive = np.sum(trainCategory) / float(numTrainDocs)
    # Per-word counts start at 1 so that no word ends up with probability 0
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    # Running totals of word occurrences per class
    # NOTE(review): every one of the numWords counts starts at 1 while the
    # totals start at 2.0, so each ratio vector does not sum to 1 - confirm
    # whether numWords was intended here.
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # Positive document: add its vector and its word total
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            # Negative document: add its vector and its word total
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])

    # One vectorized log per class instead of a Python loop over every word
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)

    return pPositive, p1Vect, p0Vect

In [21]:
def predict(vec2Classify, p0Vec, p1Vec, pPositive):
    """Classify one word vector from log-likelihood vectors.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word log-likelihoods of the Negative class.
        p1Vec: Per-word log-likelihoods of the Positive class.
        pPositive: Prior probability of the Positive class.

    Returns:
        int: 1 when the Positive log-score is strictly greater than the
        Negative one, otherwise 0.
    """
    # Log-score of a class = log prior + sum of the selected log-likelihoods
    log_positive = np.log(pPositive) + np.sum(vec2Classify * p1Vec)
    log_negative = np.log(1.0 - pPositive) + np.sum(vec2Classify * p0Vec)
    return 1 if log_positive > log_negative else 0

In [27]:
def score(y_test, y_predict):
    """Return the fraction of positions where the two label sequences agree.

    Args:
        y_test: Sequence of reference labels.
        y_predict: Sequence of predicted labels, same shape as ``y_test``.

    Returns:
        Number of equal positions divided by ``len(y_test)``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce first: comparing two plain lists with == gives one bool for the
    # whole list, not an element-wise result.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_predict)
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_predict must have the same shape, got %s and %s"
            % (actual.shape, predicted.shape)
        )
    return np.sum(actual == predicted) / len(actual)

### 加载数据集 生成词条向量

In [13]:
# Load the training CSV: X receives the tokenized texts, y the recoded labels.
X, y = load_train_dataset('datasets/train.csv')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zlm31\AppData\Local\Temp\jieba.cache
Loading model cost 0.889 seconds.
Prefix dict has been built succesfully.


In [14]:
from sklearn.model_selection import train_test_split
# A fixed seed keeps the split, and every score computed from it, the same on
# each Restart & Run All. The outputs recorded in this notebook came from an
# unseeded split, so re-running will not reproduce those exact numbers.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# The vocabulary is built from the training split only; tokens of the held-out
# split that are not in it are ignored by setOfWords2Vec.
vocabList = create_vocab_list(X_train)
# Word vectors of the training split
X_trainMatrix = [setOfWords2Vec(vocabList, x) for x in X_train]
# Word vectors of the held-out split
X_testMatrix = [setOfWords2Vec(vocabList, x) for x in X_test]

In [29]:
# Train on the training split; fit returns the Positive vector before the Negative one.
pPos, p1V, p0V = fit(X_trainMatrix, y_train)

In [33]:
# Inspect the Positive-class vector returned by fit (log values at this point).
p1V

array([-10.53150937,  -9.83836219,  -9.83836219, ..., -10.53150937,
       -10.53150937, -10.53150937])

In [25]:
# Classify every held-out word vector
result = [predict(res, p0V, p1V, pPos) for res in X_testMatrix]

In [28]:
# Pass the arguments in the order score(y_test, y_predict) declares them;
# the value is unchanged because the comparison is symmetric.
score(y_test, result)

0.7635903919089759

In [35]:
def fit(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters as plain (non-log) ratios.

    Args:
        trainMatrix: Sequence of equal-length word vectors, one per document.
        trainCategory: Sequence of labels aligned with ``trainMatrix``; a label
            equal to 1 marks a Positive document, anything else is counted as
            Negative.

    Returns:
        tuple: ``(pPositive, p1Vect, p0Vect)`` where ``pPositive`` is
        ``sum(trainCategory) / len(trainMatrix)`` and ``p1Vect`` / ``p0Vect``
        are the smoothed per-word ratios of the Positive / Negative class.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_positive = np.sum(trainCategory) / float(doc_count)

    # Slot 0 accumulates the Negative class, slot 1 the Positive class.
    # Counts start at 1 and totals at 2.0.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for i in range(doc_count):
        slot = 1 if trainCategory[i] == 1 else 0
        word_counts[slot] += trainMatrix[i]
        word_totals[slot] += np.sum(trainMatrix[i])

    positive_ratios = word_counts[1] / word_totals[1]
    negative_ratios = word_counts[0] / word_totals[0]
    return prior_positive, positive_ratios, negative_ratios

In [36]:
# Re-train with the redefined fit, which returns plain ratios instead of logs.
pPos, p1V, p0V = fit(X_trainMatrix, y_train)

In [37]:
# Inspect the Positive-class vector returned by the redefined fit (plain ratios).
p1V

array([2.66823203e-05, 5.33646406e-05, 5.33646406e-05, ...,
       2.66823203e-05, 2.66823203e-05, 2.66823203e-05])

In [38]:
def predict(vec2Classify, p0Vec, p1Vec, pPositive):
    """Classify one word vector from plain (non-log) likelihood vectors.

    The class score is the prior times the product of the non-zero entries of
    ``vec2Classify * pVec``. The comparison is carried out on the logarithms of
    those scores: multiplying many small probabilities directly underflows to
    0.0 for both classes on long documents, which would always yield 0.

    Args:
        vec2Classify: Word vector of the document (numpy array).
        p0Vec: Per-word likelihoods of the Negative class (positive values).
        p1Vec: Per-word likelihoods of the Positive class (positive values).
        pPositive: Prior probability of the Positive class.

    Returns:
        int: 1 when the Positive score is strictly greater than the Negative
        one, otherwise 0.
    """
    def _log_score(likelihoods, prior):
        # Log of: prior * product of the non-zero entries of vec * likelihoods
        terms = np.asarray(vec2Classify * likelihoods, dtype=float)
        present = terms[terms != 0.0]
        # A prior of exactly 0 gives -inf, which still compares correctly.
        with np.errstate(divide='ignore'):
            return np.sum(np.log(present)) + np.log(prior)

    log_p1 = _log_score(p1Vec, pPositive)
    log_p0 = _log_score(p0Vec, 1.0 - pPositive)
    return 1 if log_p1 > log_p0 else 0

In [39]:
# Classify every held-out word vector
result = [predict(res, p0V, p1V, pPos) for res in X_testMatrix]

In [40]:
# Pass the arguments in the order score(y_test, y_predict) declares them;
# the value is unchanged because the comparison is symmetric.
score(y_test, result)

0.7635903919089759

In [41]:
# Display the predicted 0/1 labels of the held-out split.
result

[0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,


In [42]:
def predict(vec2Classify, p0Vec, p1Vec, pPositive):
    """Return the unnormalized Positive-class score of one word vector.

    Unlike a classifier, this variant returns a score, not a 0/1 label: the
    product of the non-zero entries of ``vec2Classify * p1Vec`` multiplied by
    ``pPositive``.

    Args:
        vec2Classify: Word vector of the document (numpy array).
        p0Vec: Unused; kept so the signature matches the other predict
            definitions in this notebook.
        p1Vec: Per-word likelihoods of the Positive class.
        pPositive: Prior probability of the Positive class.

    Returns:
        float: The Positive-class score described above.
    """
    p1 = 1.0
    for p in vec2Classify * p1Vec:
        # Zero entries mark absent words and are skipped, not multiplied in
        if p != 0.0:
            p1 = p1 * p
    # The Negative-class score was computed but never used; it is no longer
    # evaluated.
    return p1 * pPositive

In [43]:
# Score every held-out word vector with the redefined predict
result = [predict(res, p0V, p1V, pPos) for res in X_testMatrix]

In [44]:
# Display the Positive-class scores returned by the redefined predict.
result

[3.4924755015396952e-12,
 3.970555025168506e-22,
 2.3081317573447233e-54,
 1.5139928902898547e-58,
 4.543677452643683e-89,
 4.2909671023241765e-64,
 1.5965602292752891e-13,
 1.4279026962248221e-34,
 1.4377476849896207e-15,
 1.3668427337645248e-35,
 1.4367156588590337e-89,
 3.615551580626772e-23,
 1.4149550512927102e-23,
 9.5957260509829e-19,
 2.9023844629743453e-47,
 3.5358795256692686e-19,
 8.296309140565955e-19,
 3.104917040454223e-22,
 3.8561689608072725e-75,
 5.845603750052093e-40,
 1.651096955009519e-51,
 3.0803633923580115e-11,
 2.5553531686221252e-31,
 2.7060403249001795e-45,
 0.00018220550325048678,
 6.326089813954331e-16,
 9.628135396679936e-38,
 1.9139199291698345e-141,
 4.353395239000952e-106,
 1.4407458302648156e-39,
 1.305772671302632e-85,
 3.071446467317013e-60,
 1.301507970071508e-28,
 1.6798312063147227e-31,
 1.2707723088087497e-29,
 3.940890158155358e-61,
 2.0707788762215854e-38,
 0.0002522845429622125,
 1.0462937589125526e-156,
 1.3720155901315083e-63,
 4.153689115817

In [None]:
# Assemble an ID / prediction table, print it and write it to datasets/result.csv.
# NOTE(review): `review_id` is not assigned in any cell visible in this notebook
# (load_test_dataset is defined but never called) - confirm where it comes from.
# NOTE(review): `result` was last filled by the predict variant that returns a
# Positive-class score for X_testMatrix, not a 0/1 label - confirm that this is
# what should be written to the 'Pred' column.
data = {'ID': review_id, 'Pred': result}
df = pd.DataFrame(data, columns=['ID', 'Pred'])
print(df)
df.to_csv('datasets/result.csv',index=False)