In [1]:
from sklearn import naive_bayes
import pandas as pd
import numpy as np
import jieba as jb

In [2]:
def cut_review(data):
    """Tokenize every sample and drop stop tokens.

    Each element of ``data`` is segmented with ``jb.lcut``. Empty tokens and
    tokens found in the stop-token table below are discarded.

    Args:
        data: iterable of texts; each one is handed to ``jb.lcut`` unchanged.

    Returns:
        A list with one entry per sample, each entry being the list of kept
        tokens in their original order.
    """
    stop_tokens = (' ', ',', '?', '.', '-', '“', '”', '/', '’', 'is')

    def keep(token):
        # Falsy tokens (such as '') are rejected before the stop-table lookup.
        return bool(token) and token not in stop_tokens

    return [[token for token in jb.lcut(sample) if keep(token)] for sample in data]


def create_vocab_list(dataSet):
    """Build the vocabulary: every distinct token found across all documents.

    Tokens are returned in first-seen order, so the result (and therefore the
    column layout of any feature matrix derived from it) is the same on every
    run. Collecting through a plain ``set`` makes the order depend on string
    hash randomisation, which changes between interpreter restarts.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list of the unique tokens, ordered by first appearance.
    """
    # dict keys deduplicate and keep insertion order in one pass over all
    # tokens, instead of rebuilding a growing set once per document.
    return list(dict.fromkeys(token for document in dataSet for token in document))


def setOfWords2Vec(vocabList, inputSet):
    """Encode a tokenized document as a 0/1 presence vector over the vocabulary.

    Position ``i`` of the result is 1 when ``vocabList[i]`` occurs at least
    once in ``inputSet``; tokens that are not in the vocabulary are ignored.
    If a token appears more than once in ``vocabList``, only its first
    position is set (same as ``list.index``).

    Args:
        vocabList: sequence of hashable vocabulary tokens.
        inputSet: iterable of tokens belonging to one document.

    Returns:
        ``numpy.ndarray`` of floats with length ``len(vocabList)``.
    """
    # Build token -> first index once, so each document token costs one hash
    # lookup instead of two linear scans of the vocabulary (`in` + `.index`).
    first_index = {}
    for idx, token in enumerate(vocabList):
        first_index.setdefault(token, idx)

    returnVec = np.zeros(len(vocabList))
    for word in inputSet:
        idx = first_index.get(word)
        if idx is not None:
            returnVec[idx] = 1
    return returnVec

def load_train_dataset(path):
    """Read the labelled CSV and return tokenized samples with integer labels.

    The file is parsed with ``'\\n'`` as the line terminator. The column at
    position 1 is tokenized with ``cut_review``; the column at position 2
    holds the labels, where ``'Negative'`` becomes 0 and ``'Positive'``
    becomes 1. Any other label value is left untouched and has to be
    convertible to ``int`` for the final conversion to succeed.

    Args:
        path: location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(data, target)``: the ``cut_review`` result and an integer
        ``numpy.ndarray`` of labels.
    """
    table = pd.read_csv(path, lineterminator='\n').values
    tokens = cut_review(table[:, 1])

    labels = table[:, 2]
    encoded = labels.copy()
    # Masks are computed against the untouched `labels`, never against the
    # array that is being rewritten.
    for name, code in (('Negative', 0), ('Positive', 1)):
        encoded[labels == name] = code

    return tokens, np.array(encoded, dtype='int')

def load_test_dataset(path):
    """Read the unlabelled CSV and return its ids with the tokenized samples.

    The file is parsed with ``'\\n'`` as the line terminator. The column at
    position 0 is returned as the ids; the column at position 1 is tokenized
    with ``cut_review``.

    Args:
        path: location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(data_id, data)``: the id column as a ``numpy.ndarray`` and
        the ``cut_review`` result.
    """
    table = pd.read_csv(path, lineterminator='\n').values
    return table[:, 0], cut_review(table[:, 1])

In [3]:
# Load and tokenize both CSV files.
data, target = load_train_dataset('datasets/train.csv')
data_id, dataf = load_test_dataset('datasets/test.csv')

# Vocabulary comes from the training samples only; test tokens that are not
# in it are ignored by setOfWords2Vec.
vocabList = create_vocab_list(data)

# One presence vector per sample, all laid out over the same vocabulary.
trainMatrix = [setOfWords2Vec(vocabList, review) for review in data]
testMatrix = [setOfWords2Vec(vocabList, review) for review in dataf]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zlm31\AppData\Local\Temp\jieba.cache
Loading model cost 0.650 seconds.
Prefix dict has been built succesfully.


In [84]:
# Hold out part of the labelled samples for evaluation. No split size is
# given, so train_test_split's defaults apply; the fixed random_state makes
# the shuffle repeatable.
# NOTE(review): this import would normally live in the notebook's import cell.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainMatrix, target, random_state=666)

In [85]:
# Bernoulli naive Bayes with default settings, fitted on the 0/1 presence
# vectors of the training split. fit() is the last expression, so the cell
# displays the fitted estimator.
nb_clf = naive_bayes.BernoulliNB()
nb_clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [86]:
# Per-class probabilities for the held-out samples (one row per sample, one
# column per class).
nb_clf.predict_proba(X_test)

array([[2.00373424e-11, 1.00000000e+00],
       [3.72440418e-05, 9.99962756e-01],
       [3.96813791e-02, 9.60318621e-01],
       ...,
       [9.58071389e-01, 4.19286115e-02],
       [6.56808180e-01, 3.43191820e-01],
       [9.89239054e-01, 1.07609455e-02]])

In [87]:
# Hard 0/1 label predictions for the held-out samples.
nb_clf.predict(X_test)

array([1, 1, 1, ..., 0, 0, 0])

In [88]:
# Evaluate on the held-out split (for scikit-learn classifiers, score() is
# the mean accuracy).
nb_clf.score(X_test, y_test)

0.7553729456384324

In [9]:
# Class probabilities for the unlabelled test set, using the same vocabulary
# layout the classifier was fitted on.
result = nb_clf.predict_proba(testMatrix)

In [10]:
# Build the submission table: test-set id plus the second probability column.
# The training labels are encoded as 0/1, so that column is presumably the
# probability of label 1 ('Positive') — confirm with nb_clf.classes_.
res = {'ID': data_id, 'Pred': result[:, 1]}
df = pd.DataFrame(res, columns=['ID', 'Pred'])
# NOTE(review): print() shows the plain-text repr of the frame; the CSV is
# written without the index column.
print(df)
df.to_csv('datasets/result.csv',index=False)

        ID          Pred
0        1  9.835102e-01
1        2  7.727091e-01
2        3  2.261073e-01
3        4  9.991258e-01
4        5  1.397580e-02
5        6  9.999639e-01
6        7  9.999886e-01
7        8  7.811839e-01
8        9  7.410139e-01
9       10  3.883135e-01
10      11  9.991434e-01
11      12  1.180573e-01
12      13  8.630645e-01
13      14  9.903312e-01
14      15  9.002625e-01
15      16  7.104020e-02
16      17  6.783145e-03
17      18  6.109343e-01
18      19  9.999242e-01
19      20  9.999997e-01
20      21  6.868052e-01
21      22  1.599020e-05
22      23  4.261495e-01
23      24  9.990261e-01
24      25  1.873580e-01
25      26  2.778957e-08
26      27  6.051532e-02
27      28  3.833371e-01
28      29  3.339785e-01
29      30  9.508784e-01
...    ...           ...
2682  2683  1.277458e-01
2683  2684  9.999520e-01
2684  2685  9.965837e-01
2685  2686  4.490566e-02
2686  2687  9.997511e-01
2687  2688  9.997883e-01
2688  2689  9.999995e-01
2689  2690  1.000000e+00


In [11]:
def wordfrequency(vocabList, inputSet):
    """Rank vocabulary tokens by how often they occur in the documents.

    Every occurrence of a token in ``inputSet`` is counted, provided the token
    is in ``vocabList``. Tokens that never occur are not part of the result.

    Args:
        vocabList: iterable of hashable vocabulary tokens.
        inputSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list of tokens sorted by descending count; tokens with equal counts
        keep the order in which they were first encountered.
    """
    # Set membership is one hash lookup; testing against the list scanned the
    # whole vocabulary for every token of every document.
    vocab = set(vocabList)

    counts = {}
    for document in inputSet:
        for word in document:
            if word in vocab:
                counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, also with reverse=True, so ties stay in first-seen
    # (dict insertion) order.
    return sorted(counts, key=counts.get, reverse=True)

In [12]:
# Training-set tokens ordered from most to least frequent. Despite the name,
# wordDict is a list: wordfrequency returns a list of tokens.
wordDict = wordfrequency(vocabList, data)

In [13]:
# Display the ranked tokens. wordDict is already a list, so list() only makes
# a shallow copy for display.
# NOTE(review): this prints the entire ranking; a slice would keep the output short.
list(wordDict)

['ki',
 'ke',
 'mein',
 'hai',
 'ko',
 'aur',
 'ka',
 'se',
 'k',
 'ne',
 'bhi',
 'to',
 'ho',
 'hain',
 'par',
 'kar',
 'e',
 'na',
 'nahi',
 'tha',
 'Allah',
 'aik',
 'ye',
 'ha',
 'or',
 'hi',
 'in',
 'wo',
 'he',
 'b',
 'K',
 '😂',
 '...',
 'kiya',
 'jo',
 'main',
 'kay',
 ':',
 '..',
 'Pakistan',
 'ap',
 'koi',
 'Ki',
 'un',
 'hy',
 'thi',
 'liye',
 'the',
 'Ka',
 'sath',
 'Me',
 'Khan',
 'h',
 'apni',
 'yeh',
 'me',
 'ga',
 'gaya',
 'bad',
 'kr',
 'bohat',
 'Se',
 'acha',
 'us',
 'apne',
 'sab',
 'say',
 'kuch',
 'tak',
 'tu',
 'aap',
 'o',
 'Ko',
 'kia',
 'nhi',
 ')',
 'hum',
 'ya',
 '!',
 'ab',
 'gai',
 'raha',
 'per',
 'jis',
 'nai',
 'Is',
 'diya',
 'kaam',
 'bht',
 'sy',
 'baat',
 'nay',
 '....',
 'kisi',
 'achi',
 'lekin',
 'ni',
 'hasil',
 'ALLAH',
 'jab',
 'rahe',
 'hota',
 'dua',
 'pak',
 'rahi',
 '(',
 'pe',
 'leye',
 'Or',
 'tou',
 'khud',
 'karne',
 '2',
 'sirf',
 'kam',
 'film',
 'ker',
 'bhai',
 'ma',
 'de',
 'meri',
 'Ye',
 'di',
 'mai',
 'Hai',
 'iss',
 'log',
 'hu

In [101]:
# Rebuild the training features over the 13000 most frequent tokens only.
# NOTE(review): this overwrites the full-vocabulary trainMatrix, while
# testMatrix (built in an earlier cell) still has one column per vocabList
# entry. Rebuild testMatrix over the same token list before predicting with a
# model fitted on these features.
# The slice is taken once; slicing inside the loop copied the 13000-token list
# for every sample.
topWords = wordDict[:13000]
trainMatrix = [setOfWords2Vec(topWords, review) for review in data]

In [102]:
# Same split call as for the full-vocabulary features, with the same
# random_state, now applied to the reduced-vocabulary trainMatrix.
# NOTE(review): train_test_split is already imported by an earlier cell.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainMatrix, target, random_state=666)

In [103]:
# Refit a fresh default BernoulliNB on the reduced-vocabulary split. This
# replaces the earlier nb_clf, and the held-out score is displayed as the
# cell result.
nb_clf = naive_bayes.BernoulliNB()
nb_clf.fit(X_train, y_train)
nb_clf.score(X_test, y_test)

0.7509481668773704