In [1]:
from sklearn import naive_bayes
import pandas as pd
import numpy as np
import jieba as jb

In [2]:
def cut_review(data):
    """将样本进行分词处理"""
    result = []
    stopWord = [' ', ',', '?', '.','-','“','”','/','’', 'is']
    for d in data:
        result.append(list(filter(lambda s: s and s not in stopWord, jb.lcut(d))))
    return result


def create_vocab_list(dataSet):
    """将所有词条集合传入，得到一个所有不重复词条的集合字典"""
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """将词条集合转换为词条向量"""
    returnVec = np.zeros(len(vocabList))
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

def load_train_dataset(path):
    dataset = pd.read_csv(path, lineterminator='\n') 
    data = cut_review(dataset.values[:, 1])

    target = dataset.values[:, 2].copy()
    target[dataset.values[:, 2] == 'Negative'] = 0
    target[dataset.values[:, 2] == 'Positive'] = 1

    return data, np.array(target, dtype='int')

def load_test_dataset(path):
    dataset = pd.read_csv(path, lineterminator='\n')
    data_id = dataset.values[:, 0]
    data = cut_review(dataset.values[:, 1])
    return data_id, data

In [3]:
data, target = load_train_dataset('datasets/train.csv')

vocabList = create_vocab_list(data) # 词条字典
trainMatrix = [] # 建立词条向量
for review in data:
    trainMatrix.append(setOfWords2Vec(vocabList, review))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zlm31\AppData\Local\Temp\jieba.cache
Loading model cost 0.646 seconds.
Prefix dict has been built succesfully.


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainMatrix, target)

In [5]:
nb_clf = naive_bayes.BernoulliNB()
nb_clf.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [6]:
nb_clf.predict_proba(X_test)

array([[2.25458214e-07, 9.99999775e-01],
       [9.52553414e-01, 4.74465863e-02],
       [9.89132229e-01, 1.08677707e-02],
       ...,
       [9.71468410e-01, 2.85315904e-02],
       [5.79673544e-01, 4.20326456e-01],
       [9.77265369e-01, 2.27346305e-02]])

In [7]:
nb_clf.score(X_test, y_test)

0.7503160556257902

In [8]:
def wordfrequency(vocabList, inputSet):
    """统计词频"""
    wordFreDict = {}
    for words in inputSet:
        for word in words:
            if word in vocabList:
                if word not in wordFreDict.keys():
                    wordFreDict[word] = 0
                wordFreDict[word] += 1
    words = dict(sorted(wordFreDict.items(),key=lambda x: x[1], reverse=True))
    return list(words.keys())

In [9]:
wordDict =wordfrequency(vocabList, data)

In [10]:
list(wordDict)

['ki',
 'ke',
 'mein',
 'hai',
 'ko',
 'aur',
 'ka',
 'se',
 'k',
 'ne',
 'bhi',
 'to',
 'ho',
 'hain',
 'par',
 'kar',
 'e',
 'na',
 'nahi',
 'tha',
 'Allah',
 'aik',
 'ye',
 'ha',
 'or',
 'hi',
 'in',
 'wo',
 'he',
 'b',
 'K',
 '😂',
 '...',
 'kiya',
 'jo',
 'main',
 'kay',
 ':',
 '..',
 'Pakistan',
 'ap',
 'koi',
 'Ki',
 'un',
 'hy',
 'thi',
 'liye',
 'the',
 'Ka',
 'sath',
 'Me',
 'Khan',
 'h',
 'apni',
 'yeh',
 'me',
 'ga',
 'gaya',
 'bad',
 'kr',
 'bohat',
 'Se',
 'acha',
 'us',
 'apne',
 'sab',
 'say',
 'kuch',
 'tak',
 'tu',
 'aap',
 'o',
 'Ko',
 'kia',
 'nhi',
 ')',
 'hum',
 'ya',
 '!',
 'ab',
 'gai',
 'raha',
 'per',
 'jis',
 'nai',
 'Is',
 'diya',
 'kaam',
 'bht',
 'sy',
 'baat',
 'nay',
 '....',
 'kisi',
 'achi',
 'lekin',
 'ni',
 'hasil',
 'ALLAH',
 'jab',
 'rahe',
 'hota',
 'dua',
 'pak',
 'rahi',
 '(',
 'pe',
 'leye',
 'Or',
 'tou',
 'khud',
 'karne',
 '2',
 'sirf',
 'kam',
 'film',
 'ker',
 'bhai',
 'ma',
 'de',
 'meri',
 'Ye',
 'di',
 'mai',
 'Hai',
 'iss',
 'log',
 'hu

In [11]:
trainMatrix = [] # 建立词条向量
for review in data:
    trainMatrix.append(setOfWords2Vec(wordDict[:9000], review))

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainMatrix, target)

In [13]:
nb_clf = naive_bayes.BernoulliNB(alpha=0.75)
nb_clf.fit(X_train, y_train)
nb_clf.score(X_test, y_test)

0.7307206068268015

In [14]:
data, target = load_train_dataset('datasets/train.csv')

vocabList = create_vocab_list(data) # 词条字典
trainMatrix = [] # 建立词条向量
for review in data:
    trainMatrix.append(setOfWords2Vec(vocabList, review))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainMatrix, target)

In [15]:
param_grid = {
    'alpha':np.linspace(0, 2, 20)
}

In [None]:
from sklearn.model_selection import GridSearchCV

nb_clf = naive_bayes.BernoulliNB()
grid_search = GridSearchCV(nb_clf, param_grid, n_jobs=-1, verbose=8)
grid_search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   18.1s


In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_