In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

In [2]:
# Directory that holds the Kaggle TSV files.
root_dir = "D:\\opt\\kaggle-nlp"

# Load the labeled training set and the test set.
# Both files are tab-separated with a header row. quoting=3 (csv.QUOTE_NONE)
# keeps double quotes as literal characters, which is why ids and reviews
# still show their surrounding quotes in the outputs below.
train_path = '%s/%s' % (root_dir, 'labeledTrainData.tsv')
test_path = '%s/%s' % (root_dir, 'testData.tsv')
train = pd.read_csv(train_path, header=0, delimiter="\t", quoting=3)
test = pd.read_csv(test_path, header=0, delimiter="\t", quoting=3)

In [3]:
# Display the labeled training DataFrame as the cell output.
train

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


In [7]:
# Quick look at the data: shape, column names and first three rows of the
# training set (each followed by a separator line), then the first three rows
# of the test set.
separator = "=" * 100
for summary in (train.shape, train.columns.values, train.head(3)):
    print(summary)
    print(separator)
print(test.head(3))

(25000, 3)
['id' 'sentiment' 'review']
         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
           id                                             review
0  "12311_10"  "Naturally in a film who's main themes are of ...
1    "8348_2"  "This movie is a disaster within a disaster fi...
2    "5828_4"  "All in all, this is a movie for kids. We saw ...


In [8]:
def review_to_wordlist(review):
    '''
    Turn one raw IMDB review into a list of lowercase words.
    Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613

    The HTML markup is stripped first, then every character that is not an
    ASCII letter is replaced by a space, and finally the text is lowercased
    and split on whitespace.
    '''
    # Strip the HTML tags and keep only the text content.
    plain_text = BeautifulSoup(review, "html.parser").get_text()
    # Anything other than a-z / A-Z becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    # Lowercase and split into the word list that is returned.
    return letters_only.lower().split()

In [9]:
# Preprocess the data.
label = train['sentiment']

# Clean every review and re-join its words with single spaces, so each entry
# is one normalized string. Iterating over the column directly avoids a
# label-based lookup (train['review'][i]) for every row, which is slow and
# only works when the index happens to be 0..n-1.
train_data = [' '.join(review_to_wordlist(review)) for review in train['review']]
test_data = [' '.join(review_to_wordlist(review)) for review in test['review']]

# Preview the first cleaned review of each set.
print(train_data[0], '\n')
print(test_data[0])

with all this stuff going down at the moment with mj i ve started listening to his music watching the odd documentary here and there watched the wiz and watched moonwalker again maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for minutes or so

# 特征处理
kaggle-nlp 采用BOW方法处理了特征，下面我们采用TF-IDF向量、Word2vec向量做特征处理
## 1.TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

# TfidfVectorizer parameter notes:
#   min_df: ignore terms that appear in fewer than this many documents (2 here).
#   max_features: None by default; if an int, only the top max_features terms
#       ordered by term frequency across the corpus are kept.
#   strip_accents: remove accents from the raw documents during preprocessing,
#       using either the 'ascii' or the 'unicode' method.
#   analyzer: whether features are made of words or characters.
#   token_pattern: regular expression that defines a token; only used when
#       analyzer == 'word'. The default pattern selects tokens of 2 or more
#       alphanumeric characters and treats punctuation as a separator.
#   ngram_range: (min_n, max_n) range of n-gram sizes to extract.
#   use_idf: True by default, so weights are tf * idf; if False only tf is used.
#   smooth_idf: True by default, idf = ln((n_docs + 1) / (df + 1)) + 1;
#       if False, idf = ln(n_docs / df) + 1.
#   sublinear_tf: False by default; if True, tf is replaced by 1 + log(tf).
#   stop_words: 'english' uses the built-in English list, a list supplies
#       custom stop words, None disables stop words (with None, a max_df in
#       the range [0.7, 1.0) filters corpus-specific stop words by document
#       frequency instead).
tfidf = TFIDF(min_df=2,
           max_features=None,
           strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\w{1,}',  # one or more word characters
           ngram_range=(1, 3),  # unigrams, bigrams and trigrams
           # Real booleans instead of 1: newer scikit-learn releases validate
           # parameter types and expect bool for these flags.
           use_idf=True,
           smooth_idf=True,
           sublinear_tf=True,
           stop_words='english')  # drop the built-in English stop words

# Vectorize the training and test sets together so both share one vocabulary
# and one set of idf weights.
# NOTE: the idf statistics therefore also see the test reviews (no labels are
# involved in this step).
data_all = train_data + test_data
len_train = len(train_data)

# fit_transform learns the vocabulary and builds the matrix in one pass; it is
# equivalent to fit() followed by transform() on the same data.
data_all = tfidf.fit_transform(data_all)

# Split the matrix back into its training and test parts.
train_x = data_all[:len_train]
test_x = data_all[len_train:]

print('TF-IDF处理结束.')

print("train: \n", np.shape(train_x[0]))
print("test: \n", np.shape(test_x[0]))

TF-IDF处理结束.
train: 
 (1, 810866)
test: 
 (1, 810866)


### 朴素贝叶斯训练

In [18]:
# Multinomial naive Bayes on the TF-IDF features.

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.model_selection import cross_val_score
import numpy as np

model_NB = MNB() # (alpha=1.0, class_prior=None, fit_prior=True)
# Fit on the full training set so the model can be used for prediction later.
model_NB.fit(train_x, label)

# Run the 10-fold cross-validation once and reuse the scores for both the
# per-fold report and the mean, instead of repeating the whole CV per print.
nb_cv_scores = cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc')

print("多项式贝叶斯分类器10折交叉验证得分:  \n", nb_cv_scores)
print("\n多项式贝叶斯分类器10折交叉验证得分: ", np.mean(nb_cv_scores))

多项式贝叶斯分类器10折交叉验证得分:  
 [ 0.95134592  0.94728448  0.951648    0.94707712  0.95122816  0.94939968
  0.95240704  0.95434432  0.94438528  0.94930816]

多项式贝叶斯分类器10折交叉验证得分:  0.949842816


In [19]:
# test_predicted = np.array(model_NB.predict(test_x))
# print('保存结果...')

# submission_df = pd.DataFrame(data ={'id': test['id'], 'sentiment': test_predicted})
# print(submission_df.head(10))
# submission_df.to_csv('/Users/jiangzl/Desktop/submission_br.csv',columns = ['id','sentiment'], index = False)

### 逻辑回归训练

In [20]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV

# # 设定grid search的参数
# grid_values = {'C': [1, 15, 30, 50]}  
# # grid_values = {'C': [30]}
# # 设定打分为roc_auc
# """
# penalty: l1 or l2, 用于指定惩罚中使用的标准。
# """
# model_LR = GridSearchCV(LR(penalty='l2', dual=True, random_state=0), grid_values, scoring='roc_auc', cv=20)
# model_LR.fit(train_x, label)
# # 20折交叉验证
# # GridSearchCV(cv=20, 
# #         estimator=LR(C=1.0, 
# #             class_weight=None, 
# #             dual=True, 
# #             fit_intercept=True, 
# #             intercept_scaling=1, 
# #             penalty='l2', 
# #             random_state=0, 
# #             tol=0.0001),
# #         fit_params={}, 
# #         iid=True,
# #         n_jobs=1,
# #         param_grid={'C': [30]}, 
# #         pre_dispatch='2*n_jobs',
# #         refit=True,
# #         scoring='roc_auc', 
# #         verbose=0)

# # 输出结果
# # print(model_LR.grid_scores_, '\n', model_LR.best_params_, model_LR.best_params_)
# print(model_LR.cv_results_, '\n', model_LR.best_params_, model_LR.best_score_)

In [23]:
# {'mean_fit_time': array([0.77368994, 1.95680232, 2.88316183, 3.50976259]), 'std_fit_time': array([0.05099312, 0.19345662, 0.39457327, 0.50422455]), 'mean_score_time': array([0.00273149, 0.0025926 , 0.00262785, 0.00249476]), 'std_score_time': array([0.0001698 , 0.00014623, 0.00014215, 0.00024111]), 'param_C': masked_array(data=[1, 15, 30, 50],
#              mask=[False, False, False, False],
#        fill_value='?',
#             dtype=object), 'params': [{'C': 1}, {'C': 15}, {'C': 30}, {'C': 50}], \
#  'split0_test_score': array([0.95273728, 0.95990784, 0.960192  , 0.9602816 ]), \
#  'split1_test_score': array([0.96081408, 0.96953856, 0.96975104, 0.96994816]), \
#  'split2_test_score': array([0.9583616 , 0.96794112, 0.96825856, 0.96836352]), \
#  'split3_test_score': array([0.95249152, 0.96079104, 0.96123136, 0.96137984]), \
#  'split4_test_score': array([0.96460288, 0.9721088 , 0.9724672 , 0.97263104]), \
#  'split5_test_score': array([0.95881216, 0.96733184, 0.96779008, 0.96797184]), \
#  'split6_test_score': array([0.95679232, 0.96563968, 0.96596736, 0.96606976]), \
#  'split7_test_score': array([0.95171072, 0.96053248, 0.96105216, 0.96125952]), \
#  'split8_test_score': array([0.95526656, 0.9604096 , 0.96051712, 0.96053248]), \
#  'split9_test_score': array([0.94979328, 0.95777024, 0.95817472, 0.95834368]), \
#  'split10_test_score': array([0.95965952, 0.9672192 , 0.9675264 , 0.96764672]), \
#  'split11_test_score': array([0.95329024, 0.96009472, 0.96019712, 0.96021504]), \
#  'split12_test_score': array([0.96268544, 0.97140224, 0.97184256, 0.97202944]), \
#  'split13_test_score': array([0.9571968 , 0.96615936, 0.9666048 , 0.96676864]), \
#  'split14_test_score': array([0.95916544, 0.96551936, 0.96583168, 0.96596992]), 'split15_test_score': array([0.96279296, 0.96956928, 0.96978176, 0.96979968]), 'split16_test_score': array([0.95332096, 0.96132352, 0.96161792, 0.96173568]), 'split17_test_score': array([0.94883328, 0.9570816 , 0.95749632, 0.95771136]), 'split18_test_score': array([0.9528448 , 0.96074496, 0.96114176, 0.9612672 ]), 'split19_test_score': array([0.96429824, 0.97186048, 0.972032  , 0.97212416]), 'mean_test_score': array([0.9567735 , 0.9646473 , 0.9649737 , 0.96510246]), 'std_test_score': array([0.0046911 , 0.00476416, 0.00475249, 0.00475557]), 'rank_test_score': array([4, 3, 2, 1], dtype=int32), 'split0_train_score': array([0.99254593, 1.        , 1.        , 1.        ]), 'split1_train_score': array([0.99230078, 1.        , 1.        , 1.        ]), 'split2_train_score': array([0.9923811, 1.       , 1.       , 1.       ]), 'split3_train_score': array([0.9924227, 1.       , 1.       , 1.       ]), 'split4_train_score': array([0.9923401, 1.       , 1.       , 1.       ]), 'split5_train_score': array([0.9924475, 1.       , 1.       , 1.       ]), 'split6_train_score': array([0.99238184, 1.        , 1.        , 1.        ]), 'split7_train_score': array([0.99249388, 1.        , 1.        , 1.        ]), 'split8_train_score': array([0.99257082, 1.        , 1.        , 1.        ]), 'split9_train_score': array([0.99253744, 1.        , 1.        , 1.        ]), 'split10_train_score': array([0.99235201, 1.        , 1.        , 1.        ]), 'split11_train_score': array([0.99243953, 1.        , 1.        , 1.        ]), 'split12_train_score': array([0.99236668, 1.        , 1.        , 1.        ]), 'split13_train_score': array([0.99248181, 1.        , 1.        , 1.        ]), 'split14_train_score': array([0.99254685, 1.        , 1.        , 1.        ]), 'split15_train_score': array([0.99240575, 1.        , 1.        , 1.        ]), 'split16_train_score': array([0.99240521, 1.        , 1.        
, 1.        ]), 'split17_train_score': array([0.99248037, 1.        , 1.        , 1.        ]), 'split18_train_score': array([0.99243375, 1.        , 1.        , 1.        ]), 'split19_train_score': array([0.99242053, 1.        , 1.        , 1.        ]), 'mean_train_score': array([0.99243773, 1.        , 1.        , 1.        ]), 'std_train_score': array([7.34564551e-05, 0.00000000e+00, 2.48253415e-17, 2.48253415e-17])} 
#  {'C': 50} 0.965102464

In [24]:
# model_LR = LR(penalty='l2', dual=True, random_state=0)
# model_LR.fit(train_x, label)

# test_predicted = np.array(model_LR.predict(test_x))
# print('保存结果...')
# submission_df = pd.DataFrame(data ={'id': test['id'], 'sentiment': test_predicted})
# print(submission_df.head(10))
# submission_df.to_csv('/Users/jiangzl/Desktop/submission_br.csv',columns = ['id','sentiment'], index = False)
# print('结束.')

## 2.Word2vec

In [25]:
import gensim
import nltk
from nltk.corpus import stopwords

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
# def review_to_wordlist(review, remove_stopwords=False):
#     # review = BeautifulSoup(review, "html.parser").get_text()
#     review_text = re.sub("[^a-zA-Z]"," ", review)

#     words = review_text.lower().split()

#     if remove_stopwords:
#         stops = set(stopwords.words("english"))
#         words = [w for w in words if not w in stops]
#     # print(words)
#     return(words)


# def review_to_sentences(review, tokenizer, remove_stopwords=False):
#     '''
#     1. 将评论文章，按照句子段落来切分(所以会比文章的数量多很多)
#     2. 返回句子列表，每个句子由一堆词组成
#     '''
#     review = BeautifulSoup(review, "html.parser").get_text()
#     # raw_sentences 句子段落集合
#     raw_sentences = tokenizer.tokenize(review)
#     # print(raw_sentences)
    
#     sentences = []
#     for raw_sentence in raw_sentences:
#         if len(raw_sentence) > 0:
#             # 获取句子中的词列表
#             sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
#     return sentences


# sentences = []
# for i, review in enumerate(train["review"]):
#     # print(i, review)
#     sentences += review_to_sentences(review, tokenizer, True)
    
# unlabeled_train = pd.read_csv("%s/%s" % (root_dir, "unlabeledTrainData.tsv"), header=0, delimiter="\t", quoting=3 )
# for review in unlabeled_train["review"]:
#     sentences += review_to_sentences(review, tokenizer)
# print('预处理 unlabeled_train data...')

import time
from gensim.models import Word2Vec
# # 模型参数
# num_features = 300    # Word vector dimensionality                      
# min_word_count = 40   # Minimum word count                        
# num_workers = 4       # Number of threads to run in parallel
# context = 10          # Context window size                                                                                    
# downsampling = 1e-3   # Downsample setting for frequent words

# %%time
# # 训练模型
# print("训练模型中...")
# model = Word2Vec(sentences, workers=num_workers, \
#             size=num_features, min_count=min_word_count, \
#             window=context, sample=downsampling)
# print("训练完成")

# print('保存模型...')
# model.init_sims(replace=True)
# model_name = "%s/%s" % (root_dir, "300features_40minwords_10context")
# model.save(model_name)
# print('保存结束')



In [27]:
# kaggle-nlp already trained this model, so load the saved copy instead of
# retraining. The path is built from root_dir with the same '%s/%s' convention
# as the data files, so the data directory is configured in one place only.
# NOTE(review): gensim model files are pickle-based; only load files you trust.
model = Word2Vec.load('%s/%s' % (root_dir, '300features_40minwords_10context'))

In [28]:
# Sanity check of the loaded model: ask which of the four words does not fit
# with the others.
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [29]:
# Show the 5 entries most similar to "man" (topn=5) with their similarity scores.
model.wv.most_similar("man", topn=5)

[('woman', 0.6180866956710815),
 ('lad', 0.597815215587616),
 ('lady', 0.5883153676986694),
 ('monk', 0.5241137742996216),
 ('person', 0.5198613405227661)]

### 使用W2V特征
由于对段落中的所有词向量进行取平均操作已经在kaggle-nlp中实现过了，在此仅做后续补充

In [30]:
def makeFeatureVec(words, model, num_features):
    '''
    Average the word vectors of all in-vocabulary words of one text.

    Args:
        words: iterable of word tokens. A plain string is split on whitespace
            first, so a space-joined review is averaged over its words rather
            than being iterated character by character.
        model: trained Word2Vec model; lookups go through ``model.wv``.
        num_features: dimensionality of the word vectors.

    Returns:
        float32 array of shape (num_features,) holding the mean vector, or
        all zeros when none of the words is in the vocabulary.
    '''
    if isinstance(words, str):
        words = words.split()

    word_vectors = model.wv
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        # Membership is tested directly on the keyed vectors, which avoids
        # rebuilding a set of the whole vocabulary on every call.
        if word in word_vectors:
            nwords += 1
            featureVec = np.add(featureVec, word_vectors[word])

    # No known word: return the zero vector instead of dividing by zero,
    # which would fill the result with NaN.
    if nwords == 0:
        return featureVec

    # Take the mean.
    return np.divide(featureVec, nwords)


def getAvgFeatureVecs(reviews, model, num_features):
    '''
    Build the matrix of averaged word vectors for a list of texts.

    Each text is passed to makeFeatureVec and its result becomes one row of a
    float32 array of shape (len(reviews), num_features). A progress line is
    printed every 5000 texts.
    '''
    total = len(reviews)
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for row, review in enumerate(reviews):
        if row % 5000 == 0:
            print("Review %d of %d" % (row, total))
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

In [31]:
# %time trainDataVecs = getAvgFeatureVecs(train_data, model, num_features)
# print(np.shape(trainDataVecs))

In [32]:
# %time testDataVecs = getAvgFeatureVecs(test_data, model, num_features)
# print(np.shape(testDataVecs))

### 高斯贝叶斯+Word2vec训练

In [33]:
# from sklearn.naive_bayes import GaussianNB as GNB

# model_GNB = GNB()
# model_GNB.fit(trainDataVecs, label)

# from sklearn.model_selection import cross_val_score
# import numpy as np

# print("高斯贝叶斯分类器10折交叉验证得分: ", np.mean(cross_val_score(model_GNB, trainDataVecs, label, cv=10, scoring='roc_auc')))

# print('保存结果...')
# result = model_GNB.predict( testDataVecs )
# submission_df = pd.DataFrame(data ={'id': test['id'], 'sentiment': result})
# print(submission_df.head(10))
# submission_df.to_csv('/Users/jiangzl/Desktop/gnb_word2vec.csv',columns = ['id','sentiment'], index = False)
# print('结束.')

# """
# 从验证结果来看，没有超过基于TF-IDF多项式贝叶斯模型
# """

# 高斯贝叶斯分类器10折交叉验证得分:  0.6163932159999999

### 随机森林+Word2vec训练
此部分已在kaggle-nlp中出现过

In [35]:
# from sklearn.ensemble import RandomForestClassifier

# forest = RandomForestClassifier( n_estimators = 100, n_jobs=2)

# print("Fitting a random forest to labeled training data...")
# %time forest = forest.fit( trainDataVecs, label )
# print("随机森林分类器10折交叉验证得分: ", np.mean(cross_val_score(forest, trainDataVecs, label, cv=10, scoring='roc_auc')))

# # 测试集
# result = forest.predict( testDataVecs )

# print('保存结果...')
# submission_df = pd.DataFrame(data ={'id': test['id'], 'sentiment': result})
# print(submission_df.head(10))
# submission_df.to_csv('/Users/jiangzl/Desktop/rf_word2vec.csv',columns = ['id','sentiment'], index = False)
# print('结束.')

# """
# 改用随机森林之后，效果有提升，但是依然没有超过基于TF-IDF多项式贝叶斯模型
# """
# 随机森林分类器10折交叉验证得分:  0.6428176640000001