+ Use Google's Word2Vec for movie reviews（电影评论）

https://www.kaggle.com/c/word2vec-nlp-tutorial

In [1]:
import pandas as pd

train = pd.read_csv('../input/labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv('../input/testData.tsv', delimiter='\t')

In [2]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
from bs4 import BeautifulSoup
import re

from nltk.corpus import stopwords


def review_to_text(review, remove_stopwords):
    raw_text = BeautifulSoup(review, 'lxml').get_text()
    # 去掉非字母的字符，这些特殊的符号，应该被替换成空格
    letters = re.sub('[^a-zA-z]', ' ', raw_text)
    words = letters.lower().split()
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return words

In [4]:
X_train = []

for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True)))

X_test = []

for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))

y_train = train['sentiment']

In [6]:
len(X_train)

25000

In [7]:
X_train[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

## 分别使用 CountVectorizer 和 TfidfVectorizer 对文本特征进行抽取

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

pipeline_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')),
                           ('mnb', MultinomialNB())])

In [11]:
params_count = {
    'count_vec__binary': [True, False],
    'count_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10.0]
}

In [14]:
from sklearn.model_selection import GridSearchCV

gs_count = GridSearchCV(
    pipeline_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_count.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.5min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...nizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'count_vec__binary': [True, False], 'count_vec__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [15]:
# 输出交叉验证中最佳的准确性得分以及超参数组合
print(gs_count.best_score_)
print(gs_count.best_params_)

0.88204
{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}


In [16]:
count_y_predict = gs_count.predict(X_test)
count_y_predict

array([1, 0, 1, ..., 0, 1, 0])

In [17]:
# 输出
count_df = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})
count_df.to_csv('../output/submission_count.csv', index=None)

---

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')),
                           ('mnb', MultinomialNB())])

In [19]:
# 分别用于配置用于模型超参数搜索的组合

params_tfidf = {
    'tfidf_vec__binary': [True, False],
    'tfidf_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10.0]
}

In [20]:
gs_tfidf = GridSearchCV(
    pipeline_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)
gs_tfidf.fit(X_train, y_train)
tfidf_y_predict = gs_tfidf.predict(X_test)
tfidf_y_predict

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.5min finished


array([1, 0, 1, ..., 0, 1, 0])

In [21]:
tfidf_df = pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})
tfidf_df.to_csv('../output/submission_tfidf.csv', index=None)

# 从本地导入未标记的数据



In [22]:
unlabeled_train = pd.read_csv('../input/unlabeledTrainData.tsv',delimiter='\t',quoting=3)
unlabeled_train

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."
5,"""36495_0""","""Jennifer Ehle was sparkling in \""Pride and Pr..."
6,"""49472_0""","""Amy Poehler is a terrific comedian on Saturda..."
7,"""36693_0""","""A plane carrying employees of a large biotech..."
8,"""316_0""","""A well made, gritty science fiction movie, it..."
9,"""32454_0""","""Incredibly dumb and utterly predictable story..."
