In [1]:
import pandas as pd

# Load the two tab-separated IMDB files (`sep` is the primary name of the
# `delimiter` keyword, so the parse is unchanged).
# NOTE(review): the preview below shows a review beginning with a stray
# backslash, which suggests the default quote handling is consuming quote
# characters; consider quoting=3 (csv.QUOTE_NONE) -- confirm against the raw file.
train = pd.read_csv('Datasets/IMDB/labeledTrainData.tsv', sep='\t')
test = pd.read_csv('Datasets/IMDB/testData.tsv', sep='\t')

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# BeautifulSoup is used to process the raw text
from bs4 import BeautifulSoup
import re
# Import the stop-word list from nltk.corpus
from nltk.corpus import stopwords

# Lazily-built cache of the English stop words, shared by every call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def review_to_text(review, remove_stopwords):
    """Convert one raw review into a list of lower-case word tokens.

    Parameters
    ----------
    review
        Raw review text; it is parsed with BeautifulSoup's 'lxml' parser and
        only the text content is kept.
    remove_stopwords
        When truthy, tokens found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The lower-cased, whitespace-split tokens, in their original order.
    """
    # Strip HTML markup.
    raw_text = BeautifulSoup(review, 'lxml').get_text()
    # Replace every non-letter character with a space.
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    # Drop stop words. The set is fetched from the cache instead of being
    # re-read from the corpus for every single review.
    if remove_stopwords:
        stop_words = _english_stop_words()
        words = [w for w in words if w not in stop_words]

    return words

In [10]:
# Data preprocessing: clean each review (stop words removed) and glue the
# surviving tokens back together into one space-separated string.
X_train = [' '.join(review_to_text(text, True)) for text in train['review']]
X_test = [' '.join(review_to_text(text, True)) for text in test['review']]

# Target labels for the training reviews.
y_train = train['sentiment']

In [16]:
# Text feature extractors
# Naive Bayes model
# Pipeline chains the processing steps together
# GridSearchCV runs a grid search over hyper-parameter combinations
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer sklearn.model_selection and fall back only on older installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Two candidate pipelines: raw term counts vs. TF-IDF weights, each feeding a
# multinomial naive Bayes classifier.
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')),
                     ('mnb', MultinomialNB())])
# NOTE(review): "pip_tfodf" looks like a typo for "pip_tfidf"; the name is kept
# unchanged because other cells may refer to it.
pip_tfodf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')),
                     ('mnb', MultinomialNB())])

# Grid keys follow the "<step name>__<parameter>" convention, so the prefix
# must match the step name used in the corresponding pipeline.
params_count = {'count_vec__binary':[True, False], 'count_vec__ngram_range':[(1, 1), (1, 2)],
                'mnb__alpha':[0.1, 1.0, 10.0]}
# The TF-IDF pipeline has no 'count_vec' step; its vectorizer step is
# 'tfidf_vec', so the binary switch must be addressed through that prefix.
params_tfidf = {'tfidf_vec__binary':[True, False], 'tfidf_vec__ngram_range':[(1, 1), (1, 2)],
                'mnb__alpha':[0.1, 1.0, 10.0]}

In [None]:
# Grid search over the count-vectorizer pipeline: 4-fold cross-validation,
# all available cores (n_jobs=-1), progress messages enabled.
gs_count = GridSearchCV(
    estimator=pip_count,
    param_grid=params_count,
    cv=4,
    n_jobs=-1,
    verbose=1,
)
gs_count.fit(X_train, y_train)

# Report the best cross-validated score and the parameter set that produced it.
print(gs_count.best_score_)
print(gs_count.best_params_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits
