In [36]:
# https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/discussion?sortBy=top&group=all&page=1&pageSize=20&category=all&kind=all
# https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/discussion/26446
# Bag-of-words model

In [37]:
import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM
from keras.layers import SpatialDropout1D

# Seed NumPy's global random number generator.
# NOTE(review): this only seeds NumPy; confirm whether the Keras backend also
# needs seeding for the training run to be reproducible.
np.random.seed(0)


In [38]:
#load data
# Both splits are tab-separated files whose first row holds the column names.
train_df, test_df = (
    pd.read_csv(path, sep='\t', header=0) for path in ('train.tsv', 'test.tsv')
)

In [39]:
# Raw phrase text for both splits, plus the training labels.
raw_docs_train, raw_docs_test = (
    frame['Phrase'].values for frame in (train_df, test_df)
)
sentiment_train = train_df['Sentiment'].values

# Number of distinct sentiment values found in the training labels.
num_labels = np.unique(sentiment_train).size
print(num_labels)  # total number of labels

5


In [40]:
#text pre-processing
# NLTK's English stop-word list, held as a set for fast membership tests.
# NOTE(review): the list contains the negations 'no', 'nor' and 'not'; removing
# them may discard sentiment signal -- confirm this is intended.
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [41]:
# Also treat common punctuation tokens as stop words so they are filtered out.
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stop_words # add some extra stop words

{'"',
 "'",
 '(',
 ')',
 ',',
 '.',
 ':',
 ';',
 '[',
 ']',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',

In [42]:
stemmer = SnowballStemmer('english')  # stemming
print("pre-processing train docs...")
# For every training phrase: tokenize, drop stop words, then stem what is left.
# NOTE(review): the stop-word test runs on tokens in their original case while the
# stop-word entries are lowercase, so capitalized stop words (e.g. 'The') pass the
# filter and are only lowercased afterwards by the stemmer -- confirm this is intended.
processed_docs_train = [
    [stemmer.stem(token) for token in word_tokenize(phrase) if token not in stop_words]
    for phrase in raw_docs_train
]

pre-processing train docs...


In [43]:
print("pre_processing test docs...")
# Same pipeline as for the training phrases: tokenize, drop stop words, stem.
# NOTE(review): the stop-word test is case-sensitive (tokens are not lowercased
# before the membership check) -- confirm this is intended.
processed_docs_test = [
    [stemmer.stem(token) for token in word_tokenize(phrase) if token not in stop_words]
    for phrase in raw_docs_test
]

pre_processing test docs...


In [44]:
# Sanity check: number of preprocessed training documents.
len(processed_docs_train)

156060

In [45]:
# Sanity check: number of preprocessed test documents.
len(processed_docs_test)

66292

In [46]:
# Join the two splits with plain list concatenation (train first, then test).
# np.concatenate would first try to turn the variable-length token lists into an
# array, which newer NumPy versions reject as a ragged array. A list keeps every
# document unchanged and is all that len(), corpora.Dictionary and doc2bow need.
processed_docs_all = processed_docs_train + processed_docs_test

In [47]:
# Sanity check: should equal the train count plus the test count.
len(processed_docs_all)

222352

In [48]:
# Build the gensim vocabulary over the train and test documents together.
dictionary=corpora.Dictionary(processed_docs_all) # assigns an id to every word that appears in the corpus

In [49]:
# Inspect the type of the vocabulary object.
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [50]:
dictionary[0] # a gensim Dictionary can be indexed by id to get the word back

'a'

In [51]:
# Show the id -> token mapping (prints the whole mapping, so the output is very long).
print(dictionary.id2token) 

{0: 'a', 1: 'seri', 2: 'escapad', 3: 'demonstr', 4: 'adag', 5: 'good', 6: 'goos', 7: 'also', 8: 'gander', 9: 'occasion', 10: 'amus', 11: 'none', 12: 'amount', 13: 'much', 14: 'stori', 15: 'this', 16: 'quiet', 17: 'introspect', 18: 'entertain', 19: 'independ', 20: 'worth', 21: 'seek', 22: 'even', 23: 'fan', 24: 'ismail', 25: 'merchant', 26: "'s", 27: 'work', 28: 'i', 29: 'suspect', 30: 'would', 31: 'hard', 32: 'time', 33: 'sit', 34: 'one', 35: 'posit', 36: 'thrill', 37: 'combin', 38: 'ethnographi', 39: 'intrigu', 40: 'betray', 41: 'deceit', 42: 'murder', 43: 'shakespearean', 44: 'tragedi', 45: 'juici', 46: 'soap', 47: 'opera', 48: 'aggress', 49: 'self-glorif', 50: 'manipul', 51: 'whitewash', 52: 'comedy-drama', 53: 'near', 54: 'epic', 55: 'proport', 56: 'root', 57: 'sincer', 58: 'perform', 59: 'titl', 60: 'charact', 61: 'undergo', 62: 'midlif', 63: 'crisi', 64: 'narrat', 65: 'troubl', 66: 'everi', 67: 'day', 68: 'plod', 69: 'mess', 70: 'the', 71: 'import', 72: 'be', 73: 'earnest', 74: '

In [52]:
# Show the token -> id mapping (prints the whole mapping, so the output is very long).
print(dictionary.token2id) 

{'a': 0, 'seri': 1, 'escapad': 2, 'demonstr': 3, 'adag': 4, 'good': 5, 'goos': 6, 'also': 7, 'gander': 8, 'occasion': 9, 'amus': 10, 'none': 11, 'amount': 12, 'much': 13, 'stori': 14, 'this': 15, 'quiet': 16, 'introspect': 17, 'entertain': 18, 'independ': 19, 'worth': 20, 'seek': 21, 'even': 22, 'fan': 23, 'ismail': 24, 'merchant': 25, "'s": 26, 'work': 27, 'i': 28, 'suspect': 29, 'would': 30, 'hard': 31, 'time': 32, 'sit': 33, 'one': 34, 'posit': 35, 'thrill': 36, 'combin': 37, 'ethnographi': 38, 'intrigu': 39, 'betray': 40, 'deceit': 41, 'murder': 42, 'shakespearean': 43, 'tragedi': 44, 'juici': 45, 'soap': 46, 'opera': 47, 'aggress': 48, 'self-glorif': 49, 'manipul': 50, 'whitewash': 51, 'comedy-drama': 52, 'near': 53, 'epic': 54, 'proport': 55, 'root': 56, 'sincer': 57, 'perform': 58, 'titl': 59, 'charact': 60, 'undergo': 61, 'midlif': 62, 'crisi': 63, 'narrat': 64, 'troubl': 65, 'everi': 66, 'day': 67, 'plod': 68, 'mess': 69, 'the': 70, 'import': 71, 'be': 72, 'earnest': 73, 'thic

In [53]:
# Vocabulary size: how many distinct tokens the dictionary holds.
dictionary_size = len(dictionary)
print("dictionary size: ", dictionary_size)

dictionary size:  13773


In [54]:
# Persist the vocabulary to disk, then convert every document (train and test)
# to its bag-of-words representation.
dictionary.save('dictionary.dict')
corpus = list(map(dictionary.doc2bow, processed_docs_all))

In [55]:
# Sanity check: one bag-of-words entry per document.
len(corpus)

222352

In [56]:
print("converting to token ids...")
token_to_id = dictionary.token2id

# Replace every token by its integer id, document by document.
word_id_train = [[token_to_id[word] for word in doc] for doc in processed_docs_train]
word_id_test = [[token_to_id[word] for word in doc] for doc in processed_docs_test]

# Document lengths over both splits (train first, then test).
word_id_len = [len(ids) for ids in word_id_train + word_id_test]

# Sequence length = mean number of tokens per document + 2 standard deviations,
# rounded to the nearest integer.
seq_len = np.round(np.mean(word_id_len) + 2 * np.std(word_id_len)).astype(int)

converting to token ids...


In [57]:
# Longest document, in tokens, across train and test.
max(word_id_len)

56

In [58]:
# Sanity check: one length entry per document.
len(word_id_len)

222352

In [59]:
# Mean number of tokens per document.
np.mean(word_id_len)

4.9806612938044186

In [60]:
# Two standard deviations of the document length (the margin added to the mean for seq_len).
2*np.std(word_id_len)

10.069872502773944

In [61]:
# Fixed sequence length used for padding/truncation.
seq_len

15

In [62]:
# Pad sequences: give every train/test sequence exactly seq_len entries. Shorter
# sequences are zero-padded and longer ones truncated, both at the front by default.
# The Python lists are passed straight to pad_sequences: wrapping variable-length
# lists in np.array() is unnecessary and fails on NumPy versions that refuse to
# build ragged arrays.
# NOTE(review): 0 is both the pad value and a real token id (dictionary[0]), so
# padding and that token look the same to the model -- consider reserving id 0
# for padding; confirm before changing the id scheme.
word_id_train = sequence.pad_sequences(word_id_train, maxlen=seq_len)
word_id_test = sequence.pad_sequences(word_id_test, maxlen=seq_len)

# One-hot encode the integer sentiment labels for the categorical cross-entropy loss.
y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)

In [63]:
# Shape of the padded training matrix: (number of documents, seq_len).
word_id_train.shape

(156060, 15)

In [64]:
# Shape of the padded test matrix: (number of documents, seq_len).
word_id_test.shape

(66292, 15)

In [65]:
# Look at one padded training row.
word_id_train[4]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

In [66]:
#LSTM
print('fitting lstm...')
model = Sequential()
# The embedding layer's weight matrix is dictionary_size x 210, i.e. one
# 210-dimensional vector per token id.
model.add(Embedding(dictionary_size, 210))
# Embedding no longer accepts a `dropout` argument in Keras 2; a SpatialDropout1D
# layer right after the embedding is the replacement Keras recommends.
model.add(SpatialDropout1D(0.2))
# `dropout` / `recurrent_dropout` are the Keras 2 names for the former
# `dropout_W` / `dropout_U` arguments (input and recurrent connections).
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.5))
# Output layer: one unit per sentiment class, turned into probabilities by softmax.
model.add(Dense(num_labels))
model.add(Activation('softmax'))

fitting lstm...


  after removing the cwd from sys.path.
  """


In [67]:
# Multi-class setup: categorical cross-entropy on the one-hot labels, Adam optimizer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for 3 epochs and hold out 20% of the training data for validation.
# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
model.fit(word_id_train, y_train_enc, epochs=3, batch_size=256, verbose=1, validation_split=0.2)



Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fee7b8def28>

In [68]:
# Predicted class id per test phrase = index of the largest softmax probability.
# Sequential.predict_classes is not available in current Keras releases; taking
# the argmax over predict() gives the same class ids for a softmax output.
test_pred = np.argmax(model.predict(word_id_test), axis=-1)



In [69]:
# Store the predicted class ids in the Sentiment column, reshaped to a single column.
test_df['Sentiment'] = np.reshape(test_pred, (-1, 1))

In [70]:
#make a submission
# Write only the id and predicted-label columns, with a header row and no index column.
header = ['PhraseId', 'Sentiment']
test_df[header].to_csv('./lstm_sentiment.csv', index=False, header=True)