In [82]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing, metrics
import xgboost as xgb
import os
import time
from gensim import corpora
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D
from keras.layers import LSTM

color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500)

In [3]:
# constants
DATA_PATH = '/kaggle/dev/sentiment-analysis-on-movie-reviews-data'
TRAIN_PATH = os.path.join(DATA_PATH, 'train.tsv')
TEST_PATH = os.path.join(DATA_PATH, 'test.tsv')
SAMPLE_PATH = os.path.join(DATA_PATH, 'sampleSubmission.csv')

In [4]:
train_df = pd.read_csv(TRAIN_PATH, sep='\t')
test_df = pd.read_csv(TEST_PATH, sep='\t')
sample_sub_df = pd.read_csv(SAMPLE_PATH)

print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)
print('sample_sub_df.shape', sample_sub_df.shape)

train_df.shape (156060, 4)
test_df.shape (66292, 3)
sample_sub_df.shape (66292, 2)


In [5]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [8]:
sample_sub_df.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [12]:
phrases_train = train_df.Phrase.values
phrases_test = test_df.Phrase.values
sentiments_train = train_df.Sentiment.values
num_labels = len(np.unique(sentiments_train))

In [37]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stemmer = SnowballStemmer('english')

In [40]:
processed_phrases_train = []
for phrase in phrases_train:
    tokens = word_tokenize(phrase)
    words = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in words]
    processed_phrases_train.append(stemmed)

In [42]:
processed_phrases_test = []
for phrase in phrases_test:
    tokens = word_tokenize(phrase)
    words = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in words]
    processed_phrases_test.append(stemmed)

In [44]:
processed_phrases_all = np.concatenate((processed_phrases_train, processed_phrases_test), axis=0)
len(processed_phrases_all)

222352

In [52]:
processed_phrases_all

array([ list(['a', 'seri', 'escapad', 'demonstr', 'adag', 'good', 'goos', 'also', 'good', 'gander', 'occasion', 'amus', 'none', 'amount', 'much', 'stori']),
       list(['a', 'seri', 'escapad', 'demonstr', 'adag', 'good', 'goos']),
       list(['a', 'seri']), ..., list(['a', 'long-wind']),
       list(['a', 'long-wind']), list(['predict', 'scenario'])], dtype=object)

In [46]:
dictionary = corpora.Dictionary(processed_phrases_all)
dictionary_size = len(dictionary.keys())
print("dictionary size: ", dictionary_size)

dictionary size:  13759


In [53]:
print(dictionary.token2id('smokey'))

TypeError: 'dict' object is not callable

In [55]:
word_id_train, word_id_len = [], []
for doc in processed_phrases_train:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_train.append(word_ids)
    word_id_len.append(len(word_ids))

In [64]:
word_id_test = []
for doc in processed_phrases_test:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_test.append(word_ids)
    word_id_len.append(len(word_ids))

In [66]:
seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)

In [72]:
word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
y_train_enc = np_utils.to_categorical(sentiments_train, num_labels)

In [86]:
# LSTM
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


  """


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 128)         1761152   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, None, 128)         0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 645       
_________________________________________________________________
activation_6 (Activation)    (None, 5)                 0         
Total params: 1,893,381
Trainable params: 1,893,381
Non-trainable params: 0
_________________________________________________________________


In [87]:
model.fit(word_id_train, y_train_enc, epochs=1, batch_size=256, verbose=1)



Epoch 1/1


<keras.callbacks.History at 0x7f1815f25a58>

In [88]:
test_pred = model.predict_classes(word_id_test)
test_pred



array([3, 3, 2, ..., 1, 1, 1])

In [89]:
test_df['Sentiment'] = test_pred.reshape(-1,1) 
header = ['PhraseId', 'Sentiment']
test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
