The dataset consists of tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId. Each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.

train.tsv contains the phrases and their associated sentiment labels. 
test.tsv contains just phrases. 


The sentiment labels are:
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import textblob

In [None]:
# Read the train and test splits into DataFrames; both files are
# tab-separated, hence sep="\t". The shapes are printed as a sanity check.
print("Loading data...")
train = pd.read_csv("../input/movie-review-sentiment-analysis-kernels-only/train.tsv", sep="\t")
print("Train shape:", train.shape)
test = pd.read_csv("../input/movie-review-sentiment-analysis-kernels-only/test.tsv", sep="\t")
print("Test shape:", test.shape)


In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# One-hot encoder for the sentiment labels. It is reused later to build the
# training targets, so it has to return a dense array.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in newer scikit-learn releases; try the new spelling first
# and fall back to the old one so the cell runs on either.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
enc.fit(train["Sentiment"].values.reshape(-1, 1))

# `enc.n_values_` is not available in current scikit-learn, so count the
# distinct labels directly from the column instead.
print("Number of classes:", train["Sentiment"].nunique())
# Relative frequency of each label in the training set.
print("Class distribution:\n{}".format(train["Sentiment"].value_counts()/train.shape[0]))

In [None]:
# Bar chart of how many training phrases carry each sentiment label.
train["Sentiment"].value_counts().plot.bar()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit an independent bag-of-words vocabulary on each split so the two
# vocabularies can be compared (fit() returns the fitted vectorizer).
train_cv = CountVectorizer().fit(train["Phrase"])
test_cv = CountVectorizer().fit(test["Phrase"])

print("Train Set Vocabulary Size:", len(train_cv.vocabulary_))
print("Test Set Vocabulary Size:", len(test_cv.vocabulary_))
# Dict key views support set operations, so `&` yields the shared tokens.
print(
    "Number of Words that occur in both:",
    len(train_cv.vocabulary_.keys() & test_cv.vocabulary_.keys()),
)

**Add Numerical Features**

In [None]:
def add_num_feature_to_df(df):
    """Add hand-crafted surface features to a phrase DataFrame.

    The frame is modified in place (new columns are added and ``Phrase`` is
    lower-cased) and the same object is returned for convenience.

    Args:
        df: DataFrame with a string ``Phrase`` column and a ``SentenceId``
            column.

    Returns:
        ``df`` itself, with these columns added:

        * ``phrase_count``   - number of phrases sharing the row's SentenceId
        * ``word_count``     - number of whitespace-separated tokens
        * ``has_upper``      - phrase contains a character changed by lower()
        * ``sentence_end``   - phrase ends with "."
        * ``after_comma``    - phrase starts with ","
        * ``sentence_start`` - first character is an ASCII capital A-Z
    """
    phrases = df["Phrase"]

    df["phrase_count"] = df.groupby("SentenceId")["Phrase"].transform("count")
    df["word_count"] = phrases.apply(lambda x: len(x.split()))
    # All case-based features must be computed before the phrase is lower-cased.
    df["has_upper"] = phrases.apply(lambda x: x.lower() != x)
    df["sentence_end"] = phrases.apply(lambda x: x.endswith("."))
    df["after_comma"] = phrases.apply(lambda x: x.startswith(","))
    # Slice instead of indexing: an empty phrase then yields False rather
    # than raising IndexError.
    df["sentence_start"] = phrases.apply(lambda x: "A" <= x[:1] <= "Z")
    df["Phrase"] = phrases.apply(lambda x: x.lower())
    return df

# Compute the surface features for both splits (this also lower-cases the
# Phrase column of each frame).
train = add_num_feature_to_df(train)
test = add_num_feature_to_df(test)

# Columns fed to the model as the dense (non-text) input.
dense_features = ["phrase_count", "word_count", "has_upper", "after_comma", "sentence_start", "sentence_end"]

# Per-sentiment mean of every dense feature; for the boolean columns this is
# the fraction of phrases in that class for which the flag is set.
train.groupby("Sentiment")[dense_features].mean()

In [None]:
# Preview the training frame again, now including the added feature columns.
train.head()

**Transfer Learning Using GloVe Embeddings**

In [None]:
# Pre-trained GloVe vectors. EMBEDDING_DIM must match the vector length
# stored in the file (the "100d" in the file name).
EMBEDDING_FILE = "../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt"
EMBEDDING_DIM = 100

# Every token found in either split's CountVectorizer vocabulary; used to
# keep only the GloVe entries that the corpus actually needs.
all_words = set(train_cv.vocabulary_.keys()).union(set(test_cv.vocabulary_.keys()))

def get_embedding(embedding_file=None, embedding_dim=None, vocabulary=None):
    """Load pre-trained word vectors for the words in a vocabulary.

    Each line of the file is expected to hold a word followed by
    ``embedding_dim`` whitespace-separated numbers. Lines with any other
    number of fields (including blank lines) are skipped, as are words that
    are not in ``vocabulary``.

    Args:
        embedding_file: Path to the vectors file. Defaults to the
            module-level ``EMBEDDING_FILE``.
        embedding_dim: Expected vector length. Defaults to the module-level
            ``EMBEDDING_DIM``.
        vocabulary: Container of words to keep. Defaults to the module-level
            ``all_words``.

    Returns:
        dict mapping each retained word to a float32 NumPy vector.
    """
    # Resolve defaults at call time so the globals may be (re)defined after
    # this function is created, as happens when notebook cells are re-run.
    if embedding_file is None:
        embedding_file = EMBEDDING_FILE
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM
    if vocabulary is None:
        vocabulary = all_words

    embeddings_index = {}
    # Explicit UTF-8: the vocabulary contains non-ASCII tokens and the
    # platform default encoding is not guaranteed to be UTF-8. The context
    # manager closes the file even if parsing raises.
    with open(embedding_file, encoding="utf-8") as emb_f:
        for line in emb_f:
            values = line.split()
            # Check the field count first so a blank or malformed line is
            # skipped instead of failing on values[0].
            if len(values) != embedding_dim + 1:
                continue
            word = values[0]
            if word in vocabulary:
                embeddings_index[word] = np.asarray(values[1:], dtype="float32")
    return embeddings_index

# Load the GloVe vectors, restricted to words that occur in the corpus.
embeddings_index = get_embedding()
# Report how many corpus words have no pre-trained vector.
print("Number of words that don't exist in GLOVE:", len(all_words - set(embeddings_index)))

**Prepare the sequences for LSTM**

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Length every token sequence is padded/truncated to before entering the LSTM.
MAX_SEQUENCE_LENGTH = 70

# Fit the tokenizer on train and test phrases together so that every word in
# either split receives an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.append(train["Phrase"].values, test["Phrase"].values))
word_index = tokenizer.word_index

# +1 because row 0 is kept for the padding index; word indices start at 1.
nb_words = len(word_index) + 1
# Random initialisation so that words without a GloVe vector still get a
# (trainable) starting point. Two extra columns hold TextBlob sentiment.
embedding_matrix = np.random.rand(nb_words, EMBEDDING_DIM + 2)
# Row 0 is what padded positions look up. It must be all zeros, otherwise the
# model's Masking layer (which masks timesteps whose features all equal 0)
# never recognises padding and the LSTM consumes random vectors for it.
embedding_matrix[0] = 0.0

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Word-level polarity and subjectivity, appended as two extra features.
    sent = textblob.TextBlob(word).sentiment
    if embedding_vector is not None:
        embedding_matrix[i] = np.append(embedding_vector, [sent.polarity, sent.subjectivity])
    else:
        # No GloVe vector: keep the random first EMBEDDING_DIM values and
        # only overwrite the two sentiment columns.
        embedding_matrix[i, -2:] = [sent.polarity, sent.subjectivity]

**Define the Model**

In [None]:
from keras.layers import *
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

def build_model():
    """Build the two-input sentiment classifier (not yet compiled).

    Inputs:
        * an int32 sequence of MAX_SEQUENCE_LENGTH token ids, and
        * a vector of len(dense_features) hand-crafted features.

    The token ids go through a trainable embedding initialised from
    ``embedding_matrix``, spatial dropout, a masking layer and an LSTM. The
    dense features are batch-normalised. Both representations are
    concatenated and passed through two ReLU layers to a 5-way softmax.

    Returns:
        keras.models.Model taking ``[sequence, dense]`` inputs.
    """
    # Layers of the text branch.
    token_embedding = Embedding(
        nb_words,
        EMBEDDING_DIM + 2,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=True,
    )
    channel_dropout = SpatialDropout1D(0.25)
    # NOTE(review): presumably meant to skip padded timesteps; this relies on
    # padded positions embedding to an all-zero vector - confirm.
    padding_mask = Masking()
    encoder = LSTM(200)

    seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    dense_input = Input(shape=(len(dense_features),))

    # Hand-crafted features live on very different scales, so normalise them.
    dense_vector = BatchNormalization()(dense_input)

    # Text branch: ids -> embeddings -> dropout -> mask -> LSTM state.
    embedded = token_embedding(seq_input)
    embedded = channel_dropout(embedded)
    embedded = padding_mask(embedded)
    phrase_vector = encoder(embedded)

    # Joint classifier head over both representations.
    hidden = concatenate([phrase_vector, dense_vector])
    for units in (150, 50):
        hidden = Dense(units, activation="relu")(hidden)

    return Model(
        inputs=[seq_input, dense_input],
        outputs=Dense(5, activation="softmax")(hidden),
    )

**Train the Model:**

In [None]:
# Convert each phrase to its sequence of tokenizer word ids, then pad or
# truncate every sequence to exactly MAX_SEQUENCE_LENGTH entries.
train_seq = pad_sequences(tokenizer.texts_to_sequences(train["Phrase"]), maxlen=MAX_SEQUENCE_LENGTH)
test_seq = pad_sequences(tokenizer.texts_to_sequences(test["Phrase"]), maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# The dense feature columns mix integer counts and booleans. Cast them to a
# single float dtype so Keras receives a numeric array rather than an
# object-dtype one (which newer TensorFlow-backed Keras cannot convert).
train_dense = train[dense_features].astype("float32")
# One-hot targets, matching the 5-way softmax / categorical cross-entropy.
y_train = enc.transform(train["Sentiment"].values.reshape(-1, 1))

print("Building the model...")
model = build_model()
model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["acc"])

# Stop once validation accuracy has failed to improve for 2 epochs, and save
# the model to disk whenever validation accuracy reaches a new best.
early_stopping = EarlyStopping(monitor="val_acc", patience=2, verbose=1)
model_save_path = "./model.hdf5"
model_checkpoint = ModelCheckpoint(model_save_path, monitor='val_acc', save_best_only=True, mode='max', verbose=1)

print("Training the model...")
# 15% of the training data is held out for validation.
model.fit([train_seq, train_dense], y_train, validation_split=0.15,
          epochs=15, batch_size=512, shuffle=True, callbacks=[early_stopping, model_checkpoint], verbose=1)