In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
# Importing the dataset
df = pd.read_csv('clean_dataset.csv')
df.rename(columns = {'text_clean':'text'}, inplace = True)

Wall time: 1.94 s


In [3]:
df.head()

Unnamed: 0,sentiment,text
0,0,aw buyer got david car third day
1,0,upset update facebook texting might cry result...
2,0,dived many time bal managed save rest go bound
3,0,whole body feel itch like fire
4,0,behaving al mad see


In [14]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [10]:
%%time
train, test = train_test_split(df, test_size=0.2, random_state=10)

Wall time: 714 ms


In [11]:
%%time
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train.text)
vocab_size = len(tokenizer.word_index) + 1

Wall time: 24.8 s


In [15]:
%%time
sequences_train = tokenizer.texts_to_sequences(train.text) 
sequences_test = tokenizer.texts_to_sequences(test.text) 

X_train = pad_sequences(sequences_train, padding='post', maxlen=100)
X_test = pad_sequences(sequences_test, padding='post', maxlen=100)

y_train = train.sentiment.values
y_test = test.sentiment.values

Wall time: 33.8 s


Word embeddings using GloVe
Word embeddings provide a dense representation of words and their relative meanings. Embedding Matrix is a maxtrix of all words and their corresponding embeddings. Embedding matrix is used in embedding layer in model to embedded a token into it's vector representation, that contains information regarding that token or word.

Embedding vocabulary is taken from the tokenizer and the corresponding vectors from embedding model, which in this case is GloVe model. GloVe stand for Global Vectors for Word Representation and it is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

Below was used pretrained GloVe embeddings from world known Stanford vector files. The smallest available file contains embeddings created for tiny 6 billions of tokens.

In [17]:
%%time
#load glove 
embeddings_dictionary = dict()
glove_file = open('glove6b100d.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

Wall time: 17.8 s


In [18]:
embeddings_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout, Flatten

In [21]:
embedding_layer = Embedding(vocab_size, 100, input_length=50, weights=[embeddings_matrix], trainable=False)

In [None]:
%%time
#model = Sequential([
#        embedding_layer,
#        Bidirectional(LSTM(128, return_sequences=True)),
#        Dropout(0.4),
#        Bidirectional(LSTM(128)),
#       Dropout(0.4),
#        Dense(64, activation='relu'),
#        Dense(1, activation='sigmoid'),
#    ])
#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
#model.summary()

In [31]:
%%time
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           23475400  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 23,592,777
Trainable params: 117,377
Non-trainable params: 23,475,400
_________________________________________________________________
Wall time: 749 ms


In [32]:
history = model.fit(X_train, y_train, batch_size=1000, epochs=10, validation_data=(X_test, y_test), verbose=2)

Epoch 1/10


KeyboardInterrupt: 

In [None]:
score = model.evaluate(X_test, y_test, verbose=1)

In [None]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
import matplotlib.pyplot as plt

In [None]:
#History for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc = 'upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc = 'upper left')
plt.show()