In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import tensorflow as tf
#from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [2]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map
#Obtaining glove vectors for each word
word_to_index,words, word_to_vec_map = read_glove_vecs(r"/home/shravan/Intern_Manav/python-env/Tweet_classification/Glove/glove.twitter.27B.200d.txt")

In [3]:
def read_csv(filename = 'data/emojify_data.csv'): # Reads any csv dataset into the required X,Y
    phrase = []
    emoji = []

    with open (filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)

        for row in csvReader:
            phrase.append(row[0])
            emoji.append(row[1])

    X = np.asarray(phrase)
    Y = np.asarray(emoji, dtype=int)

    return X, Y
#Loading data set as numpy arrays into variables
X_train,Y_train=read_csv(r"/home/shravan/Intern_Manav/python-env/Tweet_classification/Jupyter/Processed Data-Train-Copy1.csv")
X_test,Y_test=read_csv(r"/home/shravan/Intern_Manav/python-env/Tweet_classification/Jupyter/Processed Data-Test-Copy1.csv")

In [4]:
#To create word vector for non existent dictionary words
def new_word_vec(X):
    m = len(X)
    l=words.values()
    length=len(l)+1
    for i in range(m):    
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words =[k.lower() for k in X[i].split()]
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            if w not in l:
                word_to_vec_map[w]=np.random.normal(loc=0.0, scale=.05, size=200)
                word_to_index[w]=length
                words[length]=w
                l=words.values()
                length+=1
new_word_vec(X_train)                
new_word_vec(X_test)


In [5]:
#pretrained_embedding_layer
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creating a Keras Embedding() layer
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    """
    
    vocab_len = len(word_to_index)+1             # adding 1 to fit Keras embedding 
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors
    
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len,emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
    
    # Define Keras embedding layer with the correct output/input sizes, make it trainable. 
    embedding_layer =Embedding(vocab_len,emb_dim,trainable=False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [6]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
#Sample case
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

('weights[0][1][3] =', -0.31723)


In [7]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()`
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m,max_len))
    
    for i in range(m):                               # loop over training examples

        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words =[k.lower() for k in X[i].split()]
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            X_indices[i, j] =word_to_index[w] 
            # Increment j to j + 1
            j = j+1
                
    return X_indices

In [8]:
def Category(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Category prediction model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    """
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape=input_shape,dtype='int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)
    
    # Propagate X trough another LSTM layer
    X = Bidirectional(LSTM(128,return_sequences=True))(embeddings)
    # Dropout with a probability of 0.5
    X = Dropout(.4)(X)
    
    X =Bidirectional(LSTM(128))(X)
    # Dropout with a probability of 0.5
    X = Dropout(.4)(X)

    # Propagate X through a Dense layer with softmax activation to get back a batch of 7-dimensional vectors.
    X =Dense(7,activation='softmax')(X)
    # Softmax activation
    X = Activation('softmax')(X)

    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices,outputs=X)
    
    return model

In [9]:
from keras.layers import Bidirectional
maxlen=30
model = Category((maxlen,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 200)           238917400 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 30, 256)           336896    
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 256)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 1799      
__________

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
def convert_to_one_hot(Y, C): #Converts the O/p into an one hot vector
    Y = np.eye(C)[Y.reshape(-1)]
    return Y
Y_train_oh = convert_to_one_hot(Y_train, C = 7) # Converting expected o/p to numpy arrays
X_train_indices = sentences_to_indices(X_train, word_to_index, maxlen)

In [20]:
print(X_train_indices)

[[174293. 584034. 173855. ...      0.      0.      0.]
 [602713.   4800. 596142. ...      0.      0.      0.]
 [602843. 283379. 428772. ...      0.      0.      0.]
 ...
 [215709. 377072. 607686. ...      0.      0.      0.]
 [466914. 262303. 655001. ...      0.      0.      0.]
 [271682. 339040.  11870. ...      0.      0.      0.]]


In [None]:
model.fit(X_train_indices, Y_train_oh, epochs = 85, batch_size = 8, shuffle=True)

Epoch 1/85
Epoch 2/85
Epoch 3/85
Epoch 4/85
Epoch 5/85
Epoch 6/85
Epoch 7/85
Epoch 8/85
Epoch 9/85
Epoch 10/85
Epoch 11/85
Epoch 12/85
Epoch 13/85
Epoch 14/85
Epoch 15/85
Epoch 16/85
Epoch 17/85
Epoch 18/85
Epoch 19/85
Epoch 20/85
Epoch 21/85
Epoch 22/85
Epoch 23/85
Epoch 24/85
Epoch 25/85
Epoch 26/85
Epoch 27/85
Epoch 28/85
Epoch 29/85
Epoch 30/85
Epoch 31/85
Epoch 32/85

In [62]:
#Testing accuracy after training model with the data
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxlen)
Y_test_oh = convert_to_one_hot(Y_test, C = 7)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)

('Test accuracy = ', 0.43640897759326974)


In [35]:
# This code allows one to see the mislabelled examples
C = 7
y_test_oh = np.eye(C)[Y_test.reshape(-1)]
X_test_indices = sentences_to_indices(X_test, word_to_index, maxlen)
pred = model.predict(X_test_indices)
No_to_output_Dictionary = {"0":"treatment",
                    "1":"not_related_or_irrelevant",
                    "2":"deaths_reports",
                    "3":"disease_signs_or_symptoms",
                    "4":"disease_transmission",
                    "5":"prevention",
                    "6":"other_useful_information"}
c=0
for i in range(len(X_test)):
    x = X_test_indices
    num = np.argmax(pred[i])
    if(num == Y_test[i]):
        c+=1
        #print "Expected o/p:",No_to_output_Dictionary[str(Y_test[i])],' prediction: ',No_to_output_Dictionary[str(num)]
print c

217
