<a href="https://colab.research.google.com/github/piyush5566/Twitter-Sentiment-Analysis/blob/master/ML_CLASS_PRO_lstm_cnn_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import argparse
import io
import json
import os
import re
import sys

import numpy as np
import pandas as pd
from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D, LSTM, LeakyReLU
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [None]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
#!unzip glove*.zip

In [None]:
#!ls
#!pwd

In [None]:
# ---------------------------------------------------------------------------
# Data locations
# ---------------------------------------------------------------------------
# Directory that holds the train/dev conversation files.
emoContextDir = "/content/drive/My Drive/ML CLASS PROJECT/emocontext/"
trainDataPath = emoContextDir + "train.txt"
testDataPath = emoContextDir + "dev.txt"

# Directory in which the GloVe vector file is stored.
gloveDir = "/content/drive/My Drive/glove/"

# ---------------------------------------------------------------------------
# Data / vocabulary settings
# ---------------------------------------------------------------------------
NUM_FOLDS = 5               # K for K-fold cross validation
NUM_CLASSES = 4             # Happy, Sad, Angry, Others
MAX_NB_WORDS = 20000        # Upper limit on tokens kept by keras.preprocessing.text.Tokenizer
MAX_SEQUENCE_LENGTH = 100   # Conversations with fewer tokens than this are padded
EMBEDDING_DIM = 100         # Width of each word-embedding vector

# ---------------------------------------------------------------------------
# Model / training settings
# ---------------------------------------------------------------------------
BATCH_SIZE = 200            # Samples per batch for training and prediction
LSTM_DIM = 128              # Size of the representation learnt by the LSTM
DROPOUT = 0.2               # Fraction of units dropped on the LSTM inputs. Ref - https://keras.io/layers/recurrent/
NUM_EPOCHS = 20             # Maximum number of epochs per model
LEARNING_RATE = 0.003       # Learning rate of the RMSprop optimizer

# ---------------------------------------------------------------------------
# Label encoding
# ---------------------------------------------------------------------------
label2emotion = {
    0: "others",
    1: "happy",
    2: "sad",
    3: "angry",
}
# Inverse lookup, derived so the two mappings cannot drift apart.
emotion2label = {emotion: label for label, emotion in label2emotion.items()}


In [None]:
# Read the tab-separated training file into a DataFrame and preview its first rows.
train_data = pd.read_csv(trainDataPath,sep = '\t')
train_data.head()

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?,others
1,1,When did I?,saw many times i think -_-,No. I never saw you,angry
2,2,By,by Google Chrome,Where you live,others
3,3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore,angry
4,4,Just for time pass,wt do u do 4 a living then,Maybe,others


In [None]:
# Column names, non-null counts and dtypes of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30160 entries, 0 to 30159
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      30160 non-null  int64 
 1   turn1   30160 non-null  object
 2   turn2   30160 non-null  object
 3   turn3   30160 non-null  object
 4   label   30160 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.2+ MB


In [None]:
# Number of training rows per value of the label column.
train_data['label'].value_counts()

others    14948
angry      5506
sad        5463
happy      4243
Name: label, dtype: int64

In [None]:
# Read the tab-separated file at testDataPath into a DataFrame and preview its first rows.
test_data = pd.read_csv(testDataPath,sep = '\t')
test_data.head()

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Then dont ask me,YOURE A GUY NOT AS IF YOU WOULD UNDERSTAND,IM NOT A GUY FUCK OFF,angry
1,1,Mixed things such as??,the things you do.,Have you seen minions??,others
2,2,Today I'm very happy,and I'm happy for you ❤,I will be marry,happy
3,3,Woah bring me some,left it there oops,Brb,others
4,4,it is thooooo,I said soon master.,he is pressuring me,others


In [None]:
# Number of test rows per value of the label column.
test_data['label'].value_counts()

others    2338
angry      150
happy      142
sad        125
Name: label, dtype: int64

In [None]:
def preprocessData(dataFilePath, mode):
    """Load a tab-separated conversation file and return ids, conversations and labels.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line must hold five tab-separated fields:
    ``id, turn1, turn2, turn3, label``.

    Input:
        dataFilePath : Path to train/test file to be processed
        mode : Kept for interface compatibility ("train" / "test"). It is not
               consulted: labels are read and returned in both modes.
    Output:
        indices : List of conversation IDs (int), in file order
        conversations : List of lower-cased 3 turn conversations, each turn separated by the <eos> tag
        labels : List of integer labels, looked up in emotion2label
    Raises:
        IndexError : a non-blank line has fewer than five tab-separated fields
        KeyError : the label field is not a key of emotion2label
        ValueError : the id field is not an integer
    """
    # Built once instead of once per line.
    repeatedChars = ['.', '?', '!', ',']
    duplicateSpacePattern = re.compile(r'\ +')

    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for line in finput:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            # Dropping the empty pieces collapses runs of the character; it
            # also discards punctuation at the very start or end of the line.
            for c in repeatedChars:
                pieces = [piece for piece in line.split(c) if piece != '']
                line = (' ' + c + ' ').join(pieces)

            fields = line.strip().split('\t')

            labels.append(emotion2label[fields[4]])

            conv = ' <eos> '.join(fields[1:4])
            # Remove any duplicate spaces
            conv = duplicateSpacePattern.sub(' ', conv)

            indices.append(int(fields[0]))
            conversations.append(conv.lower())

    return indices, conversations, labels
    
        

In [None]:
def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has NUM_CLASSES decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Fraction of samples whose predicted class equals the true class (all classes, including "Others")
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    Any ratio whose denominator is zero (for example the precision of a class
    that was never predicted) is reported as 0 instead of NaN.
    Per-class, macro and micro figures are also printed.
    """
    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    # num_classes is pinned so the one-hot width always matches `ground`, even
    # when the highest class id is never predicted.
    discretePredictions = to_categorical(predictions.argmax(axis=1), num_classes=NUM_CLASSES)

    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = _safeRatio(truePositives[c], truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = _safeRatio(truePositives[c], truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = _safeRatio(2 * recall * precision, precision + recall)
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    # Average over the classes that were actually summed (all but "Others").
    emotionClassCount = NUM_CLASSES - 1
    macroPrecision = _safeRatio(macroPrecision, emotionClassCount)
    macroRecall = _safeRatio(macroRecall, emotionClassCount)
    macroF1 = _safeRatio(2 * macroRecall * macroPrecision, macroPrecision + macroRecall)
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = _safeRatio(truePositives, truePositives + falsePositives)
    microRecall = _safeRatio(truePositives, truePositives + falseNegatives)

    microF1 = _safeRatio(2 * microRecall * microPrecision, microPrecision + microRecall)
    # -----------------------------------------------------

    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions==ground)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1

In [None]:
def getEmbeddingMatrix(wordIndex):
    """Build the embedding matrix for a tokenizer vocabulary from the GloVe file.

    Row ``i`` of the result holds the GloVe vector of the word whose index in
    ``wordIndex`` is ``i`` (e.g. if "happy" has index 19, row 19 is its vector).
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : Array of shape (len(wordIndex) + 1, EMBEDDING_DIM).
                          Rows of words missing from GloVe are left all-zero.
    """
    glovePath = os.path.join(gloveDir, 'glove.6B.100d.txt')

    # Parse the GloVe file: each line is a word followed by its vector components.
    embeddingsIndex = {}
    with io.open(glovePath, encoding="utf8") as gloveFile:
        for row in gloveFile:
            tokens = row.split()
            embeddingsIndex[tokens[0]] = np.asarray(tokens[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddingsIndex))

    # Word indices start at 1, so one extra row is needed.
    rowCount = len(wordIndex) + 1
    embeddingMatrix = np.zeros((rowCount, EMBEDDING_DIM))
    for word, position in wordIndex.items():
        # Words without a GloVe vector keep their zero row.
        if word in embeddingsIndex:
            embeddingMatrix[position] = embeddingsIndex[word]

    return embeddingMatrix

In [None]:
def buildModel(embeddingMatrix):
    """Constructs and compiles the two classifiers compared in this notebook
    Input:
        embeddingMatrix : The embedding matrix to be loaded in the embedding layer.
    Output:
        model_lstm : Embedding -> LSTM -> LeakyReLU -> Dense(64) -> softmax, compiled with RMSprop
        model_cnn : Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense(256) -> softmax, compiled with Adam
    Both models use categorical cross-entropy loss and report the 'acc' metric.
    """
    def _frozenEmbeddingLayer():
        # Each model gets its own layer instance so the two Sequential models
        # stay fully independent of each other. The weights are not trainable.
        return Embedding(embeddingMatrix.shape[0],
                         EMBEDDING_DIM,
                         weights=[embeddingMatrix],
                         input_length=MAX_SEQUENCE_LENGTH,
                         trainable=False)

    # ----------------------------- LSTM model -----------------------------
    model_lstm = Sequential()
    model_lstm.add(_frozenEmbeddingLayer())
    model_lstm.add(LSTM(LSTM_DIM, dropout=DROPOUT))
    model_lstm.add(LeakyReLU())
    model_lstm.add(Dense(64, activation = 'relu'))
    model_lstm.add(Dense(NUM_CLASSES, activation='softmax'))

    # Canonical class name and keyword; the lowercase `rmsprop` alias and the
    # `lr` keyword are legacy spellings.
    rmsprop = optimizers.RMSprop(learning_rate=LEARNING_RATE)
    model_lstm.compile(loss='categorical_crossentropy',
                  optimizer=rmsprop,
                  metrics=['acc'])

    # ------------------------------ CNN model -----------------------------
    model_cnn = Sequential()
    model_cnn.add(_frozenEmbeddingLayer())
    model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
    model_cnn.add(GlobalMaxPooling1D())
    model_cnn.add(Dense(256, activation='relu'))
    model_cnn.add(Dense(NUM_CLASSES, activation='softmax'))
    model_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    return model_lstm, model_cnn
    

In [None]:


    
        
 
    
    
        
print("Processing training data...")
trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")

print("Processing test data...")
testIndices, testTexts, y_test = preprocessData(testDataPath, mode="test")

print("Extracting tokens...")
# The vocabulary is fitted on the training conversations only; the test
# conversations are merely converted with it.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(trainTexts)
trainSequences = tokenizer.texts_to_sequences(trainTexts)
testSequences = tokenizer.texts_to_sequences(testTexts)

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex)

data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print("Shape of training data tensor: ", data.shape)
print("Shape of label tensor: ", labels.shape)

# Randomize data
# Shuffle by row position rather than by conversation id: indexing the arrays
# with the ids is only correct when the ids happen to be exactly 0..N-1.
# A seeded generator keeps the shuffle (and therefore the split below)
# reproducible across runs.
shuffleOrder = np.random.RandomState(42).permutation(len(data))
data = data[shuffleOrder]
labels = labels[shuffleOrder]
# Keep the id list aligned with the shuffled rows.
trainIndices = [trainIndices[position] for position in shuffleOrder]

# Hold out 10% of the training data as a development set.
X_train, X_dev, y_train, y_dev = train_test_split(
    data, labels, test_size=0.10, random_state=42)
      
    
         
    
    
   


Processing training data...
Processing test data...
Extracting tokens...
Found 16831 unique tokens.
Populating embedding matrix...
Found 400000 word vectors.
Shape of training data tensor:  (30160, 100)
Shape of label tensor:  (30160, 4)


In [None]:
import tensorflow as tf

In [None]:

    
        
print("Building model...")
model_lstm,model_cnn = buildModel(embeddingMatrix)

# Stop when validation accuracy has not improved for 4 epochs and roll back to
# the best weights seen. The callback comes from the same `keras` package as
# the models, and each model gets its own instance because the callback keeps
# per-run state (including a copy of the best weights).
earlyStoppingArgs = dict(monitor='val_acc', patience=4, restore_best_weights=True)

print('Training LSTM model\n')
model_lstm.fit(X_train, y_train, validation_data=(X_dev, y_dev),
               epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
               callbacks=[EarlyStopping(**earlyStoppingArgs)])

print('Training CNN model\n')

model_cnn.fit(X_train, y_train, validation_data=(X_dev, y_dev),
               epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
               callbacks=[EarlyStopping(**earlyStoppingArgs)])

    


Building model...
Training LSTM model

Train on 27144 samples, validate on 3016 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Training CNN model

Train on 27144 samples, validate on 3016 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.callbacks.History at 0x7f359c2c94a8>

In [None]:
# Bring the test sequences to the same fixed length used for the training data.
testData = pad_sequences(testSequences, maxlen=MAX_SEQUENCE_LENGTH)

# Per-class scores -> index of the highest-scoring class for each conversation.
lstmScores = model_lstm.predict(testData, batch_size=BATCH_SIZE)
predictions_lstm = np.argmax(lstmScores, axis=1)

print('FOR GLOVE + LSTM')
lstmReport = classification_report(y_test, predictions_lstm)
print(lstmReport)

FOR GLOVE + LSTM
              precision    recall  f1-score   support

           0       0.96      0.90      0.93      2338
           1       0.57      0.65      0.61       142
           2       0.49      0.74      0.59       125
           3       0.59      0.77      0.67       150

    accuracy                           0.88      2755
   macro avg       0.65      0.77      0.70      2755
weighted avg       0.90      0.88      0.88      2755



In [None]:
# Per-class scores -> index of the highest-scoring class for each conversation.
cnnScores = model_cnn.predict(testData, batch_size=BATCH_SIZE)
predictions_cnn = np.argmax(cnnScores, axis=1)

print('FOR GLOVE + CNN')
cnnReport = classification_report(y_test, predictions_cnn)
print(cnnReport)

FOR GLOVE + CNN
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      2338
           1       0.44      0.53      0.48       142
           2       0.41      0.72      0.52       125
           3       0.54      0.82      0.65       150

    accuracy                           0.84      2755
   macro avg       0.59      0.73      0.64      2755
weighted avg       0.88      0.84      0.86      2755

