# Natural Language Processing for sentiment classification (Deep Learning)

Óscar Poblete Sáenz <br> Course: Introduction to Artificial Intelligence<br>
Teacher: Elizabeth Guevara Martinez<br>
Universidad Anáhuac <br> <br>
Sources:<br>

Zhang, Ye & Wallace, Byron. A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification. arXiv, 2015.<br>
Vishnu K https://digitaltesseract.com/sentiment-analyzer-using-convolutional-neural-network/

In [43]:
# Import libraries
import numpy as np # Matrices and vectors
import pandas as pd # Data analysis and manipulation
import tensorflow as tf # Tensorflow
import tensorflow_datasets as tfds # Dataset

from tensorflow.keras import layers # CNN layers
from google.colab import drive # Google drive


# Data

In [None]:
# Mount Google Drive at /content/drive; the weights and encoder files loaded
# further down are read from paths under this mount point.
drive.mount("/content/drive")

# Model

In [45]:
class DCNN(tf.keras.Model):
    """Convolutional neural network for sentence classification.

    Token ids are embedded, passed through three parallel 1-D convolutions
    (kernel sizes 2, 3 and 4, i.e. bigram, trigram and four-gram detectors),
    each globally max-pooled, concatenated, and fed through a dense layer,
    dropout and an output layer.

    Args:
        vocab_size: Number of entries of the embedding table.
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters of each of the three convolutions.
        FFN_units: Number of units of the hidden dense layer.
        nb_classes: Number of target classes. With exactly 2 classes the
            output is a single sigmoid unit; otherwise it is a softmax over
            ``nb_classes`` units.
        dropout_rate: Rate given to the dropout layer.
        training: Unused. Kept only so existing callers that pass it keep
            working; the training mode is chosen per call through
            ``call(..., training=...)``.
        name: Name forwarded to ``tf.keras.Model``.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        # NOTE(review): attribute names and creation order are deliberately
        # left untouched; previously saved weight files presumably depend on
        # them -- confirm before reordering or renaming layers.
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # An n-gram is a contiguous sequence of n items from a text sample;
        # a Conv1D with kernel_size=n looks at n consecutive token embeddings.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # One pooling layer is shared by the three convolution branches.
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary problem: a single unit whose sigmoid output is the score
            # of one of the two classes.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # Multiclass problem: softmax yields one probability per class.
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
                Because the convolutions use "valid" padding, sequences are
                expected to hold at least 4 tokens (the largest kernel size).
            training: Forwarded to the dropout layer. Defaults to False so
                the model can be called for inference without the flag.

        Returns:
            The output of the last dense layer (one sigmoid unit when the
            model was created with ``nb_classes == 2``, otherwise a softmax
            over the classes).
        """
        embedded = self.embedding(inputs)

        # Same branch order as the layer definitions: bigram, trigram, fourgram.
        branches = (self.bigram, self.trigram, self.fourgram)
        pooled = [self.pool(conv(embedded)) for conv in branches]

        merged = tf.concat(pooled, axis=-1)
        merged = self.dense_1(merged)
        # Pass the flag by keyword so it cannot be mistaken for another argument.
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)

# Architecture and weights

In [46]:
# Hyperparameters used to rebuild the network before loading its weights.
# NOTE(review): these presumably have to match the values used when the
# weights file was produced -- confirm against the training notebook.
VOCAB_SIZE = 65540  # hard-coded; presumably tokenizer.vocab_size of the saved encoder -- confirm

EMB_DIM = 200  # embedding vector size
NB_FILTERS = 100  # filters per convolution
FFN_UNITS = 256  # units of the hidden dense layer
NB_CLASSES = 2  # binary sentiment; originally derived as len(set(train_labels))


In [47]:
# Instantiate the network from the hyperparameters defined above.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    training=False,
)

In [48]:
# Build the network so its weights exist before they are loaded.
# NOTE(review): the second dimension of input_shape is the sequence length,
# not the vocabulary size; VOCAB_SIZE appears to serve only as a placeholder
# length here -- confirm.
Dcnn.build(input_shape=(1, VOCAB_SIZE))

In [49]:
# Load the previously saved weights from Google Drive (the drive must be mounted).
Dcnn.load_weights('/content/drive/MyDrive/Colab Notebooks/tweets_NLP/Tweets Test/weightsTweet.h5')

In [50]:
# Compile the model; the loss and metric depend on whether the problem is
# biclass (NB_CLASSES == 2) or multiclass.
Dcnn.compile(
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    optimizer="adam",
    metrics=(["accuracy"] if NB_CLASSES == 2
             else ["sparse_categorical_accuracy"]),
)

# Evaluation

In [67]:
# Load the saved subword text encoder (tokenizer) from Google Drive.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file('/content/drive/MyDrive/Colab Notebooks/tweets_NLP/Tweets Test/encodeFile')


In [75]:
# Encode a sample sentence into token ids and display them.
encoded = tokenizer.encode("I love being with you")
print(encoded)

[3, 54, 192, 26, 55]


In [69]:
# Test the ANN: run the encoded sentence as a batch of one, with training
# disabled, and convert the result to a NumPy array.
Dcnn(np.array([encoded]), training=False).numpy()

array([[0.9462931]], dtype=float32)

In [70]:
# Function to get the prediction
def get_prediction(sentence):
    """Print the sentiment the network predicts for ``sentence``.

    The sentence is encoded with the module-level ``tokenizer``, run through
    the module-level ``Dcnn`` with training disabled, and reported as
    negative when the score is below 0.5 and positive otherwise.

    Args:
        sentence: Text to classify.

    Returns:
        None. The result is printed.
    """
    # The widest convolution of DCNN uses kernel_size=4 with "valid" padding,
    # so a sequence shorter than four tokens (e.g. an empty or one-word
    # sentence) cannot be convolved. Pad such inputs up to that length.
    min_tokens = 4
    encoded = list(tokenizer.encode(sentence))
    if len(encoded) < min_tokens:
        # NOTE(review): 0 is assumed to be the padding id used during
        # training -- confirm against the training pipeline.
        encoded += [0] * (min_tokens - len(encoded))

    sentiment = Dcnn(np.array([encoded]), training=False).numpy()
    # Compare the single score explicitly instead of relying on the
    # truthiness of a one-element array.
    label = "Negative" if sentiment[0, 0] < 0.5 else "Positive"
    print("{} sentiment: {} ".format(label, sentiment))

In [71]:
# Show the ANN prediction for a sample sentence
get_prediction("I love being with you")

Positive sentiment: [[0.9462931]] 
