# Natural Language Processing for sentiment classification (Deep Learning)

Óscar Poblete Sáenz <br> Course: Introduction to Artificial Intelligence<br>
Teacher: Elizabeth Guevara Martinez<br>
Universidad Anáhuac <br> <br>
Sources:<br>

Zhang, Ye & Wallace, Byron. A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification. arXiv:1510.03820, 2015.<br>
Vishnu K https://digitaltesseract.com/sentiment-analyzer-using-convolutional-neural-network/



In [2]:
# Import libraries
import numpy as np # Matrices and vectors
import math # Math instructions
import re # Regex for string cleaning
import pandas as pd # Data analysis and manipulation
import tensorflow as tf # Tensorflow
import tensorflow_datasets as tfds # Dataset

from tensorflow.keras import layers # CNN layers
from bs4 import BeautifulSoup # Pulling data out of HTML and XML files
from google.colab import drive # Google drive

# Data

In [3]:
# Mount Google Drive at /content/drive; the dataset, the saved inputs and the
# checkpoints used below are all read from / written to paths under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Column names for the CSV, in file order (the file itself carries no header row)
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Load the raw training tweets from Drive into a DataFrame used by the later steps
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/tweets_NLP/Tweets/trainTweets.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)


In [5]:
# Sanity check: (rows, columns) of the loaded frame
data.shape

(1600000, 6)

In [22]:
# Peek at the first rows of the frame
data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [23]:
# Peek at the last rows of the frame
data.tail()

Unnamed: 0,sentiment,id,date,query,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


# Preprocessing
Message cleansing

---



In [None]:
# # Remove columns that are not required
# data.drop(["id", "date", "query", "user"], axis=1, inplace=True)
# # Note: re-running this cell after the columns have already been dropped raises a KeyError.

In [None]:
# Message cleansing function
def clean_tweet(tweet):
    """Return `tweet` reduced to ASCII letters and basic punctuation.

    Steps, in order:
      1. Parse the text with BeautifulSoup and keep only its text content.
      2. Replace @mentions with a space.
      3. Replace http/https URLs with a space.
      4. Replace every character other than a-z, A-Z, '.', '!', '?' and the
         apostrophe with a space.
      5. Collapse runs of spaces into a single space.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The cleaned text. A leading or trailing single space may remain;
        the result is not stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove the @ and its mention. The r prefix makes the pattern a raw string.
    # Underscores are included in the character class: without it a mention such as
    # "@_TheSpecialOne_" is not matched and the handle's letters survive the cleaning.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URL links ("s?" makes the "s" optional). The match runs to the next
    # whitespace so that characters such as "-", "_", "?" or "=" inside a URL do not
    # cut it short and leave its tail behind in the text.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keep only letters and . ! ? ' ("^" negates the character class)
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Remove extra whitespace
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Clean every tweet in the `text` column; the result is a plain list of cleaned
# strings in the same order as the rows of `data`
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Inspect the distinct label values present in the data.
# The model expects 0 (negative) and 1 (positive).
set(data.sentiment)

{0, 4}

In [None]:
# 4 is used instead of 1 to denote positive sentiments, so every 4 is replaced by 1.
# Work on an explicit copy: `.values` can hand back a view of the DataFrame's own
# buffer, in which case an in-place assignment would silently rewrite `data.sentiment`
# as well (and it fails outright when pandas returns a read-only array).
data_labels = data.sentiment.values.copy()
data_labels[data_labels == 4] = 1

# Conversion (vectors)

In [None]:
# # Replace each word with a number
# tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#     data_clean, target_vocab_size=2**16
# )

# data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [None]:
# # Perform padding so all vectors are the same size:
# # take the length of the longest encoded sentence as MAX_LEN and fill the
# # shorter ones with trailing zeros up to that length
# MAX_LEN = max([len(sentence) for sentence in data_inputs])
# data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
#                                                             value=0,
#                                                             padding="post", #Everything after that is filled with zeros.
#                                                             maxlen=MAX_LEN)

In [None]:
# Load the previously saved, already tokenized and padded inputs as a NumPy array
# (this replaces re-running the commented-out tokenizer / padding cells above)
data_inputs = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/tweets_NLP/Tweets/data_inputs.csv"
).to_numpy()

In [None]:
# Show the loaded input matrix (NumPy abbreviates large arrays when displaying them)
data_inputs

array([[65316,  1570,   113, ...,     0,     0,     0],
       [   11,  1090,    23, ...,     0,     0,     0],
       [65316,     3, 41563, ...,     0,     0,     0],
       ...,
       [  927,    12,   229, ...,     0,     0,     0],
       [  366,   337,  1309, ...,     0,     0,     0],
       [  181, 51236,     0, ...,     0,     0,     0]])

Split into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so that every kernel restart produces the same partition. The model may
# be restored from a checkpoint written in an earlier session; with an unseeded split,
# tweets it was trained on could end up in the test set and inflate the evaluation.
# NOTE(review): holding out 60% for testing is unusually large - confirm it is intentional.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    data_inputs,
    data_labels,
    test_size=0.6,
    random_state=42,
)

In [None]:
# Show the training portion of the input matrix
train_inputs

array([[65316, 15121,    46, ...,     0,     0,     0],
       [65316,   670,    55, ...,     0,     0,     0],
       [ 1668,    11,     4, ...,     0,     0,     0],
       ...,
       [65316,   756,   232, ...,     0,     0,     0],
       [65316,   984,     1, ...,     0,     0,     0],
       [65357, 65323,    19, ...,     0,     0,     0]])

# Model

In [None]:
class DCNN(tf.keras.Model):
    """CNN text classifier with parallel 2-, 3- and 4-gram convolution branches.

    Token ids are embedded, the embeddings go through three Conv1D layers with
    kernel sizes 2, 3 and 4, each branch is max-pooled over the sequence axis,
    and the pooled features are concatenated and passed through a dense layer,
    dropout and the output layer.

    Args:
        vocab_size: Input dimension of the embedding layer.
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Units of the hidden dense layer.
        nb_classes: With 2, the output is a single sigmoid unit; otherwise it
            is a softmax over `nb_classes` units.
        dropout_rate: Rate of the dropout applied after the hidden dense layer.
        training: Unused. Kept only so existing calls that pass it keep working;
            the training flag that matters is the one given to `call`.
        name: Model name forwarded to `tf.keras.Model`.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # An n-gram is a contiguous sequence of n items from a given sample of text.
        # Each 1-D convolution below looks at windows of 2, 3 or 4 consecutive tokens.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary case: one unit giving a value in [0, 1]
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # Multiclass case: probability vector over the classes
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
                NOTE(review): with "valid" padding the kernel-size-4 branch
                presumably needs sequences of at least 4 tokens - confirm.
            training: Forwarded to the dropout layer. Defaults to None so the
                model can also be called without an explicit flag.

        Returns:
            The output of the final dense layer (sigmoid or softmax activations,
            depending on how the model was constructed).
        """
        x = self.embedding(inputs)

        # One pooled feature vector per n-gram branch; the pooling layer is shared
        x_1 = self.pool(self.bigram(x))
        x_2 = self.pool(self.trigram(x))
        x_3 = self.pool(self.fourgram(x))

        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged)
        # Pass the flag by keyword so it cannot be mistaken for another argument
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

In [None]:
# Define training parameters

# Vocabulary size given to the embedding layer; hard-coded here in place of
# tokenizer.vocab_size because the tokenizer cell is commented out
VOCAB_SIZE = 65_540

# Architecture
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2  # alternatively: len(set(train_labels))

# Regularisation
DROPOUT_RATE = 0.2

# Optimisation loop
BATCH_SIZE = 32  # samples grouped per gradient step
NB_EPOCHS = 4

In [None]:
# Build the model from the hyper-parameters defined above
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Pick loss and metric according to whether the problem is binary or multiclass:
# binary     -> binary cross-entropy, reported as "accuracy"
# multiclass -> sparse categorical cross-entropy, reported as sparse categorical accuracy
Dcnn.compile(
    loss=("binary_crossentropy"
          if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    optimizer="adam",
    metrics=(["accuracy"]
             if NB_CLASSES == 2
             else ["sparse_categorical_accuracy"]),
)

In [None]:
# Persist training results on Drive.
# Colab discards the runtime once its execution-time limit is exceeded, so anything
# that is not saved outside the session is lost.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/tweets_NLP/Tweets/checkpoint"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    directory=checkpoint_path,
    max_to_keep=5,  # older checkpoints beyond the five most recent are removed
)

# Resume from the most recent checkpoint when one is available
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Last checkpoint restored!")

In [None]:
# Train the model.
# A checkpoint is written at the end of every epoch, so a Colab timeout part-way
# through training loses at most the epoch in progress instead of the whole run.
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS,
         callbacks=[
             tf.keras.callbacks.LambdaCallback(
                 on_epoch_end=lambda epoch, logs: ckpt_manager.save()
             )
         ])
# Path of the newest checkpoint (written after the final epoch)
ckpt_manager.latest_checkpoint

# Evaluation

In [None]:
# Score the model on the held-out split; with the loss and metric configured at
# compile time the returned list is [loss, accuracy]
results = Dcnn.evaluate(
    test_inputs,
    test_labels,
    batch_size=BATCH_SIZE,
)
print(results)

[0.6242788434028625, 0.8312177062034607]


In [None]:
# Score one hand-written sequence of five token ids (a batch of shape (1, 5)) with
# dropout disabled (training=False) and show the raw model output.
# NOTE(review): the ids presumably come from the subword tokenizer; the sentence they
# encode is not shown here - consider documenting it.
Dcnn(np.array([[3, 54, 192, 26, 55]]), training=False).numpy()

array([[0.7388818]], dtype=float32)

In [None]:
# Score a second five-token sequence that differs from [3, 54, 192, 26, 55] only in
# its second token id (161 instead of 54), again with dropout disabled.
# NOTE(review): the sentence these ids encode is not shown here - consider documenting it.
Dcnn(np.array([[3, 161, 192, 26, 55]]), training=False).numpy()

array([[0.00071795]], dtype=float32)