# Sentiment140 - A Twitter Sentiment Analysis Tool
# Written by Abiola Obembe
## Date: 2020-12-


Abstract
The data is a CSV with emoticons removed. Data file format has 6 fields:
0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query (lyx). If there is no query, then this value is NO_QUERY.
4 - the user that tweeted (robotickilldozr)
5 - the text of the tweet (Lyx is cool)



In [20]:
# Tokenization library installation
!pip install -q tensorflow-text

## Step 1: Import dependencies

In [21]:
#Import libraries
import numpy as np
import pandas as pd
import re
import math
from bs4 import BeautifulSoup
#from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
# Mount data from google drive
from google.colab import drive


# Tensorflow v2

try:
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import tensorflow_text as text
print("The tensorflow version is :", tf.__version__)


print("Dependencies installed succesffuly!")


The tensorflow version is : 2.3.0
Dependencies installed succesffuly!


## Step 2: Data Preprocessing

In [22]:
# Mount data from google drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Load data files
cols = ["sentiment", "id", "date", "query", "user", "text"]

train_data = pd.read_csv(
    "/content/drive/MyDrive/DS_Projects/CNN_4_NLP/traindata.csv",error_bad_lines=False,
    header=None,names=cols,engine="python",encoding="latin1",nrows = 100)

test_data = pd.read_csv(
    "/content/drive/MyDrive/DS_Projects/CNN_4_NLP/testdata.csv",error_bad_lines=False,
    header=None, names=cols,engine="python",encoding="latin1")


print("Training set :", train_data.shape )
print("Test set :", test_data.shape)

train_data.head()

Training set : (100, 6)
Test set : (498, 6)


Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [24]:
# clean data
train_data.drop(['id','date', 'query','user'], axis = 1, inplace= True)
train_data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [25]:
# Examine sentiment data labels
train_data['sentiment'].value_counts()

0    100
Name: sentiment, dtype: int64

In [26]:
# Function to clean
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    tweet = tweet.lower()
    
    return tweet

In [27]:
# apply function on dataset
data_train_clean = [ clean_tweet(tweet)  for tweet in train_data.text]


In [28]:
# Evaluate the sentiment column
set(train_data.sentiment.values)

{0}

In [29]:
# Let's set the sentiment values labelled 4 as 1
data_labels = train_data.sentiment.values
data_labels[data_labels == 4] = 1
set(data_labels)

{0}

In [35]:
# Tokenization
#tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(data_train_clean, target_vocab_size=2**10)
#data_inputs = [tokenizer.encode(sentence) for sentence in data_train_clean]
from keras.preprocessing.text import Tokenizer  
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data_train_clean)
#data_inputs = [tokenizer.texts_to_sequences(sentence) for sentence in data_train_clean]
data_inputs = tokenizer.texts_to_sequences(data_train_clean)

In [36]:
data_inputs

[[100, 101, 6, 173, 7, 174, 51, 175, 176, 15, 177, 23, 3, 34, 8, 178],
 [9,
  179,
  11,
  180,
  52,
  181,
  182,
  183,
  102,
  184,
  8,
  4,
  185,
  70,
  53,
  6,
  186,
  103,
  44,
  104,
  187],
 [1, 188, 189, 190, 10, 2, 191, 192, 3, 193, 2, 194, 71, 28, 15, 195],
 [5, 105, 106, 196, 197, 4, 29, 72, 35, 198],
 [30,
  45,
  12,
  199,
  16,
  73,
  31,
  107,
  36,
  74,
  1,
  54,
  200,
  1,
  52,
  24,
  7,
  73,
  201,
  75],
 [12, 2, 105, 202],
 [55, 6, 108],
 [56,
  109,
  46,
  30,
  24,
  203,
  204,
  6,
  110,
  76,
  6,
  110,
  111,
  31,
  205,
  206,
  207,
  7],
 [208, 209, 77, 57, 18, 8],
 [210, 13, 211],
 [212, 112, 14, 213, 214, 45, 215],
 [1, 20, 216, 217, 5, 218],
 [1, 219, 220, 3, 78, 8, 4, 1, 79, 2, 221, 222, 21, 223],
 [8, 8, 224, 225, 36, 1, 80, 81, 7, 82, 226, 3, 13, 227],
 [1,
  228,
  229,
  2,
  230,
  17,
  1,
  57,
  18,
  6,
  231,
  12,
  37,
  58,
  232,
  233,
  20,
  6,
  234],
 [1, 113, 1, 51, 3, 78, 8, 32, 7, 1, 47, 7, 4, 59, 21, 2, 235],

In [37]:
# Padding
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [39]:
data_inputs

array([[100, 101,   6, ...,   0,   0,   0],
       [  9, 179,  11, ...,   0,   0,   0],
       [  1, 188, 189, ...,   0,   0,   0],
       ...,
       [ 56,   1, 622, ...,   0,   0,   0],
       [  7,  33, 627, ...,   0,   0,   0],
       [  6,  69, 172, ...,   0,   0,   0]], dtype=int32)

In [38]:
# Split dataset into test and train set (due to orderd nature of traget values 0 and 1)
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)


IndexError: ignored

In [17]:
train_inputs

array([[455, 137,   4, ...,   0,   0,   0],
       [  8, 789,  17, ...,   0,   0,   0],
       [  1, 313, 352, ...,   0,   0,   0],
       ...,
       [ 35,   7, 201, ...,   0,   0,   0],
       [113, 283, 267, ...,   0,   0,   0],
       [113,   0,   0, ...,   0,   0,   0]], dtype=int32)

## Step 3: Build Model

In [18]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,emb_dim)

        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
        
        self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3, padding="valid", activation="relu")
        
        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")
        
        self.pool = layers.GlobalMaxPool1D() # no training variable so we can
                                             # use the same layer for each
                                             # pooling step
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

## Step 4: Training the Model

In [19]:
# Configuration details
VOCAB_SIZE = tokenizer.vocab_size +1

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 2

AttributeError: ignored

In [None]:
# Let's train the model
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [None]:
checkpoint_path = "./drive/MyDrive/DS_Projects/CNN_4_NLP/ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [None]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)
ckpt_manager.save()

## Step 6: Evaluate the model

In [None]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)

In [None]:
Dcnn(np.array([tokenizer.encode("bad teacher")]), training=False).numpy()

In [None]:
# save trained model
import pickle
tokenizer.encode("bad")
filename = 'twitter_sentiment.pickle'
pickle.dump(Dcnn, open(filename, 'wb'))

In [None]:
# load model
loaded_model = pickle.load(open(filename, 'rb'))