<a href="https://colab.research.google.com/github/madarshb19/Application-Of-CNNs-in-Sentiment-Analysis/blob/main/ApplicationsOfCNNforSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math 
import re #for processing strings in python
import pandas as pd
from bs4 import BeautifulSoup #we use this as we want to work with texts which comes with different encodings.In our case,training data comes from tweets.
from google.colab import drive

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds #comes with tokenizer used later

# Stage 2: Data preprocessing

## Loading files

In [3]:
drive.mount("/content/drive") #used to link google drive to our notebook

Mounted at /content/drive


In [4]:
cols = ["sentiment","id","date","query","user","text"]
train_data = pd.read_csv("/content/drive/MyDrive/CNN_For_NLP/train.csv",
                         header = None,
                         names = cols,
                         engine = "python",
                         encoding = "latin1"
                         ) #no headers and specifying python engine important to avoid some parsing errors
test_data = pd.read_csv("/content/drive/MyDrive/CNN_For_NLP/test.csv",
                         header = None,
                         names = cols,
                         engine = "python",
                         encoding = "latin1"
                         )

In [5]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
test_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


## Preprocessing

### Cleaning

In [7]:
train_data.drop(["id","date","query","user"], #irrelevant columns
          axis = 1, #columns
          inplace = True) #no need to write data = data.drop

In [8]:
train_data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [9]:
def clean_tweet(tweet): 
  tweet = BeautifulSoup(tweet, "lxml").get_text() # to interpret tweets with xml format
  tweet = re.sub(r"@[A-Za-z0-9]+",' ',tweet) #removes the @ in strings in tweets.we use RegEx here.
  #r means its a regex string. "@[A-Za-z0-9]+" : @ followed by A-Z,a-z or 0-9 in any amount(+) must be removed and replaced with white space
  #do this replacement in out tweet
  tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ',tweet) #? shows that s is optional
  tweet  =re.sub(r"[^A-Za-z.!?']",' ',tweet) #anything other than the strings inside the [] must be removed and replaced with ' '
  tweet = re.sub(r" +,",' ',tweet) #too many white spaces replaced with a single white space
  return tweet

In [10]:
data_clean = [clean_tweet(tweet) for tweet in train_data.text] #.text is the column containing all the tweets

In [11]:
data_labels = train_data.sentiment.values

In [12]:
set(data_labels) #the sentiment values are 2 classes:0 and 4.want to make it 0 and 1

{0, 4}

In [13]:
data_labels[data_labels == 4] = 1

In [14]:
set(data_labels)

{0, 1}

### Tokenization

In [16]:
#from sequence of characters,get list of numbers where each number corresponds to a word.Corpus is the list fo strings
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size  = 2**16
)
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

### Padding

In [19]:
#training in batches and for that,we need all sentences to have same length.We use 0 because it is not used in the tokenizer
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value = 0,
                                                            padding = 'post',
                                                            maxlen = MAX_LEN) #'post i.e padding at the end

### Spliting into training/testing set

In [20]:
test_idx = np.random.randint(0,800000,8000)
test_idx = np.concatenate((test_idx,test_idx+800000))

In [21]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
#now we get rid of data from training set
train_inputs = np.delete(data_inputs,test_idx, axis = 0) #since this is a list
train_labels = np.delete(data_labels,test_idx) #since this is a vector

# Stage 3: Model building

In [39]:
class DCNN(tf.keras.Model):
  def __init__(self,vocab_size,
               emb_dim = 128,
               nb_filters = 50,
               FFN_units = 512,
               nb_classes = 2,
               dropout_rate = 0.1,
               training = False,
               name = "dcnn"):
    #the training boolean is to know whether the model is in training phase or evaluation phase
    super(DCNN,self).__init__(name=name) # to class the function of the class we are inheriting it from
    self.embedding = layers.Embedding(vocab_size,emb_dim)
    self.bigram = layers.Conv1D(filters = nb_filters,kernel_size = 2,padding = 'valid',activation = 'relu') #first filter,1D convolution layer.
    #padding method doesnt matter as our stride value = 1. Bigram is the 2X5 layer in our architecture
    self.pool_1 = layers.GlobalMaxPool1D()
    self.trigram = layers.Conv1D(filters = nb_filters,kernel_size = 3,padding = 'valid',activation = 'relu') # 3X5 layer
    self.pool_2 = layers.GlobalMaxPool1D()
    self.fourgram = layers.Conv1D(filters = nb_filters,kernel_size = 4,padding = 'valid',activation = 'relu') # 4X5 layer
    self.pool_3 = layers.GlobalMaxPool1D()
    self.dense_1 = layers.Dense(units = FFN_units,activation = 'relu')
    self.dropout = layers.Dropout(rate = dropout_rate)#too many variables here,dropout layer would be ideal
    if(nb_classes == 2):
      self.last_dense = layers.Dense(units=1,activation = 'sigmoid')
    else:
      self.last_dense = layers.Dense(units=nb_classes,activation = 'softmax') #softmax will return the corresponding probabilities

  def call(self,inputs,training): #to call our model
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool_1(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_2(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool_3(x_3)

    merged = tf.concat([x_1,x_2,x_3],axis = -1) #along the last axis. 1st axis refers to batches,2nd axis refers to values after max pooling
    # after this concat, we get a shape of (batch_size,3*nb_filters)

    #now,applying first dense layer
    merged = self.dense_1(merged)
    merged = self.dropout(merged,training)
    output = self.last_dense(merged)
    return output


# Stage 4: Application

## Config

In [40]:
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))
DROPOUT_RATE = 0.2
BATCH_SIZE = 32
NB_EPOCHS = 5  #hyperparameters based on trail and error.Yet to be optimized

## Training

In [41]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMB_DIM,
            nb_filters = NB_FILTERS ,
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE)

In [42]:
if(NB_CLASSES == 2):
  Dcnn.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics = ['accuracy'])
else:
  Dcnn.compile(loss = 'sparse_categorical_crossentropy',
               optimizer = 'adam',
               metrics = ['sparse_categorical_accuracy'])

In [43]:
#checkpoint before evaluation
checkpoint_path = "./drive/MyDrive/CNN_For_NLP"
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep = 1) #maximum of 1 checkpoint

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt.manager.latest_checkpoint)
  print("Latest checkpoint restored...")

In [44]:
Dcnn.fit(train_inputs,train_labels,batch_size = BATCH_SIZE,epochs = NB_EPOCHS)
ckpt.manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


AttributeError: ignored

## Evaluation

In [45]:
results = Dcnn.evaluate(test_inputs,test_labels,batch_size = BATCH_SIZE)
print(results)

[0.5255874395370483, 0.8224375247955322]


In [46]:
Dcnn(np.array([tokenizer.encode('Bro,this project has been really stressful and my eyes hurt')]),training = False).numpy()

array([[0.39973444]], dtype=float32)

In [47]:
Dcnn(np.array([tokenizer.encode('But I am glad its over now and I learnt a lot on resolving errors and use of CNNs')]),training = False).numpy()

array([[0.99068904]], dtype=float32)