<a href="https://colab.research.google.com/github/meghna2312/SentimentCNN/blob/master/meg__Sentiment_analyzer_using_Convolutional_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Sentiment analysis is one of the essential tasks in NLP. We will build a sentiment analyzer using a CNN (Convolutional Neural Network). It takes tweets as input and outputs whether each tweet conveys a positive or negative sentiment.




### Dataset Link: http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [None]:
import numpy as np
import math
import re #regex for string cleaning
import pandas as pd
from bs4 import BeautifulSoup

from google.colab import drive
#The best way to use files in Google colab is via Google Drive. So, we import drive module to connect it with Google colab

In [None]:
#Request the TensorFlow 2.x release line through the notebook magic (any 2.* version is acceptable: 2.0.2, 2.1, ...).
#This is a deliberate best-effort: if the magic raises, the error is ignored and whatever TensorFlow version
#is already installed gets imported right after this block.
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers #Used to create layers in our deep learning model
import tensorflow_datasets as tfds #Tensorflow datasets are ready-to-use datasets with Tensorflow or other Python ML frameworks

In [None]:
#Mount Google Drive at /content/drive so the dataset files read below are reachable (prompts for an authorization code)
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### `cols` is the list of column names of our dataset.

### `sentiment` : indicates if the sentiment is positive or negative. In the raw file 0 denotes negative and 4 denotes positive; 4 is remapped to 1 during preprocessing.

### `id` : ID of the tweet

### `date` : date on which the tweet was sent

### `query` : this column is not very useful. All the values for this column are 'NO_QUERY', which means none of the tweets has a query.

### `user` : the user who tweeted (Twitter handle)

### `text` : the tweet

In [None]:
#Column names, in file order, applied to the header-less CSV files when they are read
cols = [
    "sentiment",  #class label
    "id",         #tweet ID
    "date",       #date the tweet was sent
    "query",      #query field
    "user",       #Twitter handle of the author
    "text",       #the tweet itself
]

In [None]:
#Load the full training file; column names come from `cols` because the file is read with header=None.
#NOTE(review): the original call also passed skiprows=1. The row count recorded for it (1,599,999, with
#799,999 negatives against 800,000 positives) indicates the file has no header line, so skipping the first
#line dropped a real tweet. The skip is removed here; restore it if train.csv does carry a header row.
actual_data = pd.read_csv(
    "/content/drive/My Drive/Sentiment Analyzer Datasets/train.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [None]:
#1.6M tweets: (rows, columns) of the full training file
actual_data.shape

(1599999, 6)

In [None]:
#no imbalance in classes: count the rows for each raw sentiment label
actual_data.sentiment.value_counts()

4    800000
0    799999
Name: sentiment, dtype: int64

In [None]:
#Load data.csv (not train.csv) into train_data; this is the frame used for every later step.
#Column names come from `cols` (header=None) and the first line of the file is skipped (skiprows=1).
train_data = pd.read_csv(
    "/content/drive/My Drive/Sentiment Analyzer Datasets/data.csv",
    header=None,
    names=cols,
    skiprows = 1,
    engine="python",
    encoding="latin1"
)

In [None]:
#shape returns (number of rows, number of columns) of the dataset: 30000 rows and 6 columns here
train_data.shape

(30000, 6)

In [None]:
#Show the first five rows of the dataset as a quick sanity check of the parsed columns
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1553795194,Sat Apr 18 15:13:59 PDT 2009,NO_QUERY,t_win,It's been the longest day ever! I still haven'...
1,0,2179002334,Mon Jun 15 08:30:28 PDT 2009,NO_QUERY,badsotheynv,I feel uber bad little ol lady is sick wanted ...
2,0,1936039755,Wed May 27 07:20:42 PDT 2009,NO_QUERY,mubi_just_do_it,goose just died...saddest scene i've seen...
3,0,2185132296,Mon Jun 15 16:56:05 PDT 2009,NO_QUERY,walkthistown,@alexamarzi I KNOWW dont move
4,0,2180496762,Mon Jun 15 10:33:02 PDT 2009,NO_QUERY,clare666,@Piewacket1 awwww pie... the 'once in a lifeti...


In [None]:
#no imbalance: the sample keeps an even split between the two raw sentiment labels
train_data.sentiment.value_counts()

4    15000
0    15000
Name: sentiment, dtype: int64

In [None]:

#Work on an independent copy: a plain `data = train_data` only creates a second name for the same frame,
#so the in-place preprocessing done on `data` further down would silently alter train_data as well.
data = train_data.copy()

## Preprocessing

In [None]:
#Keep only the columns the model needs ("sentiment" and "text").
#DataFrame.drop returns a new frame, so the result is assigned back to `data` instead of using inplace=True.
#Deriving it from train_data leaves train_data untouched and keeps this cell re-runnable: an in-place drop
#raises KeyError on a second run because the columns are already gone.
data = train_data.drop(columns=["id", "date", "query", "user"])

In [None]:
def clean_tweet(tweet):
    """Normalize one raw tweet into plain text for the tokenizer.

    Steps, in order: extract the text with BeautifulSoup, remove URLs, remove
    @mentions, replace every character that is not a letter or one of . ! ? '
    with a space, and collapse runs of spaces into one.

    Args:
        tweet: the raw tweet text.

    Returns:
        The cleaned text. A leading or trailing single space may remain where
        a mention, URL or other removed characters used to be.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the URL links first, up to the next whitespace, so that characters such as
    # - _ ? = & inside a link (or an '@' inside a link) do not leave fragments behind
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Removing the @mentions; handles may contain underscores (e.g. @mubi_just_do_it)
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Keeping only letters and common punctuations used in text
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
#Run every tweet of the text column through clean_tweet, keeping the original order
data_clean = list(map(clean_tweet, data["text"]))

In [None]:
#Class counts of the raw labels, checked again before the labels are remapped
data['sentiment'].value_counts()

4    15000
0    15000
Name: sentiment, dtype: int64

In [None]:
#Build the label vector with the positive class 4 remapped to 1 (negative stays 0).
#np.where returns a fresh array, so the DataFrame is not modified as a side effect; writing into
#`data.sentiment.values` changes the frame itself and fails when pandas hands out a read-only array.
raw_sentiment = data["sentiment"].to_numpy()
data_labels = np.where(raw_sentiment == 4, 1, raw_sentiment)

In [None]:
#(rows, columns) after dropping the unused columns
data.shape

(30000, 2)

### Tokenization

In [None]:
#The subword text encoders live in tfds.deprecated.text in newer tensorflow_datasets releases and in
#tfds.features.text in older ones; pick whichever namespace this installation provides.
tfds_text = tfds.deprecated.text if hasattr(tfds, "deprecated") else tfds.features.text

#Learn a subword vocabulary of roughly 2**11 entries from the cleaned tweets
tokenizer = tfds_text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**11
)

#Encode every cleaned tweet as a list of integer token ids
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [None]:
#Length of the longest encoded tweet; every sequence is padded up to it
MAX_LEN = max(map(len, data_inputs))

#Append zeros after the tokens ("post") so all rows have exactly MAX_LEN ids
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    maxlen=MAX_LEN,
    padding="post",
    value=0,
)

### Splitting into training/testing set



In [None]:
#Draw a balanced, reproducible test set: 1500 distinct rows per class.
#  - the candidates come from the labels themselves, so the split does not depend on the rows being
#    sorted by class and can never produce an out-of-range index (randint(15001, 30001) could return
#    30000, one past the last of the 30000 rows);
#  - replace=False guarantees no row is picked twice, so the test set really holds 3000 rows;
#  - a seeded generator makes the split identical on every run.
SPLIT_SEED = 42
TEST_PER_CLASS = 1500
split_rng = np.random.default_rng(SPLIT_SEED)

test_neg_idx = split_rng.choice(np.flatnonzero(data_labels == 0), size=TEST_PER_CLASS, replace=False)
test_pos_idx = split_rng.choice(np.flatnonzero(data_labels == 1), size=TEST_PER_CLASS, replace=False)
test_idx = np.concatenate((test_neg_idx, test_pos_idx))

In [None]:
#Rows picked by test_idx form the test set
test_inputs, test_labels = data_inputs[test_idx], data_labels[test_idx]

#All remaining rows form the training set (np.delete returns copies without the test rows)
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [None]:
class DCNN(tf.keras.Model):
    """CNN text classifier: embedding -> parallel 2/3/4-gram convolutions -> global max pooling -> dense head.

    The three convolution branches all read the same embedded sequence; each branch is reduced
    to one value per filter by global max pooling, and the three results are concatenated and
    fed to a feed-forward layer, dropout, and the output layer.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128, #128 dimensions for embedding layer is default
                 nb_filters=50, #number of filters per convolution branch (default 50)
                 FFN_units=512, #number of units in the feed forward layer (default 512)
                 nb_classes=2, #number of classes = 2 (positive or negative)
                 dropout_rate=0.1, #fraction of units dropped during training to limit overfitting
                 training=False, #not used by the constructor; kept so existing callers that pass it keep working.
                 #Whether dropout is active is decided by the `training` argument of call()
                 name="dcnn"): #name of our model
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding table (size of the token id space).
            emb_dim: dimension of each token embedding.
            nb_filters: number of filters in each of the three convolution branches.
            FFN_units: width of the hidden dense layer.
            nb_classes: 2 builds a single sigmoid output unit; any other value builds a
                softmax layer with nb_classes units.
            dropout_rate: rate of the dropout layer placed after the hidden dense layer.
            training: ignored (see the comment on the parameter).
            name: name given to the Keras model.
        """
        #Initialize the parent tf.keras.Model, passing the model name through
        super(DCNN, self).__init__(name=name)

        #Defining layers
        #1. Embedding layer with vocab size and embedding dimensions
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        #2. Three convolution branches looking at windows of 2, 3 and 4 consecutive tokens.
        #padding="valid" means NO padding: the filter is only applied where it fits entirely inside the
        #sequence, so the output is kernel_size - 1 steps shorter than the input and the input must
        #contain at least kernel_size tokens.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2, #filter size
                                    padding="valid",
                                    activation="relu") #ReLU introduces non-linearity into the model
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")

        #3. 1D max pooling since it's a 1D convolution. The layer has no trainable variables,
        #so the same instance is reused for every branch
        self.pool = layers.GlobalMaxPool1D()
        #4. Feed-forward layer followed by dropout to limit overfitting
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        #5. Output layer, depending on how many classes we have.
        #Two classes: a single number between 0 and 1. Below 0.5 belongs to class 0 (negative sentiment),
        #above 0.5 belongs to class 1 (positive sentiment)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, #1 unit means a single number
                                           activation="sigmoid") #maps any real number to a value between 0 and 1
        else:
            #Softmax returns one value per class, each between 0 and 1 and summing to 1
            #(the probability of belonging to each class)
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token id sequences, fed directly to the embedding layer.
            training: forwarded to the dropout layer to tell it whether to drop units.
                Defaults to None so that call() can also be invoked without the argument.

        Returns:
            The output of the last dense layer: one sigmoid value per example when the model
            was built with nb_classes == 2, otherwise one softmax distribution per example.
        """
        x = self.embedding(inputs) #applying embedding

        #Each branch: convolution over the embedded sequence, then max over the sequence axis
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)

        #Join the three pooled vectors along the last axis: (batch_size, 3 * nb_filters)
        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged) #First dense layer (starting feedforward process)
        merged = self.dropout(merged, training=training) #keyword form, as the Keras layer API documents it
        output = self.last_dense(merged) #outputs

        return output

In [None]:
#Model parameters (global variables)
#Every tunable value is kept in one place and passed to the model by name, so changing the model
#only requires editing this cell. You can play around with these values (hyperparameter tuning)
#to achieve the highest accuracy.
VOCAB_SIZE = tokenizer.vocab_size           #size of the tokenizer's id space
EMB_DIM = 200                               #embedding dimension
NB_FILTERS = 100                            #filters per convolution branch
FFN_UNITS = 256                             #units in the feed-forward layer
NB_CLASSES = len(np.unique(train_labels))   #number of distinct labels in the training set
DROPOUT_RATE = 0.2                          #dropout rate
BATCH_SIZE = 32                             #examples per training step
NB_EPOCHS = 5                               #passes over the training set

In [None]:
#Creating an instance of the model, passing every hyperparameter defined in the parameters cell by keyword
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
#Pick the loss and the tracked metric from the number of classes, then compile once
if NB_CLASSES == 2:
    #binary classification: single sigmoid output
    loss_name, metric_names = "binary_crossentropy", ["accuracy"]
else:
    #more than 2 classes in a different application: softmax output with integer labels
    loss_name, metric_names = "sparse_categorical_crossentropy", ["sparse_categorical_accuracy"]

Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=metric_names)

In [None]:
#Directory that holds the checkpoints
checkpoint_path = "./drive/My Drive/NLP/ckpt/"

#Checkpoint object tracking the model, and a manager that keeps at most 5 checkpoints in that directory
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

#If a checkpoint already exists in the directory, restore the most recent one and say so
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print("Latest checkpoint restored!!")

In [None]:
#Fitting the model on the training split for NB_EPOCHS epochs with mini-batches of BATCH_SIZE examples
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)
#Saving the checkpoint after training; save() returns the path of the new checkpoint, which the cell displays
ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'./drive/My Drive/NLP/ckpt/ckpt-1'

## Evaluation

In [None]:
#Score the trained model on the held-out test split
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)
#Outputs [loss, accuracy]

[0.9236537218093872, 0.7450000047683716]


In [None]:
#Names of the values returned by evaluate(), in the same order
Dcnn.metrics_names

['loss', 'accuracy']

In [None]:
def predict_sentiment(sentence):
    """Return the model output for one raw sentence.

    The sentence goes through the same steps as the training data: clean_tweet, subword
    encoding, then zero-padding at the end up to MAX_LEN. Padding also guarantees the input
    is long enough for the widest convolution filter, which a very short sentence is not.

    Args:
        sentence: the text to score.

    Returns:
        The model output as a NumPy array with one row (a batch of one sentence).
    """
    token_ids = tokenizer.encode(clean_tweet(sentence))
    padded = tf.keras.preprocessing.sequence.pad_sequences([token_ids],
                                                           value=0,
                                                           padding="post",
                                                           maxlen=MAX_LEN)
    return Dcnn(padded, training=False).numpy()


#Score a few hand-written examples (values near 1 = positive, near 0 = negative)
for sentence in ("He is the best",
                 "Doesn't make sense",
                 "He sucks at playing",
                 "Why does he look ugly",
                 "He is a great guy",
                 "You are so funny"):
    print(sentence, "->", predict_sentiment(sentence))

In [None]:
#Persist the trained weights with Keras' own serializer. joblib.dump(Dcnn, ...) raised TypeError
#for this model, so nothing was actually written by the previous version of this cell.
#To reload: create a DCNN with the same hyperparameters, call it once on a batch so its variables
#exist, then call load_weights("model.weights.h5").
Dcnn.save_weights("model.weights.h5")

TypeError: ignored