# Stage 1: Importing dependencies

In [None]:
Dataset Link: http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [44]:
!pip uninstall tensorflow
!pip install tensorflow

Found existing installation: tensorflow 2.11.0
Uninstalling tensorflow-2.11.0:
  Would remove:
    /usr/local/bin/estimator_ckpt_converter
    /usr/local/bin/import_pb_to_tensorboard
    /usr/local/bin/saved_model_cli
    /usr/local/bin/tensorboard
    /usr/local/bin/tf_upgrade_v2
    /usr/local/bin/tflite_convert
    /usr/local/bin/toco
    /usr/local/bin/toco_from_protos
    /usr/local/lib/python3.9/dist-packages/tensorflow-2.11.0.dist-info/*
    /usr/local/lib/python3.9/dist-packages/tensorflow/*
Proceed (Y/n)? y
  Successfully uninstalled tensorflow-2.11.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow
Successfully installed tensorflow-2.11.0


In [2]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

import os
from google.colab import drive

In [3]:
try:
     %tensorflow_version 2.х
except Exception:
     pass
import tensorflow as tf
import tensorflow_datasets as tfds



Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


# Stage 2: Data preprocessing

## Loading files

In [4]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
train_data = pd.read_csv(
    "/content/drive/My Drive/projects/CNN_for_NLP/train.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)


## Preprocessing

In [6]:
data = train_data

### Cleaning

In [7]:
data.drop(["id", "date", "query", "user"],
         axis=1,
         inplace=True)

In [8]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet,"lxml").get_text()
    tweet = re.sub(r"@[A-Za-z0-9]+",' ',tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ',tweet)
    tweet = re.sub(r"[^a-zA-Z.!?]",' ',tweet)
    tweet = re.sub(r" +",' ',tweet)
    return tweet

In [9]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

  tweet = BeautifulSoup(tweet,"lxml").get_text()


In [10]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [11]:
set(data_labels)

{0, 1}

In [12]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
     data_clean, target_vocab_size=2**16
   )
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

### Padding

In [13]:
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                           value=0,
                                                           padding="post",
                                                           maxlen=MAX_LEN)

### Spliting into training/testing set

In [14]:
test_idx = np.random.randint(0,800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000 ))

In [15]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)
                        


# Stage 3: Model building

In [40]:
class DCNN(tf.keras.Model):
    
    def __init__(self, vocab_size, emb_dim=128, nb_filters=50, FFN_units=512, nb_classes=2, dropout_rate=0.1, training=False, name="dcnn"): 
             
        super(DCNN, self).__init__(name=name)
        
        self.embedding = tf.keras.layers.Embedding(vocab_size,
                                         emb_dim)
        self.bigram = tf.keras.layers.Conv1D(filters=nb_filters,
                                   kernel_size=2,
                                   padding="valid",
                                   activation="relu")
        self.pool_1 = tf.keras.layers.GlobalMaxPool1D()
        self.trigram = tf.keras.layers.Conv1D(filters=nb_filters,
                                   kernel_size=3,
                                   padding="valid",
                                   activation="relu")
        self.pool_2 = tf.keras.layers.GlobalMaxPool1D()
        self.fourgram = tf.keras.layers.Conv1D(filters=nb_filters,
                                   kernel_size=4,
                                   padding="valid",
                                   activation="relu")
        self.pool_3 = tf.keras.layers.GlobalMaxPool1D()
        self.dense_1 = tf.keras.layers.Dense(units=FFN_units, activation="relu")
        self.dropout = tf.keras.layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = tf.keras.layers.Dense(units=1,
                                          activation="sigmoid")
        else:
            self.last_dense = tf.keras.layers.Dense(units=nb_classes,
                                          activation="softmax")
            
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_1(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output
        

            
            
        
        
        
        

# Stage 4: Application

## Config

In [41]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5


## Training

In [42]:

Dcnn = DCNN(vocab_size=VOCAB_SIZE,
           emb_dim=EMB_DIM,
           nb_filters=NB_FILTERS,
           FFN_units=FFN_UNITS,
           nb_classes=NB_CLASSES,
           dropout_rate=DROPOUT_RATE)
            

In [43]:
if NB_CLASSES ==2:

  Dcnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

else:

  Dcnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["sparse_categorical_accuracy"])

In [44]:
checkpoint_path = "./drive/My Drive/projects/CNN_for_NLP/ckpt"

ckpt = tf.train.Checkpoint(DCNN=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!")

In [46]:
Dcnn.fit(train_inputs,
        train_labels,
        batch_size=BATCH_SIZE,
        epochs=NB_EPOCHS)
ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'./drive/My Drive/projects/CNN_for_NLP/ckpt/ckpt-1'

## Evaluation

In [47]:
Dcnn(np.array([tokenizer.encode("You are so nice")]), training=False).numpy()

array([[0.96032214]], dtype=float32)