<a href="https://colab.research.google.com/github/obeabi/Sentiment-Labelled-Sentences-Data-Set/blob/main/Main_CNN_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Written by Abiola Obembe
# Sentiment Labelled Sentences Data Set
## Date: 2020-12-13
## Algorithm: Convnet

In [217]:
"""
Sentiment Labelled Sentences Data Set

Data Set Information:

This dataset was created for the Paper 'From Group to Individual Labels using Deep Features', Kotzias et. al,. KDD 2015
Please cite the paper if you want to use it :)

It contains sentences labelled with positive or negative sentiment.

=======
Format:
=======
sentence score


=======
Details:
=======
Score is either 1 (for positive) or 0 (for negative)
The sentences come from three different websites/fields:

imdb.com
amazon.com
yelp.com

For each website, there exist 500 positive and 500 negative sentences. Those were selected randomly for larger datasets of reviews.
We attempted to select sentences that have a clearly positive or negative connotaton, the goal was for no neutral sentences to be selected.


"""



## Step 1: Data Preprocessing

In [218]:
# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
print("libraries installed succesffully!")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
libraries installed succesffully!


In [219]:
# Install tensorflow
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds
print("Tensorflow version  :", tf.__version__)

Tensorflow version  : 2.3.0


In [220]:
# Import dataset
title = ['Review', 'Label']

df1 = pd.read_csv('amazon_cells_labelled.txt', delimiter = '\t',  names= title,  quoting= 3, engine = 'python', encoding = 'latin-1')

df2 = pd.read_csv('imdb_labelled.txt', quoting = 3,  delimiter = '\t',  sep = '.' , names = title, engine = 'python', encoding = 'latin-1')

df3 = pd.read_csv('yelp_labelled.txt', delimiter = '\t', sep = '.' ,  names = title, quoting = 3, engine = 'python', encoding = 'latin-1')

df = pd.concat([df1, df2, df3], axis = 0, sort=False, ignore_index= True)
#df = pd.concat([df_a, df3], axis = 0, sort=False, ignore_index= True)
df.reset_index(drop = True)

print(" The shape of the dataframe is: " , df.shape)

 The shape of the dataframe is:  (3000, 2)


In [221]:
# Let's preuse the dataframe
df.head()

Unnamed: 0,Review,Label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


## Step 2: Data/Text Cleaning

In [222]:
# Create a corpus 
corpus = []

# Function to clean
def clean_review(review):
    # Removing the @
    review = re.sub(r"@[A-Za-z0-9]+", ' ', review)
    # Removing the URL links
    review = re.sub(r"https?://[A-Za-z0-9./]+", ' ', review)
    # Keeping only letters
    review = re.sub(r"[^a-zA-Z.!?']", ' ', review)
    # Removing additional whitespaces
    review = re.sub(r" +", ' ', review)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    all_stopwords = stopwords.words('english')
    #review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = [lemmatizer.lemmatize(word) for word in review  if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)
    
    return review

In [223]:
# apply function on dataset
review_clean = [ clean_review(reviews)  for reviews in df.Review]

review_clean

['way plug u unless go converter.',
 'good case excellent value.',
 'great jawbone.',
 'tied charger conversation lasting minutes.major problems!!',
 'mic great.',
 'jiggle plug get line right get decent volume.',
 'several dozen several hundred contact imagine fun sending one one.',
 'razr owner...you must this!',
 'needle say wasted money.',
 'waste money time!.',
 'sound quality great.',
 'impressed going original battery extended battery.',
 'two seperated mere ft started notice excessive static garbled sound headset.',
 'good quality though',
 'design odd ear clip comfortable all.',
 'highly recommend one blue tooth phone.',
 'advise everyone fooled!',
 'far good!.',
 'work great!.',
 'click place way make wonder long mechanism would last.',
 "went motorola's website followed direction could get pair again.",
 'bought use kindle fire absolutely loved it!',
 'commercial misleading.',
 "yet run new battery two bar that's three day without charging.",
 'bought mother problem battery.

In [224]:
# print corpus
print(corpus)



In [225]:
# Evaluate the sentiment column
set(df.Label.values)


{0, 1}

In [226]:
data_labels = df.Label.values


In [227]:
# Tokenization
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(review_clean, target_vocab_size=2**10)

data_inputs = [tokenizer.encode(sentence) for sentence in review_clean]

In [228]:
# Padding
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs, value=0, padding="post", maxlen=MAX_LEN)

In [229]:
#Split data to training and test set
rows = (df.shape[0])
rows = int(np.multiply(rows,0.1))
rowz = int(rows*0.5)
test_idx = np.random.randint(0, rowz, 500)
test_idx = np.concatenate((test_idx, test_idx+rowz))
test_idx
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [230]:
train_inputs

array([[125, 240, 247, ...,   0,   0,   0],
       [752, 394, 644, ...,   0,   0,   0],
       [865, 799,  21, ...,   0,   0,   0],
       ...,
       [220, 214, 792, ...,   0,   0,   0],
       [267, 729, 792, ...,   0,   0,   0],
       [541, 133, 423, ...,   0,   0,   0]], dtype=int32)

## Step 4: Build the Model

In [231]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,emb_dim)

        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
        
        self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3, padding="valid", activation="relu")
        
        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")
        
        self.pool = layers.GlobalMaxPool1D() # no training variable so we can
                                             # use the same layer for each
                                             # pooling step
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

In [232]:
# Configuration details
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 128
NB_FILTERS = 8
FFN_UNITS = 8
NB_CLASSES = 2

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [233]:
# Let's train the model
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [234]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7efd096e25f8>

# Evaluate test set

In [235]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)

[0.37431076169013977, 0.8410000205039978]


In [236]:
tokenizer.encode("bad")

[535]

In [237]:
Dcnn(np.array([tokenizer.encode("bad teacher")]), training=False).numpy()

array([[0.04507154]], dtype=float32)