<a href="https://colab.research.google.com/github/nebilarega/review_to_ratings/blob/main/Sentiment_analysis_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import tarfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from six.moves import urllib

In [None]:
# Mount Google Drive so the downloaded and generated data files are kept on the Drive
from google.colab import drive
drive.mount('/content/drive')

# Get the data and move the rating and review text into two files

## Fetch and extract the data to the target path

In [None]:
def download_progress(count, block_size, total_size):
    """Report hook for ``urlretrieve`` that rewrites a one-line progress message.

    Args:
        count: number of blocks transferred so far.
        block_size: size of one block in bytes.
        total_size: total size of the download in bytes. When the server does
            not report a size this is not positive, and the number of bytes
            transferred is printed instead of a percentage.
    """
    transferred = count * block_size
    if total_size > 0:
        # The last block is usually only partly filled, so clamp at 100.
        percent = min(100, transferred * 100 // total_size)
        message = "\rDownloading: {}%".format(percent)
    else:
        message = "\rDownloading: {} bytes".format(transferred)
    sys.stdout.write(message)
    sys.stdout.flush()

def fetch_data(url, file_path, extracted_path):
  """Download the archive at `url` and extract it.

  Args:
    url: location of the tar archive to download.
    file_path: directory in which the archive is saved as
      'mul_domain_dataset.tar.gz'; created if it does not exist.
    extracted_path: directory the archive members are extracted into.
  """
  # urlretrieve does not create missing directories for the target file.
  os.makedirs(file_path, exist_ok=True)
  file_name = file_path+'/mul_domain_dataset.tar.gz'
  urllib.request.urlretrieve(url, filename=file_name, reporthook=download_progress)
  # The context manager closes the archive even when extraction fails.
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only use this with archives from a trusted source.
  with tarfile.open(file_name) as tar:
    tar.extractall(extracted_path)

In [None]:
# Directory on the mounted Drive where the archive is saved and extracted
file_path = '/content/drive/MyDrive/multi_domain_dataset'
# Archive to download
url = 'http://www.cs.jhu.edu/~mdredze/datasets/sentiment/unprocessed.tar.gz'
# fetch the data and extract it (archive and extracted files share one directory)
fetch_data(url, file_path, file_path)

# Get the necessary data from the list of directories

## get the directory list

In [None]:
# list the directory where the extracted directories are
main_path = '/content/drive/MyDrive/multi_domain_dataset/sorted_data/'

# Keep only the sub-directories. A new list is built instead of popping from
# the list while indexing it: popping shifts the remaining entries (so some are
# skipped) and the old range(len - 1) loop never examined the last entry.
dir_list = [
    entry for entry in os.listdir(main_path)
    if os.path.isdir(os.path.join(main_path, entry))
]

## Separate the necessary features from the given data

### The data holds many fields delimited by opening and closing tags. I extract the content found between `<review_text>` `</review_text>` and between `<rating>` `</rating>`.

In [None]:
file1 = '/content/drive/MyDrive/multi_domain_dataset/review_text'
file2 = '/content/drive/MyDrive/multi_domain_dataset/ratings'

def separate_text_ratings(path, str1, str2):
  """Extract the text found between two tags from every domain's review file.

  For each directory name in the global `dir_list`, the file
  `main_path/<directory>/all.review` is scanned. Whenever a line contains
  `str1`, the following lines up to (not including) the first line containing
  `str2` are joined into one output line, each preceded by a single space.

  Args:
    path: output file; extracted entries are appended, one per line.
    str1: opening tag marking the start of an entry.
    str2: closing tag marking the end of an entry.
  """
  # NOTE(review): the output is opened in append mode, so running this twice
  # for the same `path` duplicates every entry.
  with open(path, 'a+', encoding='ISO-8859-1') as f_re_w:
    for rev_path in dir_list:
      all_review = os.path.join(main_path, rev_path, 'all.review')
      with open(all_review, 'r', encoding='ISO-8859-1') as f_r:
        # Both loops pull from the same file iterator, so the outer loop
        # resumes right after the closing tag consumed by the inner loop.
        for line in f_r:
          if str1 not in line:
            continue
          pieces = []
          # Iterating (rather than calling readline() until the closing tag
          # shows up) ends at EOF, so a missing closing tag cannot hang.
          for next_line in f_r:
            if str2 in next_line:
              break
            pieces.append(' ' + next_line.rstrip('\n'))
          f_re_w.write(''.join(pieces) + '\n')
      print('finished file', rev_path)

In [None]:
# Extract the content of every <review_text> element and append it to file1, one review per line
separate_text_ratings(file1, '<review_text>', '</review_text>')

In [None]:
# Extract the content of every <rating> element and append it to file2, one rating per line
separate_text_ratings(file2,'<rating>', '</rating>')

In [None]:
# Number of lines in the review file; the ratings file is expected to have
# the same number of lines, so counting one of them is enough
with open(file1, 'r', encoding='ISO-8859-1') as f:
  dataset_size = sum(1 for _ in f)

## Check the level of data imbalance and balance it.

### Count the occurrences of the five ratings and record the lines on which they occur

In [None]:
# Collect, for every rating value, the 0-based line numbers on which it occurs.
rating_lines = {1.0: [], 2.0: [], 3.0: [], 4.0: [], 5.0: []}
with open(file2, 'r', encoding='ISO-8859-1') as f:
  # zip() reads at most dataset_size lines and simply stops if the ratings
  # file turns out to be shorter.
  for i, line in zip(range(dataset_size), f):
    # Parse each line once; float() ignores the surrounding spaces/newline.
    rating = float(line)
    if rating in rating_lines:
      rating_lines[rating].append(i)

# occurrence lines
one_line, two_line, three_line, four_line, five_line = (
    rating_lines[value] for value in (1.0, 2.0, 3.0, 4.0, 5.0))
# counts per rating
ones, twos, threes, fours, fives = (
    len(lines) for lines in (one_line, two_line, three_line, four_line, five_line))

In [None]:
# Report how many reviews carry each rating
for label, count in (
    ('num of ones', ones),
    ('num of twos', twos),
    ('num of threes', threes),
    ('num of fours', fours),
    ('num of fives', fives),
):
  print(label, count)

num of ones 103953
num of twos 80278
num of threes 0
num of fours 320681
num of fives 917618


### As we can see from the above, the classes are highly imbalanced; there is not even a single 3 rating. Thus we must balance the data.

In [None]:
# using the lines we will randomly select 80000 reviews from each class and save it to have a balanced file
file_b1 = '/content/drive/MyDrive/multi_domain_dataset/balanced_review_text'
file_b2 = '/content/drive/MyDrive/multi_domain_dataset/balanced_ratings'

def balanced_classes(one_line=one_line, two_line=two_line, four_line=four_line, five_line=five_line, samples_per_class=80000):
  """Write a class-balanced copy of the review and rating files.

  A random subset of line numbers is drawn from each class. Then `file1` and
  `file2` are read in parallel and every selected line is copied to `file_b1`
  (reviews) and `file_b2` (ratings), keeping the original file order.

  Args:
    one_line, two_line, four_line, five_line: line numbers of the reviews
      rated 1, 2, 4 and 5.
    samples_per_class: maximum number of lines drawn from each class; a class
      with fewer lines contributes all of them.
  """
  selected = set()
  for class_lines in (one_line, two_line, four_line, five_line):
    class_lines = np.asarray(class_lines)
    shuffled = class_lines[np.random.permutation(len(class_lines))]
    selected.update(shuffled[:samples_per_class].tolist())

  # `selected` is a set so each membership test below is constant time; testing
  # against a NumPy array would scan the whole array for every line read.
  with open(file1, 'r', encoding='ISO-8859-1') as f1, \
       open(file2, 'r', encoding='ISO-8859-1') as f2, \
       open(file_b1, 'w', encoding='ISO-8859-1') as fb1, \
       open(file_b2, 'w', encoding='ISO-8859-1') as fb2:
    for index, (review, rating) in enumerate(zip(f1, f2)):
      if index in selected:
        fb1.write(review)
        fb2.write(rating)

In [None]:
# Balance the data: there is no 3 rating, so that class is left out
# every remaining class will have 80,000 examples

balanced_classes()

# Preprocess the data

In [None]:
# Count the lines of the balanced review file to get the new dataset size
with open(file_b1, encoding='ISO-8859-1') as f:
  new_dataset_size = sum(1 for _ in f)

In [None]:
# Display the number of lines in the balanced review file
new_dataset_size

320000

In [None]:
# Load every balanced review line into memory so single lines can be inspected
with open(file_b1, encoding='ISO-8859-1') as f:
  da = list(f)

In [None]:
# Sanity check: show the second-to-last balanced review
da[new_dataset_size-2]

In [None]:
# We will create a TensorFlow dataset using TextLineDataset
# TextLineDataset is useful when creating a dataset from text files
datasetX = tf.data.TextLineDataset([file_b1])
datasety = tf.data.TextLineDataset([file_b2])

# Combine the review and the rating into (review, rating) pairs
dataset = tf.data.Dataset.zip((datasetX, datasety))

In [None]:
def preprocess(X_batch, y_batch):
  """Turn a batch of raw (review, rating) strings into (tokens, labels).

  Reviews:
    1. keep only the first 300 positions of every review
    2. replace <br> / <br/> tags with a space
    3. replace everything except letters and apostrophes with a space
    4. split on whitespace and pad all reviews of the batch to the same
       length with b"<pad>"

  Ratings:
    5. parse the rating strings read from file into numbers
    6. map them to four consecutive class indices: ratings above 2 are shifted
       down by 2, the others by 1 (so 1, 2, 4, 5 become 0, 1, 2, 3)
  """
  truncated = tf.strings.substr(X_batch, 0, 300)
  without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
  letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
  tokens = tf.strings.split(letters_only)

  ratings = tf.strings.to_number(y_batch)
  labels = tf.where(ratings > 2, ratings - 2, ratings - 1)

  return tokens.to_tensor(default_value=b"<pad>"), labels

### Generate vocabulary

In [None]:
# Count how often every token occurs in the preprocessed dataset
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in dataset.batch(32).map(preprocess):
  # Flatten the padded batch and count all of its tokens in one call
  # (the b"<pad>" filler tokens are counted as well)
  vocabulary.update(X_batch.numpy().ravel().tolist())

#### Truncate the vocabulary to a smaller size and create out of vocabulary set

In [None]:
# Keep only the vocab_size most frequent tokens, most frequent first
vocab_size = 20000
truncated_vocabulary = [entry[0] for entry in vocabulary.most_common(vocab_size)]

In [None]:
# Number of buckets shared by all words that are not in the vocabulary
num_oov = 2000
words = tf.constant(truncated_vocabulary)
# Derive the ids from the actual word list: the corpus may contain fewer than
# vocab_size distinct tokens, and keys and values must have the same length.
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov)

In [None]:
# Sanity check: look up the ids of two words
table.lookup(tf.constant([b'funny world'.split()]))

<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[365, 193]])>

### Make the dataset ( train, valid, and test)

In [None]:
# Make train, valid and test datasets
train_size = int(new_dataset_size*.9)
test_size = int(new_dataset_size * .05)

# shuffle() reshuffles on every iteration by default, so take()/skip() would
# pick different records on every pass and the three splits would overlap.
# Fixing the order keeps them disjoint; the seed makes the split repeatable.
# NOTE(review): a 10000-element buffer only shuffles locally, so the splits
# still roughly follow the order of the balanced file.
shuffled_dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False)
# Reshuffle inside the training split only, so each epoch sees a new order.
train_set = shuffled_dataset.take(train_size).shuffle(10000)
train_valid_set = shuffled_dataset.skip(train_size)

test_set = train_valid_set.take(test_size)
valid_set = train_valid_set.skip(test_size)

#### Encode the dataset with our lookup table

In [None]:
def encode_words(X_batch, y_batch):
  """Replace every token of the batch by its id from the lookup table."""
  token_ids = table.lookup(X_batch)
  return token_ids, y_batch

# Batch the raw pairs, clean them, encode the tokens and prefetch one batch
train_set = (
    train_set
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
# Show one encoded batch; next() fetches only that batch instead of
# materializing the whole training set in a list first
next(iter(train_set.as_numpy_iterator()))

(array([[  15, 7678, 7285, ...,    0,    0,    0],
        [ 472,  682,   16, ...,    0,    0,    0],
        [   5,   44,  145, ...,    0,    0,    0],
        ...,
        [  21,  826, 1665, ...,    0,    0,    0],
        [   5,  145,    8, ...,    0,    0,    0],
        [ 215,    5,  982, ...,    0,    0,    0]]),
 array([2., 2., 0., 3., 1., 0., 0., 3., 3., 0., 2., 3., 2., 0., 0., 0., 0.,
        2., 1., 2., 1., 1., 2., 0., 2., 3., 0., 2., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 3., 3., 1., 0., 1., 0., 3., 0.,
        3., 0., 2., 0., 3., 1., 2., 3., 0., 3., 3., 0., 1.], dtype=float32))

# Create, train, and save a model

In [None]:
# Size of the learned word embedding vectors
embed_size = 128
model = tf.keras.Sequential([
  # One embedding row per vocabulary id plus the out-of-vocabulary buckets.
  # mask_zero treats id 0 as padding -- presumably b"<pad>" is the most
  # frequent token and therefore got id 0; verify against the vocabulary.
  tf.keras.layers.Embedding(vocab_size + num_oov, embed_size, input_shape=[None], mask_zero=True),
  tf.keras.layers.GRU(128, return_sequences=True),
  tf.keras.layers.GRU(128),
  # The four classes are mutually exclusive and the loss expects a probability
  # distribution over them, hence softmax rather than sigmoid.
  tf.keras.layers.Dense(4, activation='softmax')
])
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=[tf.keras.metrics.sparse_categorical_accuracy])
# The checkpoint callback saves the model to Drive while training
history = model.fit(
    train_set,
    epochs=10,
    callbacks=[tf.keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/multi_domain_dataset/model_1.h5')])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Encode a single word so the trained model can be tried on it
test = table.lookup(tf.constant(b'disgusted'.split()))

In [None]:
# Scores the model assigns to each of the four classes for the encoded word
model.predict(test)

array([[0.93728346, 0.913895  , 0.15867427, 0.06335643]], dtype=float32)

# Using nnlm-en-50 sentence embedding module

In [None]:
import tensorflow_hub as hub
new_model = tf.keras.Sequential([
  # Pre-trained sentence embedding: takes raw strings, returns 50-d vectors
  hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2",
                      dtype=tf.string, input_shape=[], output_shape=[50]),
  tf.keras.layers.Dense(128, activation='relu'),
  # The four classes are mutually exclusive and the loss expects a probability
  # distribution over them, hence softmax rather than sigmoid.
  tf.keras.layers.Dense(4, activation='softmax')
])
new_model.compile(optimizer='adam', loss=tf.keras.losses.sparse_categorical_crossentropy,
                  metrics=[tf.keras.metrics.sparse_categorical_accuracy])


In [None]:
def new_preprocess(X_batch, y_batch):
  """Prepare raw (review, rating) string batches for the sentence-embedding models.

  Reviews are only truncated to their first 300 positions; no tokenization is
  done here. Ratings are parsed to numbers and mapped to four consecutive
  class indices: ratings above 2 are shifted down by 2, the others by 1.
  """
  ratings = tf.strings.to_number(y_batch)
  labels = tf.where(ratings > 2, ratings - 2, ratings - 1)
  return tf.strings.substr(X_batch, 0, 300), labels


In [None]:
# Training portion of the raw text, in batches of 64 with class-index labels
new_dataset = shuffled_dataset.take(train_size).batch(64).map(new_preprocess)

In [None]:
# Train the nnlm-based classifier on the raw-text batches
# NOTE(review): 200 epochs with no validation data or early stopping -- confirm this is intended
new_model.fit(new_dataset, epochs=200)

# Using Universal Sentence Encoder

In [None]:
from absl import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# Load the Universal Sentence Encoder module from TensorFlow Hub
# NOTE(review): this rebinds `model`, which earlier in the notebook refers to
# the trained Keras GRU model; re-running the earlier predict cell after this
# one will call the hub module instead.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  # Pass the input straight to the loaded module and return its result
  return model(input)


#### From tensorflow hub tutorials

In [None]:
# Three inputs of increasing length: a word, a sentence and a short paragraph
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

message_embeddings = embed(messages)

# Print every message with the length of its embedding and its first three values
for message, vector in zip(messages, np.array(message_embeddings).tolist()):
  print("Message: {}".format(message))
  print("Embedding size: {}".format(len(vector)))
  preview = ", ".join(map(str, vector[:3]))
  print("Embedding: [{}, ...]\n".format(preview))






Message: Elephant
Embedding size: 512
Embedding: [0.008344486355781555, 0.00048085825983434916, 0.06595248728990555, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.050808604806661606, -0.01652429811656475, 0.01573782227933407, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [-0.02833269163966179, -0.0558621808886528, -0.012941432185471058, ...]



## Now let's use this embedder (encoder) in our model

In [None]:
# Classifier on top of the Universal Sentence Encoder, built layer by layer
third_model = tf.keras.Sequential()
# Pre-trained encoder: takes raw strings, returns 512-d sentence embeddings
third_model.add(
    hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                   dtype=tf.string, input_shape=[], output_shape=[512]))
third_model.add(tf.keras.layers.Dense(128, activation='relu'))
# One softmax output per rating class
third_model.add(tf.keras.layers.Dense(4, activation='softmax'))
third_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=tf.keras.metrics.sparse_categorical_accuracy)

In [None]:
# Train the Universal Sentence Encoder classifier on the raw-text batches
third_model.fit(new_dataset, epochs=10)