<a href="https://colab.research.google.com/github/omkarwazulkar/NaturalLanguageProcessing/blob/main/N_Bag_of_Words_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Loading Doc

def load_doc(filename):
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text

In [3]:
# Cleaning Doc

def clean_doc(doc):
  tokens = doc.split()
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  tokens = [re_punc.sub('', w) for w in tokens]
  tokens = [word for word in tokens if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  tokens = [w for w in tokens if not w in stop_words]
  tokens = [word for word in tokens if len(word) > 1]
  return tokens

In [4]:
# Load Doc and Add To Vocab

def add_doc_to_vocab(filename, vocab):
  doc = load_doc(filename)
  tokens = clean_doc(doc)
  vocab.update(tokens)

In [5]:
# Load All Docs in a Directory

def process_docs(directory, vocab):
  for filename in listdir(directory):
    if filename.startswith('cv9'):
      continue
    path = directory + '/' + filename
    add_doc_to_vocab(path, vocab)

In [6]:
# Save List to File
def save_list(lines, filename):
  data = '\n'.join(lines)
  file = open(filename, 'w')
  file.write(data)
  file.close()

In [7]:
filename = '/content/drive/MyDrive/ReviewPopularity/txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [8]:
# Define Vocab

from os import listdir
from collections import Counter

vocab = Counter()
process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/pos', vocab)
process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/neg', vocab)
print(len(vocab))
print(vocab.most_common(50))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [9]:
# Define Vocab with Minimum Occurence

from os import listdir
from collections import Counter

vocab = Counter()
process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/pos', vocab)
process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/neg', vocab)
print(len(vocab))

min_occurence = 2
tokens = [k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))

44276
25767


In [10]:
# Saving Vocab to File
save_list(tokens, 'vocab.txt')

# **Bag of Words Representation**

In [11]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

In [12]:
# Loading the Document

def load_doc(filename):
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text

In [13]:
# Turning a Doc into Clean Tokens

def clean_doc(doc):
  tokens = doc.split()
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  tokens = [re_punc.sub('', w) for w in tokens]
  tokens = [word for word in tokens if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  tokens = [w for w in tokens if not w in stop_words]
  tokens = [word for word in tokens if len(word) > 1]
  return tokens

In [14]:
# Loading Doc, Cleaning and Returning Lines of Tokens

def doc_to_line(filename, vocab):
  doc = load_doc(filename)
  tokens = clean_doc(doc)
  tokens = [w for w in tokens if w in vocab]
  return ' '.join(tokens)

In [15]:
# Load All Docs in a Directory

def process_docs(directory, vocab, is_train):
  lines = list()
  for filename in listdir(directory):
    if is_train and filename.startswith('cv9'):
      continue
    if not is_train and not filename.startswith('cv9'):
      continue
  path = directory + '/' + filename
  line = doc_to_line(path, vocab)
  lines.append(line)
  return lines

In [16]:
# Loading Clean a Dataset

def load_clean_dataset(vocab, is_train):
  neg = process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/neg', vocab, True)
  pos = process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/pos', vocab, True)
  docs = neg + pos
  labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
  return docs, labels

In [17]:
# Fitting a Tokenizer

def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [18]:
# Defining Model 

def define_model(n_words):
  # define network
  model = Sequential()
  model.add(Dense(50, input_shape=(n_words,), activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  # compile network
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  # summarize defined model
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [19]:
# Loading the Vocabulary

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

In [20]:
# Load All Reviews

train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

In [21]:
# Creating Tokenizer

tokenizer = create_tokenizer(train_docs)

In [22]:
# Encoding Data

Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtrain.shape, Xtest.shape)

(2, 516) (2, 516)


# **Creating and Fitting the Model and Applying the Model on New Reviews**

In [23]:
import string
import numpy as np
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense

In [24]:
# Loading Doc
def load_doc(filename):
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text

In [25]:
# Cleaning Doc
def clean_doc(doc):
  tokens = doc.split()
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  tokens = [re_punc.sub('', w) for w in tokens]
  tokens = [word for word in tokens if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  tokens = [w for w in tokens if not w in stop_words]
  tokens = [word for word in tokens if len(word) > 1]
  return tokens

In [26]:
# Coverting into Lines
def doc_to_line(filename, vocab):
  doc = load_doc(filename)
  tokens = clean_doc(doc)
  tokens = [w for w in tokens if w in vocab]
  return ' '.join(tokens)

In [27]:
# Processing Doc
def process_docs(directory, vocab):
  lines = list()
  for filename in listdir(directory):
    path = directory + '/' + filename
    line = doc_to_line(path, vocab)
    lines.append(line)
  return lines

In [28]:
# Loading Clean Dataset
def load_clean_dataset(vocab):
  neg = process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/neg', vocab)
  pos = process_docs('/content/drive/MyDrive/ReviewPopularity/txt_sentoken/pos', vocab)
  docs = neg + pos
  labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
  return docs, labels

In [29]:
# Creating Tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [30]:
# Defining Model
def define_model(n_words):
  model = Sequential()
  model.add(Dense(50, input_shape=(n_words,), activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [31]:
# Predicting Sentiment
def predict_sentiment(review, vocab, tokenizer, model):
  tokens = clean_doc(review)
  tokens = [w for w in tokens if w in vocab]
  line = ' '.join(tokens)
  encoded = tokenizer.texts_to_matrix([line], mode='binary')
  yhat = model.predict(encoded, verbose=0)
  percent_pos = yhat[0,0]
  if round(percent_pos) == 0:
    return (1-percent_pos), 'NEGATIVE'
  return percent_pos, 'POSITIVE'

In [32]:
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

# Load All Reviews
train_docs, ytrain = load_clean_dataset(vocab)
test_docs, ytest = load_clean_dataset(vocab)

# Convert from List to Array
yTrain = np.array(ytrain)

# Create the Tokenizer
tokenizer = create_tokenizer(train_docs)

# Encode Data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='binary')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='binary')

# Define Network
n_words = Xtrain.shape[1]
model = define_model(n_words)

# Fit Network
model.fit(Xtrain, yTrain, epochs=10, verbose=2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                1288450   
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
63/63 - 1s - loss: 0.4661 - accuracy: 0.7820 - 1s/epoch - 23ms/step
Epoch 2/10
63/63 - 1s - loss: 0.0620 - accuracy: 0.9920 - 790ms/epoch - 13ms/step
Epoch 3/10
63/63 - 1s - loss: 0.0149 - accuracy: 1.0000 - 820ms/epoch - 13ms/step
Epoch 4/10
63/63 - 1s - loss: 0.0073 - accuracy: 1.0000 - 794ms/epoch - 13ms/step
Epoch 5/10
63/63 - 1s - loss: 0.0043 - accuracy: 1.0000 - 802ms/epoch - 13ms/step
Epoch 6/10
63/63 - 1s - loss: 0.002

<keras.callbacks.History at 0x7f199d95d710>

In [33]:
# Test Positive Text
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

# Test Negative Text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Best movie ever! It was great, I recommend it.]
Sentiment: POSITIVE (56.095%)
Review: [This is a bad movie.]
Sentiment: NEGATIVE (64.865%)
