In [None]:
# Mount Google Drive inside the Colab runtime; the review files read by the
# later cells are addressed through paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# load doc into memory
def load_doc(filename):
  """Read a text file and return its whole contents as a single string.

  Args:
    filename: Path of the file to read.

  Returns:
    The complete text of the file.
  """
  # the context manager closes the handle even when read() raises, which the
  # previous open()/close() pair did not guarantee
  with open(filename, 'r' ) as file:
    return file.read()

In [None]:
# read one positive review to use as a sample input for the cleaning step
text = load_doc('drive/MyDrive/review_polarity/txt_sentoken/pos/cv026_29325.txt')

In [None]:
# turn a doc into clean tokens
def clean_doc(doc):
  """Split a document on whitespace and keep only the informative words.

  Every token has its punctuation characters stripped first. A stripped
  token is kept only when it is purely alphabetic, is not an English stop
  word and is longer than one character. The stop-word comparison is
  case-sensitive and happens after punctuation removal.

  Args:
    doc: Raw document text.

  Returns:
    List of the surviving tokens in their original order.
  """
  punctuation_pattern = re.compile( '[%s]' % re.escape(string.punctuation))
  english_stop_words = set(stopwords.words( 'english' ))
  cleaned = []
  for raw in doc.split():
    word = punctuation_pattern.sub( '' , raw)
    # drop anything containing digits or other non-letters (also covers '')
    if not word.isalpha():
      continue
    if word in english_stop_words:
      continue
    # single letters carry no signal
    if len(word) <= 1:
      continue
    cleaned.append(word)
  return cleaned

In [None]:
# reload the sample review and display its cleaned tokens; the result is the
# full token list, so slicing it (e.g. [:20]) would keep the cell output short
text = load_doc('drive/MyDrive/review_polarity/txt_sentoken/pos/cv026_29325.txt')
clean_doc(text)

['us',
 'werent',
 'yet',
 'born',
 'rock',
 'rolled',
 'around',
 'monterey',
 'pop',
 'affords',
 'affectionate',
 'glimpse',
 'music',
 'influenced',
 'parents',
 'hippies',
 'otis',
 'redding',
 'jimi',
 'hendrix',
 'janis',
 'joplin',
 'mamas',
 'papas',
 'jefferson',
 'airplane',
 'documentary',
 'jampacked',
 'contagious',
 'energy',
 'give',
 'fair',
 'reveal',
 'ending',
 'rest',
 'film',
 'justice',
 'deserves',
 'shot',
 'outdoor',
 'concert',
 'precluded',
 'woodstock',
 'film',
 'defies',
 'stereotype',
 'general',
 'population',
 'time',
 'sure',
 'painted',
 'faces',
 'smoke',
 'joints',
 'pennebaker',
 'war',
 'room',
 'moon',
 'broadway',
 'surprisingly',
 'chooses',
 'show',
 'broad',
 'spectrum',
 'audience',
 'matter',
 'watching',
 'comes',
 'back',
 'talented',
 'musicians',
 'stir',
 'soul',
 'excitement',
 'starts',
 'music',
 'even',
 'begins',
 'young',
 'girl',
 'cleaning',
 'thousands',
 'seats',
 'asked',
 'interviewer',
 'replies',
 'feels',
 'lucky',
 'mo

In [None]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
  """Turn one review file into a single space-separated string of known words.

  The file is read with load_doc, tokenised with clean_doc, and every token
  that is not a member of `vocab` is discarded.

  Args:
    filename: Path of the review file.
    vocab: Container of allowed words (anything supporting `in`).

  Returns:
    The remaining tokens joined by single spaces ('' when none remain).
  """
  cleaned = clean_doc(load_doc(filename))
  return ' ' .join(word for word in cleaned if word in vocab)

In [None]:
# read the vocabulary file; the with-block closes the handle (the bare open()
# used before never did) and a set makes each per-token membership test in
# doc_to_line a constant-time lookup instead of a scan over a list
with open('vocab.txt') as f:
  vocab = set(f.read().split())
doc_to_line('drive/MyDrive/review_polarity/txt_sentoken/pos/cv026_29325.txt', vocab)



In [None]:
# load all docs in a directory
def process_docs(directory, vocab):
  """Convert every file in a directory into a vocabulary-filtered line.

  Args:
    directory: Folder whose entries are all treated as review files.
    vocab: Container of allowed words, passed through to doc_to_line.

  Returns:
    List with one cleaned, space-joined string per file, ordered by file
    name.
  """
  lines = []
  # listdir() returns names in arbitrary order; sorting makes the document
  # order identical on every run and every filesystem
  for filename in sorted(listdir(directory)):
    # create the full path of the file to open
    path = directory + '/' + filename
    # load, clean and collect the doc
    lines.append(doc_to_line(path, vocab))
  return lines

In [None]:
# load and clean a dataset
def load_clean_dataset(vocab):
  """Load the negative and positive review folders and label every review.

  Args:
    vocab: Container of allowed words, forwarded to process_docs.

  Returns:
    A (docs, labels) tuple: all negative documents followed by all positive
    ones, and a parallel list holding 0 for negative and 1 for positive.
  """
  negative_docs = process_docs('drive/MyDrive/review_polarity/txt_sentoken/neg', vocab)
  positive_docs = process_docs('drive/MyDrive/review_polarity/txt_sentoken/pos', vocab)
  # labels follow the same neg-then-pos order as the concatenated documents
  labels = [0] * len(negative_docs) + [1] * len(positive_docs)
  return negative_docs + positive_docs, labels

In [None]:
# sanity run over the whole corpus: d receives the cleaned review strings and
# l the matching 0/1 labels returned by load_clean_dataset
d, l = load_clean_dataset(vocab)

In [None]:
# number of labels, i.e. how many reviews were loaded
len(l)

2000

In [None]:
# tally the reviews per label to check the class balance
from collections import Counter
Counter(l)

Counter({0: 1000, 1: 1000})

In [None]:
# fit a tokenizer
def create_tokenizer(lines):
  """Create a Tokenizer with default settings and fit it on the given texts.

  Args:
    lines: The texts passed to Tokenizer.fit_on_texts.

  Returns:
    The fitted Tokenizer instance.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

In [None]:
# define the model
def define_model(n_words):
  """Build, compile, summarise and plot the sentiment classifier network.

  The network is a Sequential stack of a 50-unit ReLU Dense layer that takes
  `n_words` inputs, followed by a single sigmoid Dense output unit. It is
  compiled with binary cross-entropy loss, the Adam optimizer and an accuracy
  metric. As side effects the summary is printed and a diagram is written to
  'model.png'.

  Args:
    n_words: Width of each input vector.

  Returns:
    The compiled model.
  """
  hidden = Dense(50, input_shape=(n_words,), activation= 'relu' )
  output = Dense(1, activation= 'sigmoid' )
  network = Sequential()
  for layer in (hidden, output):
    network.add(layer)
  # compile network
  network.compile(loss= 'binary_crossentropy' , optimizer= 'adam' ,
                  metrics=[ 'accuracy' ])
  # summarize defined model
  network.summary()
  plot_model(network, to_file= 'model.png' , show_shapes=True)
  return network

In [None]:
# load the vocabulary
vocab_filename = 'vocab.txt'
# whitespace-separated words become a set so token filtering uses fast lookups
vocab = set(load_doc(vocab_filename).split())

In [None]:
# load all reviews once, then hold out a slice of every class for testing.
# Calling load_clean_dataset twice returned the same documents for both
# splits, so the "test" set was identical to the training set (and the
# corpus was read from disk twice).
TEST_FRACTION = 0.1  # share of each class reserved for the test split
all_docs, all_labels = load_clean_dataset(vocab)
train_docs, ytrain, test_docs, ytest = [], [], [], []
for label in sorted(set(all_labels)):
  class_docs = [doc for doc, y in zip(all_docs, all_labels) if y == label]
  # split per class so both splits keep the corpus' class balance
  n_train = len(class_docs) - int(len(class_docs) * TEST_FRACTION)
  train_docs += class_docs[:n_train]
  ytrain += [label] * n_train
  test_docs += class_docs[n_train:]
  ytest += [label] * (len(class_docs) - n_train)

In [None]:
# create the tokenizer
# fitted on the training documents only; the same tokenizer object is reused
# below to encode the test documents and new reviews
tokenizer = create_tokenizer(train_docs)

In [None]:
# encode data
# both splits go through the same fitted tokenizer so their columns line up;
# mode='binary' is expected to mark word presence rather than counts
# (see the Keras Tokenizer.texts_to_matrix documentation)
Xtrain = tokenizer.texts_to_matrix(train_docs, mode= 'binary' )
Xtest = tokenizer.texts_to_matrix(test_docs, mode= 'binary' )

In [None]:
# inspect the encoded training matrix; its second dimension is the input
# width handed to the network
Xtrain.shape

In [None]:
# define network
# the input width is the number of columns of the encoded training matrix
n_words = Xtrain.shape[1]
model = define_model(n_words)

In [None]:
# fit network
# ytrain is a plain Python list, so it is converted to a numpy array for Keras
# NOTE(review): no random seed is set in the visible cells, so the trained
# weights presumably differ from run to run
model.fit(Xtrain, np.array(ytrain), epochs=10, batch_size=10)

In [None]:
# classify a review as negative or positive
def predict_sentiment(review, vocab=None, tokenizer=None, model=None):
  """Predict whether a raw review text is negative or positive.

  The review is tokenised with clean_doc, reduced to words contained in
  `vocab`, encoded as one binary row by `tokenizer` and scored by `model`.

  The callers in this notebook pass vocab, tokenizer and model explicitly,
  which the former one-parameter signature rejected with a TypeError. The
  three extra parameters are optional so single-argument calls keep working.

  Args:
    review: Raw review text.
    vocab: Container of allowed words. When omitted, the notebook-level
      `vocab` is used.
    tokenizer: Fitted tokenizer exposing texts_to_matrix. When omitted, the
      notebook-level `tokenizer` is used.
    model: Trained model exposing predict. When omitted, the notebook-level
      `model` is used.

  Returns:
    A (score, label) tuple. label is 'POSITIVE' with the model output as
    score, or 'NEGATIVE' with one minus the model output as score.

  Raises:
    KeyError: If an omitted argument has no notebook-level counterpart.
  """
  # parameters shadow the notebook-level names, so look the defaults up
  # through the module namespace
  scope = globals()
  if vocab is None:
    vocab = scope['vocab']
  if tokenizer is None:
    tokenizer = scope['tokenizer']
  if model is None:
    model = scope['model']
  # clean
  tokens = clean_doc(review)
  # filter by vocab
  tokens = [w for w in tokens if w in vocab]
  # convert to line
  line = ' ' .join(tokens)
  # encode
  encoded = tokenizer.texts_to_matrix([line], mode= 'binary' )
  # predict sentiment
  yhat = model.predict(encoded, verbose=0)
  # retrieve predicted percentage and label
  percent_pos = yhat[0,0]
  if round(percent_pos) == 0:
    return (1-percent_pos), 'NEGATIVE'
  return percent_pos, 'POSITIVE'

In [None]:
# test positive text
text = ' Best movie ever! It was great, I recommend it. '
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
# percent is the score returned for the predicted label; scale it to a
# percentage for display
print( ' Review: [%s]\nSentiment: %s (%.3f%%) ' % (text, sentiment, percent*100))

In [None]:
# test negative text
text = ' This is a bad movie. '
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
# percent is the score returned for the predicted label; scale it to a
# percentage for display
print( ' Review: [%s]\nSentiment: %s (%.3f%%) ' % (text, sentiment, percent*100))