# Library Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from bs4 import BeautifulSoup
import spacy
import re
import string
import unicodedata

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics.pairwise import linear_kernel

# Loading the Dataset

In [None]:
# Download the IMDB review archive and extract it into the working directory.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1",
    url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
# The reviews are read from the 'aclImdb' folder that sits beside the returned path.
datasetDir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
# investigating the structure of dataset directory with ls
# (the list returned by os.listdir is shown as the cell output)
os.listdir(datasetDir)

['imdbEr.txt', 'train', 'test', 'imdb.vocab', 'README']

In [None]:
# training directory structure
# trainDir is kept in a variable because the next cell builds a file path from it
trainDir = os.path.join(datasetDir, 'train')
os.listdir(trainDir)

['pos',
 'neg',
 'labeledBow.feat',
 'urls_pos.txt',
 'urls_neg.txt',
 'unsupBow.feat',
 'unsup',
 'urls_unsup.txt']

In [None]:
# sampling out a review from training data
sampleFile = os.path.join(trainDir, 'neg/36_4.txt')
# Open with an explicit utf-8 encoding, matching how the bulk loader reads
# the review files; relying on the platform default can fail on non-ASCII text.
with open(sampleFile, 'r', encoding = 'utf-8') as f:
  print(f.read())

This film seemed way too long even at only 75 minutes. The problem with jungle horror films is that there is always way too much footage of people walking (through the jungle, up a rocky cliff, near a river or lake) to pad out the running time. The film is worth seeing for the laughable and naked native zombie with big bulging, bloody eyes which is always accompanied on the soundtrack with heavy breathing and lots of reverb. Eurotrash fans will be plenty entertained by the bad English dubbing, gratuitous female flesh and very silly makeup jobs on the monster and native extras. For a zombie/cannibal flick this was pretty light on the gore but then I probably didn't see an uncut version.


In [None]:
# creating a dataframe of reviews
# from both testing and training - negative and positive reviews
#
# Rows are collected in a plain list and the frame is built once at the end.
# DataFrame.append copied the whole frame on every call (quadratic over the
# 50,000 files) and was removed in pandas 2.0.
records = []
for i in ('test', 'train'):
  for j in ('pos', 'neg'):
    path = os.path.join(datasetDir, i, j)
    # the folder name determines the label of every file inside it
    sentiment = "Positive" if j == "pos" else "Negative"
    for fl in os.listdir(path):
      with open(os.path.join(path, fl), 'r', encoding = 'utf-8') as f:
        records.append({"Review": f.read(), "Sentiment": sentiment})
df = pd.DataFrame(records, columns = ["Review", "Sentiment"])

In [None]:
# summary of the dataset
# (count / unique / top / freq per column; unique < count means some reviews repeat)
df.describe()

Unnamed: 0,Review,Sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,Positive
freq,5,25000


In [None]:
# first rows of the raw reviews, before any cleaning
df.head()

Unnamed: 0,Review,Sentiment
0,The story of Sweeney Todd evokes memories of t...,Positive
1,Margaret Colin stars as the principal figure i...,Positive
2,The first bottom movie was an absolute laugh f...,Positive
3,This film is completely underrated.<br /><br /...,Positive
4,I stumbled upon this movie whilst flipping cha...,Positive


In [None]:
# pull the two columns out as separate Series and confirm they have equal length
reviews, sentiments = df["Review"], df["Sentiment"]
print(reviews.shape, sentiments.shape)

(50000,) (50000,)


In [None]:
# the reviews (pandas prints a truncated head/tail view of the Series)
print(reviews)

0        The story of Sweeney Todd evokes memories of t...
1        Margaret Colin stars as the principal figure i...
2        The first bottom movie was an absolute laugh f...
3        This film is completely underrated.<br /><br /...
4        I stumbled upon this movie whilst flipping cha...
                               ...                        
49995    This was a disappointing film. The people seem...
49996    I should put out an alert all over saying that...
49997    Shazbot, is this embarrassing. In fact, here's...
49998    This film wasn't programmed in Italian cinemas...
49999    What the *bliep* is it with this movie? Couldn...
Name: Review, Length: 50000, dtype: object


# Cleaning the Text

In [None]:
# fetch the NLTK resources: the 'punkt' tokenizer data and the 'stopwords'
# corpus (the latter supplies the English stopword list set up below)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# tokenization of text
tokenizer = ToktokTokenizer()

# setting English stopwords
# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the
# top of the notebook) to the English stopword list. The corpus reader is
# still reachable as nltk.corpus.stopwords, which is what this line uses.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# removal of html
def removeHTML(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# removal of square brackets
def removeSqBr(text):
    """Delete every `[...]` group (brackets and their contents) from `text`.

    The pattern is written as a raw string so that `\\[` and `\\]` reach the
    regex engine unchanged instead of being treated as (invalid) Python
    string escapes, which newer Python versions warn about.
    """
    return re.sub(r'\[[^]]*\]', '', text)

In [None]:
# removing html and square brackets
def removeHTMLSqBr(text):
    """Strip HTML markup from `text`, then drop any `[...]` groups from the result."""
    return removeSqBr(removeHTML(text))

In [None]:
# removing the html and square brackets of the reviews
# (the Review column is overwritten with the cleaned text)
df["Review"] = df['Review'].apply(removeHTMLSqBr)

In [None]:
# first rows after HTML / square-bracket removal
df.head()

Unnamed: 0,Review,Sentiment
0,The story of Sweeney Todd evokes memories of t...,Positive
1,Margaret Colin stars as the principal figure i...,Positive
2,The first bottom movie was an absolute laugh f...,Positive
3,This film is completely underrated.It's a film...,Positive
4,I stumbled upon this movie whilst flipping cha...,Positive


In [None]:
# removal of special characters
def removeSpecial(text, removeDigits = True):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Parameters
    ----------
    text : str
        The text to clean.
    removeDigits : bool
        Accepted for backward compatibility but currently ignored: digits are
        always kept, exactly as before.
        NOTE(review): decide whether this flag should be honoured; doing so
        would change the output of every existing default call.

    Returns
    -------
    str
        `text` with all other characters deleted.
    """
    # The range must be A-Z. The previous `A-z` also spanned the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were not removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# application of removal of special characters
# (the Review column is overwritten again with the further-cleaned text)
df["Review"] = df["Review"].apply(removeSpecial)

In [None]:
# first rows after special-character removal
df.head()

Unnamed: 0,Review,Sentiment
0,The story of Sweeney Todd evokes memories of t...,Positive
1,Margaret Colin stars as the principal figure i...,Positive
2,The first bottom movie was an absolute laugh f...,Positive
3,This film is completely underratedIts a film s...,Positive
4,I stumbled upon this movie whilst flipping cha...,Positive


# Stemming and Stop Words

In [None]:
# defining a porter stemmer
def porterStemmer(text):
    """Stem each whitespace-separated word of `text` with NLTK's Porter stemmer.

    The stemmed words are joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmedWords = map(stemmer.stem, text.split())
    return ' '.join(stemmedWords)

In [None]:
# removal of stopwords
def removeStopwords(text, isLowerCase = False):
    """Drop stopword tokens from `text` and return the rest joined by spaces.

    Parameters
    ----------
    text : str
        The text to filter; it is split with the module-level `tokenizer`.
    isLowerCase : bool
        If True, tokens are compared to the stopword list as they are (the
        caller states the text is already lower-cased). If False, each token
        is lower-cased for the comparison only; kept tokens retain their
        original casing.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # The module-level `stopwords` is a list; a set makes each membership
    # test constant-time instead of a scan over the whole list per token.
    stopwordSet = set(stopwords)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    # obtaining the list of tokens that can be added
    if isLowerCase:
        filteredTokens = [token for token in tokens if token not in stopwordSet]
    else:
        filteredTokens = [token for token in tokens if token.lower() not in stopwordSet]
    return ' '.join(filteredTokens)

In [None]:
# list of english stopwords
# (all entries are lower-case, which is why tokens are lower-cased before comparison)
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# application of removal of stopwords
# NOTE(review): apostrophes were already stripped by removeSpecial, so
# contractions such as "wasnt" no longer match list entries like "wasn't".
df['Review'] = df['Review'].apply(removeStopwords)

In [None]:
# first rows after stopword removal
df.head()

Unnamed: 0,Review,Sentiment
0,story Sweeney Todd evokes memories work classi...,Positive
1,Margaret Colin stars principal figure story wa...,Positive
2,first bottom movie absolute laugh beginning Ex...,Positive
3,film completely underratedIts film similar Kee...,Positive
4,stumbled upon movie whilst flipping channels t...,Positive


In [None]:
# application of Porter Stemmer
# (the Review column is overwritten with the stemmed text)
df['Review'] = df['Review'].apply(porterStemmer)

In [None]:
# first rows after stemming
df.head()

Unnamed: 0,Review,Sentiment
0,stori sweeney todd evok memori work classic wr...,Positive
1,margaret colin star princip figur stori watch ...,Positive
2,first bottom movi absolut laugh begin excel ma...,Positive
3,film complet underratedit film similar keenan ...,Positive
4,stumbl upon movi whilst flip channel teeve lat...,Positive


In [None]:
# the cleaned, stopword-filtered and stemmed reviews; this is the corpus
# fed to the vectorizers below
normalizedReviews = df["Review"]
normalizedReviews

0        stori sweeney todd evok memori work classic wr...
1        margaret colin star princip figur stori watch ...
2        first bottom movi absolut laugh begin excel ma...
3        film complet underratedit film similar keenan ...
4        stumbl upon movi whilst flip channel teeve lat...
                               ...                        
49995    disappoint film peopl seem substanc lead prota...
49996    put alert say movi shouldnt watch fail fit tri...
49997    shazbot embarrass fact here list 100 make emba...
49998    film wasnt program italian cinemasi seen manif...
49999    bliep movi couldnt fiend better script nice mo...
Name: Review, Length: 50000, dtype: object

# Bag of Words Model

In [None]:
# creation of bag of words for review
# feature selection with term frequency across corpus
#
# max_df must be the float 1.0 ("keep terms found in up to 100% of documents").
# The integer 1 means "keep only terms that occur in at most ONE document",
# which discards the whole shared vocabulary and leaves almost-empty rows.
# min_df is set to 1 (a term must occur in at least one document); the
# integer 0 is rejected by recent scikit-learn releases.
countVectorizer = CountVectorizer(min_df = 1, max_df = 1.0,
                                  binary = False, ngram_range = (1, 1),
                                  max_features = 10000)
bowReviews = countVectorizer.fit_transform(normalizedReviews)

In [None]:
# size of bow: (number of reviews, number of vocabulary features)
print(bowReviews.shape)

(50000, 10000)


In [None]:
# the features used - that is the vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(countVectorizer, "get_feature_names_out"):
    print(countVectorizer.get_feature_names_out())
else:
    print(countVectorizer.get_feature_names())





In [None]:
# printing the non-zero entries in the sparse BOW
# each line reads (review row, feature column) followed by the term count
print(bowReviews)

  (4, 2906)	1
  (5, 1410)	11
  (5, 3189)	1
  (16, 1223)	6
  (16, 1418)	4
  (16, 1940)	7
  (16, 5104)	1
  (23, 6106)	1
  (23, 5388)	1
  (27, 4176)	1
  (37, 8390)	1
  (43, 8547)	1
  (51, 2047)	1
  (54, 3176)	1
  (54, 8570)	1
  (66, 7922)	1
  (68, 409)	3
  (68, 3352)	1
  (68, 3472)	1
  (76, 7086)	1
  (81, 8616)	1
  (86, 67)	5
  (87, 1406)	2
  (94, 2590)	1
  (103, 2095)	1
  :	:
  (49872, 9816)	2
  (49888, 5015)	1
  (49893, 7477)	1
  (49896, 2815)	1
  (49897, 2695)	1
  (49899, 9356)	2
  (49913, 2429)	1
  (49913, 3909)	1
  (49916, 1300)	3
  (49916, 395)	2
  (49917, 979)	2
  (49926, 6158)	1
  (49926, 3562)	1
  (49927, 4475)	1
  (49933, 2663)	1
  (49935, 9055)	1
  (49938, 819)	2
  (49940, 1638)	2
  (49951, 5408)	1
  (49963, 9113)	1
  (49965, 5021)	1
  (49968, 7821)	1
  (49973, 5842)	1
  (49974, 985)	2
  (49999, 5149)	1


In [None]:
# dense view of the first few BOW rows
# Only a small slice is densified: converting the full 50000 x 10000 matrix
# would allocate roughly 4 GB at 8 bytes per entry just to display a
# truncated array.
bowReviews[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# TF-IDF Representation

In [None]:
# creation of the tf-idf model
# using a maximum of 10000 features by term frequency across the corpus
#
# max_df must be the float 1.0 ("keep terms found in up to 100% of documents").
# The integer 1 means "keep only terms that occur in at most ONE document",
# which makes documents share no features and renders similarity meaningless.
# min_df is set to 1 (a term must occur in at least one document); the
# integer 0 is rejected by recent scikit-learn releases.
tfidfVectorizer = TfidfVectorizer(min_df = 1, max_df = 1.0,
                                  use_idf = True, ngram_range = (1, 1),
                                  max_features = 10000)
tfidfReviews = tfidfVectorizer.fit_transform(normalizedReviews)

In [None]:
# size of tf-idf: (number of reviews, number of vocabulary features)
print(tfidfReviews.shape)

(50000, 10000)


In [None]:
# the features used - that is the vocabulary used
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidfVectorizer, "get_feature_names_out"):
    print(tfidfVectorizer.get_feature_names_out())
else:
    print(tfidfVectorizer.get_feature_names())





In [None]:
# printing the non-zero entries in the sparse matrix
# each line reads (review row, feature column) followed by the tf-idf weight
print(tfidfReviews)

  (4, 2906)	1.0
  (5, 3189)	0.09053574604251853
  (5, 1410)	0.9958932064677037
  (16, 5104)	0.09901475429766743
  (16, 1940)	0.693103280083672
  (16, 1418)	0.3960590171906697
  (16, 1223)	0.5940885257860047
  (23, 5388)	0.7071067811865475
  (23, 6106)	0.7071067811865475
  (27, 4176)	1.0
  (37, 8390)	1.0
  (43, 8547)	1.0
  (51, 2047)	1.0
  (54, 8570)	0.7071067811865475
  (54, 3176)	0.7071067811865475
  (66, 7922)	1.0
  (68, 3472)	0.3015113445777636
  (68, 3352)	0.3015113445777636
  (68, 409)	0.9045340337332909
  (76, 7086)	1.0
  (81, 8616)	1.0
  (86, 67)	1.0
  (87, 1406)	1.0
  (94, 2590)	1.0
  (103, 2095)	1.0
  :	:
  (49872, 9816)	1.0
  (49888, 5015)	1.0
  (49893, 7477)	1.0
  (49896, 2815)	1.0
  (49897, 2695)	1.0
  (49899, 9356)	1.0
  (49913, 3909)	0.7071067811865475
  (49913, 2429)	0.7071067811865475
  (49916, 395)	0.554700196225229
  (49916, 1300)	0.8320502943378436
  (49917, 979)	1.0
  (49926, 3562)	0.7071067811865475
  (49926, 6158)	0.7071067811865475
  (49927, 4475)	1.0
  (49933, 2

In [None]:
# dense view of the first few tf-idf rows
# Only a small slice is densified: converting the full 50000 x 10000 matrix
# would allocate roughly 4 GB at 8 bytes per entry just to display a
# truncated array.
tfidfReviews[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Similarity Ranking

In [None]:
# the document at row index 68 of the tf-idf matrix
# (sliced as 68:69 so the result stays a one-row sparse matrix)
docToCompare = tfidfReviews[68:69]
print(docToCompare)

  (0, 3472)	0.3015113445777636
  (0, 3352)	0.3015113445777636
  (0, 409)	0.9045340337332909


In [None]:
# calculate the cosine similarities
# linear_kernel is a plain dot product; it equals cosine similarity here
# provided the tf-idf rows are L2-normalised (TfidfVectorizer's default norm).
# flatten() turns the (1, n_documents) result into a 1-D array of scores.
cosineSimilarities = linear_kernel(docToCompare, tfidfReviews).flatten()

In [None]:
# top 5 similar docs (including the doc itself)
# argsort is ascending, so the slice walks it backwards from the end. The
# stop index is exclusive, so -6 is needed to get five entries (-1 .. -5);
# the previous stop of -5 returned only four.
relatedDocs = cosineSimilarities.argsort()[:-6:-1]
relatedDocs

array([   68, 49999, 16673, 16671])

Document 68 is the query document. Document 49999 is its closest match according to the cosine similarity.

In [None]:
# the processed text of the query document (68) and of document 49999,
# the entry that follows it in relatedDocs
print(df["Review"][68])
print(df["Review"][49999])

akin prizewin 2004 movi headongegen die wand depict appealingli chaotic world selfdestruct dynam turkishgerman rocker name cahit birol nel documentari offshoot headon explor rang music one might find istanbul today one energet curiou german avantrock musician alexand hack group einstuerzend neubauten arrang sound track perform music headon assist film crew turkish speaker provid director akin get everyth rap tradit turkish classic song rock kurdish music turkish pop chaotic openend world cahit one east east west west twainsomehowdo meetlik istanbul sit edg europ asia bring two world togeth remain sui generi mlang includ turkish pop turkish tradit song kurdish lament roma jazz musician group street busker siyasiyabend live offbeat shot istanbul street life talk camera synthesi person music histori singer musician work grand hotel de londr istanbul beyoglu quarter cahit stay end headon look belov hack roam around citi crew equip interview peopl record musich begin loud rock neopsychedel 